In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Directory that holds the three input CSVs. Kept in one place so the notebook
# can be pointed at another location by editing a single line instead of three.
DATA_DIR = Path(r"C:\Users\ADMIN\Downloads")

# Raw inputs: one table each for customers, products and transactions.
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [4]:
# Convert the date columns to pandas datetimes, in place on each frame.
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [5]:
# Enrich every transaction with its customer's and its product's columns.
# Both joins are left joins, so transactions whose key has no match are kept.
# NOTE: this rebinds `transactions`, so re-running the cell merges again.
transactions = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

In [6]:
# Summarise purchasing behaviour per customer:
#   total_spent        - sum of TotalValue over the customer's transactions
#   avg_spent          - mean TotalValue per transaction
#   total_transactions - number of TransactionID entries
#   unique_categories  - number of distinct Category values bought
per_customer = transactions.groupby('CustomerID')
customer_features = per_customer.agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    avg_spent=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    unique_categories=pd.NamedAgg(column='Category', aggfunc='nunique'),
).reset_index()  # turn the CustomerID index back into a regular column

In [7]:
# Left-join so that customers without any transaction still get a profile row.
customer_profiles = customers.merge(customer_features, on="CustomerID", how="left")

# Zero-fill only the aggregated behaviour columns: a customer with no
# transactions genuinely has 0 spend / 0 transactions. A blanket fillna(0)
# would also write the integer 0 into unrelated columns of `customers`
# (names, region, signup date), e.g. producing a bogus "Region_0" category
# once the region is one-hot encoded.
behaviour_cols = [col for col in customer_features.columns if col != "CustomerID"]
customer_profiles = customer_profiles.fillna({col: 0 for col in behaviour_cols})

In [8]:
# One-hot encode Region into Region_<value> indicator columns.
# Every level is kept (drop_first=False): dropping the first level is a
# regression convention to avoid collinearity, but in a similarity model it
# leaves the dropped region without its own dimension, so customers from that
# region end up systematically closer to all other regions than those regions
# are to each other.
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=False)

In [9]:
# Feature set for the similarity model: the four behaviour aggregates plus
# every one-hot region indicator (any column whose name starts with "Region_").
region_cols = [name for name in customer_profiles.columns if name.startswith("Region_")]
feature_cols = ['total_spent', 'avg_spent', 'total_transactions', 'unique_categories'] + region_cols

# Standardise the selected columns; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
feature_matrix = customer_profiles[feature_cols]
scaled_features = scaler.fit(feature_matrix).transform(feature_matrix)

In [10]:
# Pairwise cosine similarity between all rows of the scaled feature matrix.
# With Y=None the rows are compared against themselves, so entry [i, j] is the
# similarity between row i and row j (rows follow the order of customer_profiles).
similarity_matrix = cosine_similarity(X=scaled_features, Y=None)

In [12]:
# How many customers (taken from the top of customer_profiles) get a lookalike
# list, and how many lookalikes are kept for each of them.
# NOTE(review): the original comment labels these as C0001 - C0020; that holds
# only if the customers file is ordered by CustomerID - confirm.
N_TARGET_CUSTOMERS = 20
TOP_N = 3

# Maps each target CustomerID to a list of (lookalike CustomerID, score) pairs,
# ordered from most to least similar.
lookalikes = {}
customer_ids = customer_profiles['CustomerID'].values

for idx, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    row = np.asarray(similarity_matrix[idx], dtype=float)

    # Rank all customers by descending similarity. The stable sort keeps ties
    # in ascending row order, matching Python's stable sorted(..., reverse=True).
    ranked = np.argsort(-row, kind="stable")

    # Drop the customer itself by index rather than assuming it is ranked
    # first: a customer with an identical feature vector (or floating-point
    # noise) can sort ahead of it, which would otherwise put the customer in
    # their own lookalike list and discard a genuine match.
    neighbours = ranked[ranked != idx][:TOP_N]

    # float(...) yields native Python floats, so the scores are stringified as
    # plain numbers regardless of the installed NumPy version.
    lookalikes[cust_id] = [(customer_ids[i], round(float(row[i]), 4)) for i in neighbours]

In [13]:
# One record per target customer; the list of (id, score) pairs is stored as
# its string representation in the "Lookalikes" column.
records = [
    {'CustomerID': cust, 'Lookalikes': str(matches)}
    for cust, matches in lookalikes.items()
]
lookalike_df = pd.DataFrame(records)

# Written to the current working directory, without the DataFrame index.
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been generated!")

Lookalike.csv has been generated!
