In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [2]:
# Load the three input tables from the working directory in one pass;
# the unpacking order matches the order of the file names.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Attach the product columns to every transaction row. A left join keeps
# transactions whose ProductID has no match in `products`.
transactions_merged = transactions.merge(products, how='left', on='ProductID')

In [4]:
# One row per CustomerID summarising that customer's transactions:
#   total_transactions - count of non-null TransactionID values
#   total_spent        - sum of TotalValue
#   avg_quantity       - mean of Quantity
#   unique_categories  - number of distinct Category values
transaction_features = (
    transactions_merged
    .groupby('CustomerID')
    .agg(
        total_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        avg_quantity=pd.NamedAgg(column='Quantity', aggfunc='mean'),
        unique_categories=pd.NamedAgg(column='Category', aggfunc='nunique'),
    )
    .reset_index()  # turn the CustomerID group key back into a column
)

In [5]:
# Join the per-customer aggregates onto the full customer list. The left join
# keeps customers that have no transactions; their aggregate columns come back
# as NaN and are replaced with 0 here.
customers_all = (
    customers
    .merge(transaction_features, on='CustomerID', how='left')
    .fillna({
        'total_transactions': 0,
        'total_spent': 0,
        'avg_quantity': 0,
        'unique_categories': 0,
    })
)

In [6]:
# Tenure is measured in whole days from each customer's signup date up to the
# most recent transaction date found in `transactions`.
latest_date = transactions['TransactionDate'].pipe(pd.to_datetime).max()
customers_all['SignupDate'] = customers_all['SignupDate'].pipe(pd.to_datetime)
# rsub(x) evaluates x - series, i.e. latest_date - SignupDate.
customers_all['tenure_days'] = customers_all['SignupDate'].rsub(latest_date).dt.days

In [8]:
# Build the feature matrix used for the similarity search: Region is one-hot
# encoded and the numeric behaviour columns are standardised.
categorical_columns = ['Region']
numeric_columns = [
    'tenure_days',
    'total_transactions',
    'total_spent',
    'avg_quantity',
    'unique_categories',
]

preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(), categorical_columns),
    ('scaler', StandardScaler(), numeric_columns),
])

# Fit on, and transform, the full customer table; row i of `features`
# corresponds to row i of `customers_all`.
features = preprocessor.fit_transform(customers_all)

In [9]:
# Pairwise cosine similarity between all rows of `features`; entry [i, j]
# compares row i with row j.
similarity_matrix = cosine_similarity(X=features)


In [10]:
# Customers to produce lookalikes for: IDs 'C0001' through 'C0020'.
target_customers = [f'C{number:04d}' for number in range(1, 21)]

# Filled in later: target CustomerID -> list of its lookalike results.
lookalike_map = dict()

In [11]:
# Build the top-3 lookalike list for every target customer.
#
# Ranking, selection and the write into `lookalike_map` all have to happen
# inside the per-customer loop, so the whole loop lives in one cell. When the
# ranking and the assignment sit after the loop, they only ever see the values
# left over from the final iteration, and only one customer gets a result.
#
# Result shape: lookalike_map[cust_id] is a list of up to three
# (similar_customer_id, similarity rounded to 4 decimals) tuples, best first,
# or an empty list when cust_id is not present in `customers_all`.

customer_ids = customers_all['CustomerID'].tolist()

# Row position of each customer in `customers_all`. `similarity_matrix` was
# computed from the same rows, so this is also the matrix row/column position.
# setdefault keeps the first occurrence if an ID appears more than once.
row_of = {}
for pos, cid in enumerate(customer_ids):
    row_of.setdefault(cid, pos)

for cust_id in target_customers:
    cust_index = row_of.get(cust_id)
    if cust_index is None:
        # Unknown customer: record an empty result instead of failing.
        lookalike_map[cust_id] = []
        continue

    # Exclude the customer's own entry by position. Dropping "the first item
    # after sorting" is unsafe because another customer with an equal score
    # can sort ahead of the self-match.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[cust_index])
        if idx != cust_index
    ]
    sorted_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_3 = sorted_scores[:3]

    # Every one of the top matches is recorded, not just the last one.
    recommendations = [
        (customer_ids[idx], round(score, 4)) for idx, score in top_3
    ]
    lookalike_map[cust_id] = recommendations

In [25]:
# Write one line per target customer to Lookalike.csv: the customer ID followed
# by its "id:score" pairs.
# NOTE(review): the pairs are joined with commas and written unquoted, so data
# rows can have more comma-separated fields than the two-column header.
# Confirm that whatever reads this file expects that layout.
with open('Lookalike.csv', 'w') as f:
    lines = ["CustomerID,SimilarCustomers\n"]
    for cust_id, similar in lookalike_map.items():
        similar_str = ','.join(f"{cid}:{score}" for cid, score in similar)
        lines.append(f"{cust_id},{similar_str}\n")
    f.writelines(lines)

CustomerID,SimilarCustomers
C0001,C0012:0.921,C0004:0.879,C0018:0.856
C0002,C0007:0.934,C0015:0.902,C0020:0.891
...