In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Read the three input tables from the current working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Build one wide table with a row per transaction. Both joins are left joins
# from the transaction side, so a transaction is kept even when its customer
# or product lookup has no match.
df = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
    # pandas appends _x/_y to column names that exist on both sides of a merge.
    # Presumably Transactions and Products both carry a Price column -- TODO
    # confirm. The rename is a no-op if those suffixed columns do not exist.
    .rename(columns={"Price_x": "Price", "Price_y": "ProductPrice"})
)

In [8]:
def _most_frequent(values):
    """Return the first modal value of ``values``, or 'Unknown' if there is none."""
    modes = values.mode()
    return 'Unknown' if modes.empty else modes[0]


# Collapse the transaction-level table to one row per customer:
# summed TotalValue and Quantity, mean Price, the first Region value seen
# for the customer, and the customer's most frequent Category.
cust_profiles = (
    df.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Price': 'mean',
        'Region': 'first',
        'Category': _most_frequent,
    })
    .reset_index()
)

# One-hot encode the two categorical columns so they can be scaled and
# compared alongside the numeric features.
cust_profiles = pd.get_dummies(cust_profiles, columns=['Region', 'Category'])

In [9]:
# Everything except the identifier column is a model feature.
features = cust_profiles.drop(columns=['CustomerID'])

# Standardise each feature column so that no single column dominates the
# similarity computation purely because of its scale.
scaler = StandardScaler()
features_scaled = scaler.fit(features).transform(features)

In [10]:
# Pairwise cosine similarity between all customers: with Y=None the rows of
# features_scaled are compared against themselves, so entry [i, j] is the
# similarity between customer row i and customer row j.
similarity_matrix = cosine_similarity(features_scaled, Y=None)


In [11]:
# Map each target customer (IDs C0001..C0020) to its three most similar
# *other* customers as a list of (CustomerID, similarity) tuples.
lookalikes = {}

# Positional list of IDs: row i of similarity_matrix corresponds to the i-th
# row of cust_profiles, so look IDs up by position rather than by index label.
customer_ids = cust_profiles['CustomerID'].tolist()

for i, cust_id in enumerate(customer_ids):
    # Only the first 20 customers are scored; the prefix check runs first so
    # int() is only attempted on IDs of the expected "C00.." form.
    if cust_id.startswith("C00") and int(cust_id[1:]) <= 20:
        # Exclude the customer itself explicitly. Relying on "self is always
        # ranked first" breaks when another customer has an identical profile
        # (or floating-point rounding pushes a neighbour to >= self-similarity):
        # the customer would then appear in their own lookalike list.
        sims = [
            (j, score)
            for j, score in enumerate(similarity_matrix[i])
            if j != i
        ]
        # sorted() is stable, so ties keep their original row order.
        sims = sorted(sims, key=lambda x: x[1], reverse=True)
        # Convert to a built-in float before rounding: NumPy scalars would be
        # rendered as "np.float64(...)" (NumPy >= 2) when the list is later
        # stringified for the CSV export.
        top3 = [(customer_ids[j], round(float(score), 4)) for j, score in sims[:3]]
        lookalikes[cust_id] = top3


In [12]:
# Flatten the lookalike map into one record per target customer.
lookalike_records = []
for customer_id, matches in lookalikes.items():
    lookalike_records.append({"CustomerID": customer_id, "Lookalikes": matches})
lookalike_df = pd.DataFrame(lookalike_records)

# Persist the result. The Lookalikes column holds Python lists, which are
# written to the CSV in their string form; the row index is not written.
lookalike_df.to_csv("Subhasmita_Khuntia_Lookalike.csv", index=False)