In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Download the three source tables (customers, products, transactions) as
# DataFrames. The reads happen in the order the URLs are listed below.
customers, products, transactions = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE",  # customers
        "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0",  # products
        "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF",  # transactions
    )
)

In [3]:

# Parse the date columns in place so every later use of these frames sees
# datetime values rather than strings.
customers["SignupDate"] = pd.to_datetime(customers["SignupDate"])
transactions["TransactionDate"] = pd.to_datetime(transactions["TransactionDate"])

# One row per transaction, enriched with the buyer's profile and the product's
# attributes. Left joins keep every transaction even when a lookup misses.
# "Price" exists in both transactions and products, so the second merge yields
# Price_x (from transactions) and Price_y (from products).
merged_data = (
    pd.merge(transactions, customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
    .assign(
        Year=lambda frame: frame["TransactionDate"].dt.year,
        Month=lambda frame: frame["TransactionDate"].dt.month,
        SignupYear=lambda frame: frame["SignupDate"].dt.year,
        # TotalValue divided by Quantity, i.e. the per-unit value of each transaction.
        AverageOrderValue=lambda frame: frame["TotalValue"] / frame["Quantity"],
    )
)

print("\nMerged Data Preview:")
print(merged_data.head())


Merged Data Preview:
  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe 2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia 2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe 2024-04-04   
3      601.36   300.68  Travis Campbell  South America 2024-04-11   
4      902.04   300.68    Timothy Perez         Europe 2022-03-15   

                       ProductName     Category  Price_y  Year  Month  \
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  2024      8

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [5]:
# Customer features: one row per customer holding total spend, total units,
# the mean of the per-transaction AverageOrderValue column, and the most
# frequently purchased category.
def _most_frequent(values):
    """Return the first value reported by ``Series.mode()``, or None if it is empty."""
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else None


customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "AverageOrderValue": "mean",
    "Category": _most_frequent,
}).reset_index()

# One-hot encode the favourite category so it can be scaled with the numeric columns.
# NOTE(review): drop_first=True encodes the first category as all zeros, so it is
# not treated symmetrically with the other categories in the similarity below.
# Confirm this is intended before relying on cross-category comparisons.
customer_features = pd.get_dummies(customer_features, columns=["Category"], drop_first=True)

# Standardise every column except CustomerID (column 0) so that no single
# feature dominates the similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])

# Calculate similarity using cosine similarity (row i vs. row j of scaled_features)
similarity_matrix = cosine_similarity(scaled_features)

# Find the top 3 similar customers for the first 20 customers in CustomerID order
lookalike_results = {}
customer_ids = customer_features["CustomerID"].tolist()

for idx, customer_id in enumerate(customer_ids[:20]):
    # Exclude the customer by row index, not by assuming it sorts first: ties,
    # floating-point rounding, or an all-zero feature row can push the self-match
    # out of position 0, which would list the customer as their own lookalike.
    # Scores become plain Python floats so the text written to the CSV does not
    # depend on how the installed NumPy version prints its scalars.
    candidates = [
        (other_idx, float(score))
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable sort: equally similar customers keep their CustomerID order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[customer_id] = [
        (customer_ids[other_idx], score) for other_idx, score in candidates[:3]
    ]

# Persist one row per target customer; the lookalike list is stored as its
# string representation: "[('C0xxx', score), ...]".
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_results.keys()),
    "Lookalikes": [str(v) for v in lookalike_results.values()]
})
lookalike_df.to_csv("Prakhar_Jaiswal_Lookalike.csv", index=False)

print("\nLookalike Results for First 20 Customers:")
print(lookalike_df.head(20))



Lookalike Results for First 20 Customers:
   CustomerID                                         Lookalikes
0       C0001  [('C0069', 0.9793686796401265), ('C0154', 0.97...
1       C0002  [('C0029', 0.9998668269037813), ('C0088', 0.99...
2       C0003  [('C0038', 0.997287341359637), ('C0160', 0.971...
3       C0004  [('C0075', 0.9899044859613366), ('C0165', 0.98...
4       C0005  [('C0192', 0.9974346831629687), ('C0140', 0.99...
5       C0006  [('C0187', 0.9517660052079885), ('C0117', 0.94...
6       C0007  [('C0146', 0.9973951578946069), ('C0115', 0.97...
7       C0008  [('C0113', 0.9896572363853194), ('C0136', 0.98...
8       C0009  [('C0150', 0.9974912743459965), ('C0061', 0.96...
9       C0010  [('C0176', 0.9788396644427603), ('C0144', 0.97...
10      C0011  [('C0139', 0.9912275180493367), ('C0064', 0.98...
11      C0012  [('C0182', 0.9916958094005028), ('C0163', 0.98...
12      C0013  [('C0099', 0.9845345921152222), ('C0145', 0.97...
13      C0014  [('C0097', 0.9978978523383679), 