In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three raw tables from CSV files in the working directory,
# in the order customers, products, transactions.
customer_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Preprocessing and Feature Engineering
# Build one row per transaction carrying both customer and product attributes:
# first attach customer columns via CustomerID, then product columns via ProductID.
# Both joins use the default (inner) join of pd.merge.
transactions_with_customers = pd.merge(transactions_df, customer_df, on="CustomerID")
merged_df = pd.merge(transactions_with_customers, products_df, on="ProductID")

In [4]:
# Customer-level feature engineering: collapse the merged transactions to one
# row per CustomerID using named aggregation (output column = (source column, reducer)).
customer_features = (
    merged_df
    .groupby("CustomerID")
    .agg(
        TotalRevenue=("TotalValue", "sum"),               # total value across all transactions
        NumTransactions=("TransactionID", "count"),       # number of transactions
        UniqueProducts=("ProductID", "nunique"),          # number of unique products purchased
        Category=("Category", lambda x: x.mode()[0]),     # most common category purchased
        LastTransactionDate=("TransactionDate", "max"),   # recency of last transaction
    )
    .reset_index()
)

In [5]:
# Attach demographic attributes (region, signup date) to the per-customer features
demographics = customer_df[["CustomerID", "Region", "SignupDate"]]
customer_features = pd.merge(customer_features, demographics, on="CustomerID")

# Standardize the numeric features so they are on a comparable scale
numeric_cols = ["TotalRevenue", "NumTransactions", "UniqueProducts"]
scaler = StandardScaler()
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

# One-hot encode the categorical features (Region, Category).
# NOTE(review): drop_first=True leaves the first level of each column without
# an indicator column — confirm this is intended for a similarity feature matrix.
customer_features = pd.get_dummies(
    customer_features,
    columns=["Region", "Category"],
    drop_first=True,
)

customer_features.head()

Unnamed: 0,CustomerID,TotalRevenue,NumTransactions,UniqueProducts,LastTransactionDate,SignupDate,Region_Europe,Region_North America,Region_South America,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.011458,0.050047,2024-11-02 17:04:16,2022-07-10,False,False,True,False,True,False
1,C0002,-0.877744,-0.467494,-0.424204,2024-12-03 01:41:41,2022-02-13,False,False,False,True,False,False
2,C0003,-0.405857,-0.467494,-0.424204,2024-08-24 18:54:04,2024-03-07,False,False,True,False,False,True
3,C0004,1.032547,1.35665,1.472798,2024-12-23 14:13:52,2022-10-09,False,False,True,False,False,False
4,C0005,-0.783929,-0.92353,-0.898455,2024-11-04 00:30:22,2022-08-15,False,False,False,False,True,False


In [6]:
# Similarity Calculation
# Compute pairwise cosine similarity between customers on the engineered features
# (Euclidean distance is also one method to find similarities).
# The identifier and the raw date columns are excluded from the feature matrix.
customer_matrix = customer_features.drop(columns=["CustomerID", "LastTransactionDate", "SignupDate"])
similarity_matrix = cosine_similarity(customer_matrix)

# Label rows and columns with CustomerID so scores can be looked up by ID.
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features["CustomerID"], columns=customer_features["CustomerID"])

# Recommend the top 3 lookalikes for each of the first 20 customers.
lookalikes = {}
for cust_id in customer_features["CustomerID"][:20]:
    # Remove the customer's own entry by label instead of assuming it sorts first:
    # another customer with an identical feature vector (or a floating-point tie)
    # could otherwise push the customer into their own recommendations.
    similar_customers = (
        similarity_df[cust_id]
        .drop(cust_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Series.items() yields (CustomerID, score) pairs with plain Python floats,
    # so the stringified list written to CSV does not depend on NumPy's scalar repr.
    lookalikes[cust_id] = list(similar_customers.items())

# Assemble the lookalike table: one row per customer with a list of
# (lookalike CustomerID, similarity score) pairs. No file is written in this cell.
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalikes.keys()),
    "Recommendations": list(lookalikes.values()),
})
lookalike_df.head()

# Each row shows how close the recommended customers are to the given CustomerID,
# e.g. C0190 has a cosine similarity of about 0.99 with C0001.

Unnamed: 0,CustomerID,Recommendations
0,C0001,"[(C0190, 0.9900070822932618), (C0048, 0.981854..."
1,C0002,"[(C0088, 0.8902838554961774), (C0083, 0.875029..."
2,C0003,"[(C0031, 0.9758179433935927), (C0052, 0.966751..."
3,C0004,"[(C0155, 0.9801789455240874), (C0087, 0.924261..."
4,C0005,"[(C0186, 0.9974386620285471), (C0007, 0.987428..."


In [7]:
# Write the lookalike table to CSV without the DataFrame index.
# DataFrame.to_csv returns None when given a file path, so the result is not
# bound to a variable.
lookalike_df.to_csv("Sakshi_Sharma_Lookalike.csv", index=False)