In [12]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Load the three raw tables from the working directory. The date columns are
# converted to datetime64 right after reading, as part of the same chain.
customers = (
    pd.read_csv("./Customers.csv")
    .assign(SignupDate=lambda df: pd.to_datetime(df['SignupDate']))
)
products = pd.read_csv("./Products.csv")
transactions = (
    pd.read_csv("./Transactions.csv")
    .assign(TransactionDate=lambda df: pd.to_datetime(df['TransactionDate']))
)

# Inner-join transactions with customer attributes, then with product
# attributes. Column names present on both sides of a join (other than the
# key) receive pandas' default `_x` / `_y` suffixes.
transactions_with_customers = transactions.merge(customers, on="CustomerID")
merged_data = transactions_with_customers.merge(products, on="ProductID")
merged_data.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [22]:
# Lookalike model: build one feature row per customer, compare customers by
# cosine similarity, and export the nearest neighbours of the first customers.
N_TARGET_CUSTOMERS = 20  # customers (in CustomerID order) that get recommendations
TOP_K = 3                # lookalikes reported per customer

# groupby sorts by key by default, so rows come out in CustomerID order and
# positional index i below always refers to the same customer.
customer_profiles = merged_data.groupby('CustomerID').agg({
    'Region': 'first',    # first Region value seen for the customer
    'TotalValue': 'sum',  # total spend across all transactions
    'Quantity': 'sum',    # total units purchased
    'Price_x': 'mean'     # mean of the `_x`-suffixed Price column
}).reset_index()

numeric_cols = ['TotalValue', 'Quantity', 'Price_x']

encoder = OneHotEncoder()
encoded_regions = encoder.fit_transform(customer_profiles[['Region']]).toarray()

# Standardise the numeric columns before mixing them with the 0/1 region
# indicators. Without this, TotalValue (orders of magnitude larger than the
# other features) dominates every dot product and the cosine similarity
# effectively ignores region, quantity and price.
scaled_numeric = StandardScaler().fit_transform(customer_profiles[numeric_cols])
encoded_features = np.hstack((encoded_regions, scaled_numeric))

similarity_matrix = cosine_similarity(encoded_features)

lookalike_data = {}
# Clamp the loop so a dataset with fewer customers does not raise IndexError.
for i in range(min(N_TARGET_CUSTOMERS, len(customer_profiles))):
    customer_id = customer_profiles['CustomerID'].iloc[i]
    similarities = similarity_matrix[i]
    # Rank everyone by descending similarity, then drop the customer itself
    # explicitly: with ties or floating-point noise the self-match is not
    # guaranteed to sort to position 0, so slicing [1:] is not reliable.
    ranked = np.argsort(-similarities, kind="stable")
    top_matches = ranked[ranked != i][:TOP_K]
    lookalike_data[customer_id] = [
        # Cast to a built-in float so the CSV shows `0.98`, not a NumPy scalar repr.
        (customer_profiles['CustomerID'].iloc[j], round(float(similarities[j]), 2))
        for j in top_matches
    ]

# One row per target customer; "Lookalikes" holds the list of (id, score) tuples.
lookalike_df = pd.DataFrame([{
    "CustomerID": cust_id,
    "Lookalikes": lookalikes
} for cust_id, lookalikes in lookalike_data.items()])

lookalike_df.to_csv("Peddireddy_Srijith_Lookalike.csv", index=False)
