In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the three raw tables from the local `dataset/` directory, in the
# order customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/Customers.csv",
        "dataset/Products.csv",
        "dataset/Transactions.csv",
    )
)


In [3]:
# Attach product attributes to every transaction, then customer attributes.
# Both joins are left joins, so every transaction row is kept even when the
# product or customer lookup finds no match (those columns become NaN).
# Transactions and Products both carry a `Price` column, which is why the
# merged frame ends up with `Price_x` (transaction side) and `Price_y`
# (product side).
# NOTE: `transactions` is rebound to the enriched frame, so re-running this
# cell without re-running the load cell merges the product columns twice.
transactions = pd.merge(transactions, products, how="left", on="ProductID")
data = pd.merge(transactions, customers, how="left", on="CustomerID")
data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [5]:
def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if none.

    `Series.mode()` ignores NaN by default and returns an empty Series when
    every value is missing; indexing that with `[0]` would raise. Ties are
    resolved by taking the first entry of the (sorted) mode result.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per customer with their aggregate purchase behaviour:
#   TotalValue    -> total spend across all transactions
#   TransactionID -> number of transactions
#   Price_x       -> mean transaction-side unit price
#   Category      -> most frequently purchased product category
customer_features = data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "TransactionID": "count",
    "Price_x": "mean",
    "Category": _most_frequent,
}).reset_index()

In [6]:
# Left join from the customer table so every customer gets a profile row;
# customers that do not appear in `customer_features` keep NaN in the
# aggregated columns.
customer_profiles = pd.merge(
    customers,
    customer_features,
    how="left",
    on="CustomerID",
)

In [None]:
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns a dense array on any sklearn version.

    scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and
    1.4 removed the old name, so the new spelling is tried first and the
    legacy one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


# Use one encoder per column so each keeps its own fitted categories
# (refitting a single encoder would discard the Region fit).
region_encoder = _dense_one_hot_encoder()
category_encoder = _dense_one_hot_encoder()

encoded_region = region_encoder.fit_transform(customer_profiles[["Region"]])

# Customers without any transaction (if any) have NaN Category after the
# left merge; give them an explicit label instead of relying on the
# encoder's NaN handling, which differs between sklearn versions.
encoded_category = category_encoder.fit_transform(
    customer_profiles[["Category"]].fillna("Unknown")
)

# Preserve the original name: as before, `encoder` ends up being the encoder
# fitted on Category.
encoder = category_encoder

# Final feature matrix: numeric aggregates (missing -> 0) followed by the
# one-hot encoded Region and Category columns.
features = np.hstack([
    customer_profiles[["TotalValue", "TransactionID", "Price_x"]].fillna(0).to_numpy(),
    encoded_region,
    encoded_category,
])

In [8]:
# Standardise every feature column (zero mean, unit variance) so that no
# single column dominates the similarity computation.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Pairwise cosine similarity between all customer rows; entry [i, j] is the
# similarity between row i and row j of `customer_profiles`.
similarity_matrix = cosine_similarity(scaled_features)


In [9]:
# Build {customer_id: [(lookalike_id, score), ...]} for the first
# N_TARGET_CUSTOMERS rows of `customer_profiles`, keeping the TOP_K most
# similar other customers for each.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

profile_ids = customer_profiles["CustomerID"].to_numpy()

lookalike_map = {}
# Clamp to the number of available customers so a smaller dataset does not
# raise an IndexError.
for idx in range(min(N_TARGET_CUSTOMERS, len(profile_ids))):
    customer_id = profile_ids[idx]
    similarity_scores = similarity_matrix[idx]

    # Indices ordered from most to least similar.
    ranked = np.argsort(similarity_scores)[::-1]
    # Remove the customer's own row explicitly: with ties or floating-point
    # noise the self-match is not guaranteed to sort first, so skipping
    # position 0 could recommend a customer to themselves.
    similar_customers = ranked[ranked != idx][:TOP_K]

    # Plain Python floats keep the exported CSV free of numpy scalar reprs.
    lookalike_map[customer_id] = [
        (profile_ids[i], float(similarity_scores[i])) for i in similar_customers
    ]

In [10]:
# Two-column export: the target customer and their recommendation list.
# `to_csv` writes each recommendation list as that list's text
# representation inside a single CSV field.
lookalike_df = pd.DataFrame(
    {
        "CustomerID": list(lookalike_map.keys()),
        "Recommendations": list(lookalike_map.values()),
    }
)
lookalike_df.to_csv("Lookalike_4.csv", index=False)