In [216]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## Load Dataset

In [217]:
# Read the three raw tables in one pass; the tuple order matches the names on
# the left-hand side (customers, products, transactions).
customers, products, transactions = map(
    pd.read_csv,
    ('dataset/Customers.csv', 'dataset/Products.csv', 'dataset/Transactions.csv'),
)

## Data Preprocessing

In [218]:
# Attach product attributes to every transaction.
# Both tables carry a `Price` column and the transaction-level one is the one
# that is kept, so the product copy is dropped before merging instead of
# cleaning up `Price_x` / `Price_y` afterwards.
# The merged frame gets its own name: `transactions` stays the raw table, which
# makes this cell safe to re-run (re-merging an already merged frame would
# duplicate the product columns).
transactions_with_products = pd.merge(
    transactions,
    products.drop(columns='Price'),
    on='ProductID',
    validate='many_to_one',  # fail loudly if a ProductID is duplicated in products
)

# Attach the customer attributes to each transaction row.
customer_transactions = pd.merge(
    transactions_with_products,
    customers,
    on='CustomerID',
    validate='many_to_one',  # fail loudly if a CustomerID is duplicated in customers
)

customer_transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,Timothy Perez,Europe,2022-03-15


In [219]:
# Total spend per customer: sum of TotalValue over all of their transactions.
total_spending = (
    customer_transactions
    .groupby('CustomerID')
    .agg(TotalSpending=('TotalValue', 'sum'))
    .reset_index()
)

In [220]:
# Mean TotalValue per transaction for each customer.
avg_transaction_value = (
    customer_transactions
    .groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgTransactionValue')
    .reset_index()
)

In [221]:
# Favourite product category per customer = the category with the most
# transactions. Step 1: count transactions per (customer, category) pair.
category_counts = (
    customer_transactions
    .groupby(['CustomerID', 'Category'])
    .size()
    .reset_index(name='Count')
)

# Step 2: for each customer, find the row label holding the largest count.
# idxmax returns the first maximum, so ties go to the category that appears
# first in the grouped (sorted) order.
most_bought_rows = category_counts.groupby('CustomerID')['Count'].idxmax()

# Step 3: keep only the winning rows and expose the category under its feature name.
favorite_category = (
    category_counts
    .loc[most_bought_rows, ['CustomerID', 'Category']]
    .rename(columns={'Category': 'FavoriteCategory'})
)

In [222]:
# Assemble one row per customer: spending features, favourite category and region.
# All joins are inner joins on CustomerID (the pandas default).
customer_features = (
    total_spending
    .merge(avg_transaction_value, on='CustomerID')
    .merge(favorite_category, on='CustomerID')
    .merge(customers[['CustomerID', 'Region']], on='CustomerID')
)

In [223]:
# Preview the merged per-customer feature table (first five rows).
customer_features.head(n=5)

Unnamed: 0,CustomerID,TotalSpending,AvgTransactionValue,FavoriteCategory,Region
0,C0001,3354.52,670.904,Electronics,South America
1,C0002,1862.74,465.685,Clothing,Asia
2,C0003,2725.38,681.345,Home Decor,South America
3,C0004,5354.88,669.36,Books,South America
4,C0005,2034.24,678.08,Electronics,Asia


In [224]:
# One-hot encode the categorical features; every other column is passed
# through unchanged.
categorical_columns = ['FavoriteCategory', 'Region']
customer_features = pd.get_dummies(data=customer_features, columns=categorical_columns)

In [225]:
# Preview the feature table after one-hot encoding (first five rows).
customer_features.head(n=5)

Unnamed: 0,CustomerID,TotalSpending,AvgTransactionValue,FavoriteCategory_Books,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,670.904,False,False,True,False,False,False,False,True
1,C0002,1862.74,465.685,False,True,False,False,True,False,False,False
2,C0003,2725.38,681.345,False,False,False,True,False,False,False,True
3,C0004,5354.88,669.36,True,False,False,False,False,False,False,True
4,C0005,2034.24,678.08,False,False,True,False,True,False,False,False


In [226]:
# Standardise every feature column. The first column (CustomerID) is an
# identifier, not a feature, so it is skipped positionally.
feature_values = customer_features.iloc[:, 1:]

# Fit the scaler on the features, then apply it to the same data.
scaler = StandardScaler().fit(feature_values)
customer_features_scaled = scaler.transform(feature_values)

customer_features_scaled

array([[-0.06170143, -0.07026341, -0.71244351, ..., -0.57928445,
        -0.54831888,  1.54041597],
       [-0.87774353, -0.93493297, -0.71244351, ..., -0.57928445,
        -0.54831888, -0.6491753 ],
       [-0.40585722, -0.02627131, -0.71244351, ..., -0.57928445,
        -0.54831888,  1.54041597],
       ...,
       [-1.38697529, -0.93396389, -0.71244351, ...,  1.72626765,
        -0.54831888, -0.6491753 ],
       [-0.81399315, -0.81217559, -0.71244351, ...,  1.72626765,
        -0.54831888, -0.6491753 ],
       [ 0.70636652,  1.11292648, -0.71244351, ..., -0.57928445,
        -0.54831888, -0.6491753 ]])

## Model Creation

In [227]:
# Pairwise cosine similarity between the scaled feature rows: entry [i, j]
# compares row i with row j of customer_features_scaled.
similarity_matrix = cosine_similarity(X=customer_features_scaled)

In [228]:
# Build the 3 most similar customers (with scores) for the first 20 customers
# listed in `customers`.
lookalike_recommendations = {}

first_20_customers = customers['CustomerID'].head(20).tolist()

# Row position of every customer inside customer_features. similarity_matrix is
# computed from those rows in the same order, so the position is also the
# matrix row. Positions are used instead of index labels so the lookup stays
# correct even if the frame's index is not a clean 0..n-1 range, and the
# mapping is built once rather than re-scanning the frame per customer.
row_of_customer = {
    customer_id: row
    for row, customer_id in enumerate(customer_features['CustomerID'])
}

for customer in first_20_customers:
    customer_row = row_of_customer.get(customer)
    if customer_row is None:
        # customer_features is assembled with inner merges on transaction
        # aggregates, so a customer without any transaction has no feature
        # row. Record an empty list instead of failing with an IndexError.
        lookalike_recommendations[customer] = []
        continue

    similarity_scores = similarity_matrix[customer_row]

    # Rank all rows by similarity, highest first, then remove the customer
    # explicitly. Dropping "the first entry" is not safe: another customer
    # with an identical profile ties with the self-score, and the tie order
    # is not guaranteed.
    ranked_rows = np.argsort(similarity_scores)[::-1]
    top_rows = ranked_rows[ranked_rows != customer_row][:3]

    top_customers = customer_features['CustomerID'].iloc[top_rows].tolist()
    top_scores = similarity_scores[top_rows].tolist()

    lookalike_recommendations[customer] = list(zip(top_customers, top_scores))

# One row per customer; LookalikeCustomers holds a list of (CustomerID, score) tuples.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_recommendations.keys()),
    'LookalikeCustomers': list(lookalike_recommendations.values()),
})

lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeCustomers
0,C0001,"[(C0190, 0.9912646091305255), (C0181, 0.985897..."
1,C0002,"[(C0088, 0.9977456261265659), (C0134, 0.987231..."
2,C0003,"[(C0052, 0.9953373630582804), (C0152, 0.992077..."
3,C0004,"[(C0155, 0.9880648609193856), (C0169, 0.976769..."
4,C0005,"[(C0146, 0.99446449052155), (C0186, 0.99171355..."


In [229]:
# Persist the lookalike table; the DataFrame index is not written to the file.
lookalike_df.to_csv(path_or_buf='Ojas_Sinha_Lookalike.csv', index=False)

In [230]:
def get_customer_recommendations(customer_id: str, top_n: int = 3,
                                 features: "pd.DataFrame | None" = None,
                                 similarity: "np.ndarray | None" = None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : str
        Value to look up in the ``CustomerID`` column of ``features``.
    top_n : int, default 3
        Number of lookalikes to return. Fewer are returned when there are not
        enough other customers.
    features : pandas.DataFrame, optional
        Frame with a ``CustomerID`` column. Defaults to the notebook-level
        ``customer_features``.
    similarity : numpy.ndarray, optional
        Square similarity matrix whose row/column ``i`` corresponds to row
        ``i`` of ``features``. Defaults to the notebook-level
        ``similarity_matrix``.

    Returns
    -------
    list[tuple[str, float]]
        ``(CustomerID, similarity score)`` pairs, most similar first. The
        queried customer is never part of the result.

    Raises
    ------
    IndexError
        If ``customer_id`` is not present in ``features``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Fall back to the notebook globals only when no explicit inputs are given.
    features = customer_features if features is None else features
    similarity = similarity_matrix if similarity is None else similarity

    # Locate the customer by row *position*: the similarity matrix is indexed
    # positionally, so index labels must not be used here.
    matches = np.flatnonzero((features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer features")
    customer_row = int(matches[0])

    similarity_scores = similarity[customer_row]

    # Rank all rows by similarity, highest first, then remove the customer
    # explicitly. Dropping "the first entry" is not safe: another customer with
    # an identical profile ties with the self-score, and the tie order is not
    # guaranteed.
    ranked_rows = np.argsort(similarity_scores)[::-1]
    top_rows = ranked_rows[ranked_rows != customer_row][:top_n]

    top_customers = features['CustomerID'].iloc[top_rows].tolist()
    top_scores = similarity_scores[top_rows].tolist()

    return list(zip(top_customers, top_scores))

In [231]:
# Example lookup: the closest lookalikes for customer C0001.
get_customer_recommendations(customer_id='C0001')

[('C0190', 0.9912646091305255),
 ('C0181', 0.9858974245912884),
 ('C0048', 0.9842447396023727)]