In [144]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Data Preparation

In [145]:
# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [146]:
# Merge transactions with products to get category information
transactions = pd.merge(transactions, products[['ProductID', 'Category']], on='ProductID', how='left')
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics


In [147]:
# Aggregate transaction data by customer and category
category_aggregates = transactions.pivot_table(index='CustomerID', columns='Category', values='Quantity', aggfunc='sum', fill_value=0).reset_index()
category_aggregates.head()

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,2,0,7,3
1,C0002,0,4,0,6
2,C0003,0,4,4,6
3,C0004,8,0,6,9
4,C0005,0,0,4,3


In [148]:
# Aggregate total transaction data
customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()
customer_transactions.head()

Unnamed: 0,CustomerID,TotalValue,Quantity
0,C0001,3354.52,12
1,C0002,1862.74,10
2,C0003,2725.38,14
3,C0004,5354.88,23
4,C0005,2034.24,7


In [149]:
# Merge with customer data
customer_data = pd.merge(customers, customer_transactions, on='CustomerID', how='left').fillna(0)
customer_data = pd.merge(customer_data, category_aggregates, on='CustomerID', how='left').fillna(0)
customer_data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalValue,Quantity,Books,Clothing,Electronics,Home Decor
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,12.0,2.0,0.0,7.0,3.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,10.0,0.0,4.0,0.0,6.0
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,14.0,0.0,4.0,4.0,6.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,23.0,8.0,0.0,6.0,9.0
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,7.0,0.0,0.0,4.0,3.0


# Feature Engineering

In [150]:
# Convert categorical data to numerical
customer_data['Region'] = customer_data['Region'].astype('category').cat.codes
customer_data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalValue,Quantity,Books,Clothing,Electronics,Home Decor
0,C0001,Lawrence Carroll,3,2022-07-10,3354.52,12.0,2.0,0.0,7.0,3.0
1,C0002,Elizabeth Lutz,0,2022-02-13,1862.74,10.0,0.0,4.0,0.0,6.0
2,C0003,Michael Rivera,3,2024-03-07,2725.38,14.0,0.0,4.0,4.0,6.0
3,C0004,Kathleen Rodriguez,3,2022-10-09,5354.88,23.0,8.0,0.0,6.0,9.0
4,C0005,Laura Weber,0,2022-08-15,2034.24,7.0,0.0,0.0,4.0,3.0


In [155]:
# Select features
features = customer_data[['Region', 'TotalValue', 'Quantity', 'Books', 'Clothing', 'Home Decor', 'Electronics']]
# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Using K Nearest Neighbours (n = 4)

In [152]:
# Fit KNN model
knn = NearestNeighbors(n_neighbors = 4, metric='cosine')
knn.fit(features_scaled)

# Find the top 3 neighbors for the first 20 customers
distances, indices = knn.kneighbors(features_scaled[:20])

In [153]:
lookalike_knn = {}

for i, customer_id in enumerate(customer_data['CustomerID'][:20]):
    # Get the indices and distances of the top 3 lookalikes
    lookalike_indices = indices[i][1:]  # Exclude the customer themselves
    lookalike_distances = distances[i][1:]
    
    # Map customer ID to lookalikes and their similarity scores
    lookalike_knn[customer_id] = [(customer_data['CustomerID'][idx], 1 - dist) for idx, dist in zip(lookalike_indices, lookalike_distances)]

# Convert to DataFrame for output
lookalike_df_knn = pd.DataFrame.from_dict(lookalike_knn, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])

# Save to CSV
lookalike_df_knn.to_csv('Soham_Gupta_Lookalike.csv')

In [154]:
lookalike_df_knn.head()

Unnamed: 0,Lookalike1,Lookalike2,Lookalike3
C0001,"(C0120, 0.8888322228730283)","(C0154, 0.8495110924258484)","(C0026, 0.8333766140017796)"
C0002,"(C0159, 0.9728421756438311)","(C0178, 0.9592723523697879)","(C0164, 0.860996911896778)"
C0003,"(C0195, 0.8717104992714305)","(C0031, 0.8201693612903813)","(C0133, 0.7865752036364518)"
C0004,"(C0012, 0.9257230976494083)","(C0113, 0.9145555624404794)","(C0065, 0.9130181829832341)"
C0005,"(C0007, 0.9808381443227645)","(C0140, 0.9628495998371851)","(C0197, 0.8763099434802745)"
