In [9]:
import pandas as pd

# Load the three source tables.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Enrich every transaction with its product attributes (joined on ProductID)
# and its customer attributes (joined on CustomerID). Left joins keep every
# transaction row even when a lookup row is missing.
merged_data = (
    transactions
    .merge(products, on='ProductID', how='left')
    .merge(customers, on='CustomerID', how='left')
)

merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [10]:
# Feature engineering: collapse the transaction-level table to one row per
# customer with total spend, total units bought and number of distinct products.
per_customer = merged_data.groupby('CustomerID')
customer_transactions = per_customer.agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    total_quantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
    unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
)
# Turn the CustomerID group key back into an ordinary column.
customer_transactions = customer_transactions.reset_index()

customer_transactions.head()

Unnamed: 0,CustomerID,total_spent,total_quantity,unique_products
0,C0001,3354.52,12,5
1,C0002,1862.74,10,4
2,C0003,2725.38,14,4
3,C0004,5354.88,23,8
4,C0005,2034.24,7,3


In [11]:
# Data normalization: standardize the three engineered features so that
# no single feature dominates purely because of its scale.
from sklearn.preprocessing import StandardScaler

feature_matrix = customer_transactions[['total_spent', 'total_quantity', 'unique_products']]

scaler = StandardScaler()
customer_transactions_scaled = scaler.fit_transform(feature_matrix)

# Preview the first five scaled rows.
customer_transactions_scaled[:5]

array([[-0.06170143, -0.12203296,  0.05004655],
       [-0.87774353, -0.44800021, -0.42420409],
       [-0.40585722,  0.20393428, -0.42420409],
       [ 1.03254704,  1.67078689,  1.47279848],
       [-0.78392861, -0.93695108, -0.89845473]])

In [12]:
# PCA for dimensionality reduction: project the three standardized customer
# features onto two principal components.
from sklearn.decomposition import PCA

# Keep two components; the input has three feature columns.
pca = PCA(n_components=2)
# Fit the projection and transform the scaled features in one step.
customer_transactions_pca = pca.fit_transform(customer_transactions_scaled)

# Preview the first five projected rows.
customer_transactions_pca[:5]

array([[-0.07922921,  0.08513238],
       [-1.00792369,  0.30391738],
       [-0.35168428, -0.04990284],
       [ 2.41625507,  0.29709987],
       [-1.51324571, -0.08187395]])

In [13]:
# Cosine similarity between every pair of customers in the PCA space.
# NOTE(review): cosine similarity compares direction only, not magnitude, so
# customers at very different distances from the origin can still score ~1.
from sklearn.metrics.pairwise import cosine_similarity

cos_sim = cosine_similarity(customer_transactions_pca)

# Label both axes with the customer IDs so scores can be looked up by ID.
customer_ids = customer_transactions['CustomerID']
cos_sim_df = pd.DataFrame(cos_sim, index=customer_ids, columns=customer_ids)

cos_sim_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.863594,0.571672,-0.586842,0.640728,-0.639244,0.496269,-0.199404,0.791688,0.902533,...,0.98789,0.819992,0.498755,-0.132693,-0.560158,-0.66866,0.64775,0.684502,0.839821,-0.966182
C0002,0.863594,1.0,0.907369,-0.915034,0.940428,-0.164326,0.866295,-0.666265,0.991718,0.996536,...,0.931363,0.996725,0.867723,-0.614322,-0.901411,-0.202553,0.94351,0.95869,0.998969,-0.9644
C0003,0.571672,0.907369,1.0,-0.999827,0.996226,0.265516,0.99602,-0.917998,0.953839,0.869269,...,0.692051,0.93839,0.996272,-0.889084,-0.999902,0.227831,0.995387,0.989451,0.925512,-0.76391
C0004,-0.586842,-0.915034,-0.999827,1.0,-0.997669,-0.247528,-0.994189,0.910459,-0.959263,-0.878318,...,-0.705364,-0.944658,-0.994494,0.880411,0.999469,-0.209671,-0.997,-0.991976,-0.9324,0.775787
C0005,0.640728,0.940428,0.996226,-0.997669,1.0,0.180835,0.984526,-0.880113,0.976306,0.908894,...,0.752091,0.964843,0.985024,-0.845999,-0.994917,0.14246,0.999958,0.998291,0.95489,-0.817037


In [14]:
# Get Top 3 Similar Customers for Each Customer
def get_top_similar_customers(customer_id, top_n=3, similarity_df=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : label
        Column label of the customer to look up in the similarity matrix.
    top_n : int, default 3
        Number of lookalike customers to return (expected to be >= 0).
    similarity_df : pandas.DataFrame, optional
        Similarity matrix labelled with customer IDs on both axes. When
        omitted, the notebook-level ``cos_sim_df`` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by customer ID, sorted from most to least
        similar, never containing ``customer_id`` itself. The Series name is
        ``customer_id``.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        similarity_df = cos_sim_df

    scores = similarity_df[customer_id]

    # Remove the customer's own score by label instead of assuming it sorts
    # first: another customer can tie at 1.0 (or round marginally above it),
    # in which case skipping position 0 would drop a real match and return
    # the customer as their own lookalike.
    other_scores = scores.drop(labels=customer_id, errors='ignore')

    # Stable sort so that the order among tied scores is reproducible.
    ranked = other_scores.sort_values(ascending=False, kind='mergesort')
    return ranked.head(top_n)

# Example: the closest lookalikes for customer C0001 (default top_n).
get_top_similar_customers(customer_id='C0001')

CustomerID
C0056    0.999814
C0174    0.999534
C0055    0.999512
Name: C0001, dtype: float64

In [15]:
# Lookalike recommendations for the first 20 customers.
first_customers = customer_transactions['CustomerID'][:20]

# Map each customer ID to the Series of its most similar customers.
lookalike_data = {
    cust_id: get_top_similar_customers(cust_id)
    for cust_id in first_customers
}

# One row per customer: the ID, the lookalike IDs and the matching scores.
lookalike_rows = [
    (cust_id, matches.index.tolist(), matches.values.tolist())
    for cust_id, matches in lookalike_data.items()
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=['CustomerID', 'Lookalike_Customers', 'Similarity_Scores'],
)

lookalike_df.head()

Unnamed: 0,CustomerID,Lookalike_Customers,Similarity_Scores
0,C0001,"[C0056, C0174, C0055]","[0.9998135484854056, 0.9995343966044378, 0.999..."
1,C0002,"[C0029, C0025, C0031]","[0.9998262686091017, 0.9997027220387564, 0.999..."
2,C0003,"[C0167, C0042, C0133]","[0.9999999843129921, 0.9999976410047594, 0.999..."
3,C0004,"[C0075, C0091, C0108]","[0.999994698864954, 0.9996507840777626, 0.9996..."
4,C0005,"[C0095, C0197, C0112]","[0.9999701739750527, 0.9999578444720515, 0.999..."


In [16]:
# Write the lookalike table to CSV, leaving out the positional row index.
lookalike_df.to_csv(path_or_buf='Shreya_Goswami_Lookalike.csv', index=False)