In [34]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [3]:
# Load the three raw input tables (customers, products, transactions) from
# CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [14]:
def preprocess_customers(customers, reference_date=None):
    """Build a per-customer feature table indexed by CustomerID.

    The input frame is NOT modified: the date conversion and the derived
    column are computed on separate objects, so re-running the cell (or
    inspecting ``customers`` afterwards) always sees the raw data.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain the columns 'CustomerID', 'SignupDate' and 'Region'.
    reference_date : datetime-like, optional
        Date from which the account age is measured. Defaults to the current
        time (``pd.Timestamp.now()``); pass a fixed date to make the
        'DaysSinceSignup' feature reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'CustomerID', with a 'DaysSinceSignup' column followed by
        one 'Region_<value>' indicator column per distinct region.
    """
    # Parse signup dates without writing the result back into the caller's frame.
    signup_dates = pd.to_datetime(customers['SignupDate'])
    as_of = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)
    days_since_signup = (as_of - signup_dates).dt.days.rename('DaysSinceSignup')

    # One-hot encode Region
    region_encoded = pd.get_dummies(customers['Region'], prefix='Region')

    # Combine processed features (all pieces share the row index of `customers`)
    processed_customers = pd.concat(
        [customers['CustomerID'], days_since_signup, region_encoded],
        axis=1,
    )
    return processed_customers.set_index('CustomerID')

# Build the customer feature table (indexed by CustomerID) from the raw customers frame.
processed_customers = preprocess_customers(customers)

In [15]:
# Inspect the processed customer features (rendered as this cell's output).
processed_customers

Unnamed: 0_level_0,DaysSinceSignup,Region_Asia,Region_Europe,Region_North America,Region_South America
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0001,929,False,False,False,True
C0002,1076,True,False,False,False
C0003,323,False,False,False,True
C0004,838,False,False,False,True
C0005,893,True,False,False,False
...,...,...,...,...,...
C0196,962,False,True,False,False
C0197,675,False,True,False,False
C0198,1062,False,True,False,False
C0199,783,False,True,False,False


In [18]:
# Enrich every transaction row: first with its product's columns (left join
# on ProductID), then with the processed customer features (left join on
# CustomerID, which is the index of processed_customers).
transactions_products = pd.merge(transactions, products, on='ProductID', how='left')
full_data = pd.merge(
    transactions_products,
    processed_customers,
    on='CustomerID',
    how='left',
)

In [19]:
# Per-customer purchase profile: total Quantity bought in each product
# Category; customer/category pairs with no purchases are filled with 0.
customer_product_matrix = pd.pivot_table(
    full_data,
    values='Quantity',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
)

In [20]:
# Inspect the customer-by-category quantity matrix (rendered as this cell's output).
customer_product_matrix

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,2,0,7,3
C0002,0,4,0,6
C0003,0,4,4,6
C0004,8,0,6,9
C0005,0,0,4,3
...,...,...,...,...
C0196,3,4,0,5
C0197,0,0,6,3
C0198,0,2,1,0
C0199,0,0,3,6


In [21]:
# Combine demographic and product data.
# Both frames are indexed by CustomerID; the inner join keeps only customers
# that appear in both (i.e. customers with no rows in the purchase matrix are
# dropped from the feature set).
combined_features = customer_product_matrix.join(processed_customers, how='inner')

In [22]:
# Inspect the combined purchase + demographic features (rendered as this cell's output).
combined_features

Unnamed: 0_level_0,Books,Clothing,Electronics,Home Decor,DaysSinceSignup,Region_Asia,Region_Europe,Region_North America,Region_South America
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C0001,2,0,7,3,929,False,False,False,True
C0002,0,4,0,6,1076,True,False,False,False
C0003,0,4,4,6,323,False,False,False,True
C0004,8,0,6,9,838,False,False,False,True
C0005,0,0,4,3,893,True,False,False,False
...,...,...,...,...,...,...,...,...,...
C0196,3,4,0,5,962,False,True,False,False
C0197,0,0,6,3,675,False,True,False,False
C0198,0,2,1,0,1062,False,True,False,False
C0199,0,0,3,6,783,False,True,False,False


In [23]:
# Standardize every feature column with StandardScaler, then wrap the
# resulting array back into a DataFrame that keeps the original CustomerID
# index and column labels.
scaler = StandardScaler()
combined_features_scaled = pd.DataFrame(
    scaler.fit_transform(combined_features),
    index=combined_features.index,
    columns=combined_features.columns,
)

In [24]:
# Inspect the standardized feature table (rendered as this cell's output).
combined_features_scaled

Unnamed: 0_level_0,Books,Clothing,Electronics,Home Decor,DaysSinceSignup,Region_Asia,Region_Europe,Region_North America,Region_South America
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C0001,-0.464594,-0.963893,1.255863,-0.069051,1.148752,-0.532795,-0.579284,-0.548319,1.540416
C0002,-1.117981,0.336546,-1.027971,0.912454,1.600431,1.876893,-0.579284,-0.548319,-0.649175
C0003,-1.117981,0.336546,0.277077,0.912454,-0.713270,-0.532795,-0.579284,-0.548319,1.540416
C0004,1.495566,-0.963893,0.929601,1.893958,0.869141,-0.532795,-0.579284,-0.548319,1.540416
C0005,-1.117981,-0.963893,0.277077,-0.069051,1.038137,1.876893,-0.579284,-0.548319,-0.649175
...,...,...,...,...,...,...,...,...,...
C0196,-0.137901,0.336546,-1.027971,0.585286,1.250149,-0.532795,1.726268,-0.548319,-0.649175
C0197,-1.117981,-0.963893,0.929601,-0.069051,0.368300,-0.532795,1.726268,-0.548319,-0.649175
C0198,-1.117981,-0.313674,-0.701709,-1.050555,1.557414,-0.532795,1.726268,-0.548319,-0.649175
C0199,-1.117981,-0.963893,-0.049185,0.912454,0.700146,-0.532795,1.726268,-0.548319,-0.649175


In [25]:
# Pairwise cosine similarity between the standardized customer feature rows,
# labelled on both axes with the customer IDs so entries can be looked up as
# similarity_df.loc[customer_a, customer_b].
similarity_matrix = cosine_similarity(combined_features_scaled)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=combined_features.index,
    columns=combined_features.index,
)

In [26]:
# Inspect the customer-by-customer similarity table (rendered as this cell's output).
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.078342,0.442355,0.601137,0.213813,0.137702,0.231037,-0.307783,-0.209478,-0.410813,...,0.245993,0.840122,-0.218339,-0.323372,0.170110,-0.217942,0.224473,0.046660,0.096102,-0.488874
C0002,-0.078342,1.000000,-0.074810,-0.108672,0.751679,-0.232674,0.793133,-0.048254,-0.058810,0.104696,...,-0.564501,-0.149402,0.198709,-0.344265,-0.020123,0.300867,-0.090541,0.215151,0.187141,0.702551
C0003,0.442355,-0.074810,1.000000,0.316279,-0.167979,0.518162,-0.139245,0.193646,-0.210577,-0.215548,...,0.058818,0.282931,-0.656086,-0.048248,0.938023,-0.273026,-0.084574,-0.336276,-0.025268,-0.181406
C0004,0.601137,-0.108672,0.316279,1.000000,-0.112122,0.286826,-0.030650,-0.225153,-0.611075,-0.564125,...,0.372190,0.339473,0.059772,-0.206347,0.309180,-0.077754,-0.120819,-0.389925,0.014180,-0.268242
C0005,0.213813,0.751679,-0.167979,-0.112122,1.000000,-0.491003,0.991886,-0.281694,0.009318,-0.254759,...,-0.386610,0.078907,0.289882,-0.302842,-0.249919,-0.063511,0.205148,0.205551,0.201427,0.407377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.217942,0.300867,-0.273026,-0.077754,-0.063511,-0.170903,-0.007954,-0.174388,0.576557,0.659860,...,-0.395166,-0.208105,-0.086673,-0.410446,-0.177709,1.000000,0.460384,0.748008,0.724437,0.022219
C0197,0.224473,-0.090541,-0.084574,-0.120819,0.205148,-0.577432,0.204946,-0.208645,0.698912,0.178930,...,-0.376541,0.004934,-0.379691,-0.200723,-0.170270,0.460384,1.000000,0.675457,0.861629,-0.480337
C0198,0.046660,0.215151,-0.336276,-0.389925,0.205551,-0.310606,0.192315,-0.340958,0.844741,0.626354,...,-0.323724,0.178915,-0.098057,-0.424795,-0.445250,0.748008,0.675457,1.000000,0.681451,-0.186321
C0199,0.096102,0.187141,-0.025268,0.014180,0.201427,-0.440443,0.249200,-0.189240,0.632327,0.195613,...,-0.479218,-0.145424,-0.357427,-0.246054,0.001192,0.724437,0.861629,0.681451,1.000000,-0.333889


In [30]:
# Get top 3 lookalikes for the first 20 rows of the customers table
# (C0001 - C0020, assuming Customers.csv is ordered by CustomerID).
lookalikes = {}
for customer_id in customers['CustomerID'].head(20):
    # Customers missing from the similarity table have no feature row, so
    # there is nothing to rank for them.
    if customer_id in similarity_df.index:
        # Remove the customer's own entry by label before ranking. Skipping
        # "position 0" instead is only correct when the self-similarity sorts
        # strictly first; a tie with an identical customer (or floating-point
        # rounding of the 1.0 on the diagonal) could otherwise list the
        # customer as their own lookalike and drop a genuine match.
        similar_customers = (
            similarity_df[customer_id]
            .drop(customer_id)
            .sort_values(ascending=False)
            .head(3)
        )
        # Stored as [(other_customer_id, similarity_score), ...], best first.
        lookalikes[customer_id] = list(similar_customers.items())

In [31]:
# Flatten the lookalike mapping into a two-column table: one row per customer,
# with that customer's list of (lookalike_id, score) pairs stored as its
# string representation.
lookalike_df = pd.DataFrame(
    {
        'CustomerID': list(lookalikes),
        'Lookalikes': [str(matches) for matches in lookalikes.values()],
    }
)

In [32]:
# Inspect the lookalike table (rendered as this cell's output).
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0184', 0.8984872945344419), ('C0120', 0.89..."
1,C0002,"[('C0159', 0.9791649363227918), ('C0134', 0.90..."
2,C0003,"[('C0195', 0.9380228042679383), ('C0031', 0.92..."
3,C0004,"[('C0148', 0.9083726875591586), ('C0113', 0.90..."
4,C0005,"[('C0007', 0.9918864656974362), ('C0140', 0.90..."
5,C0006,"[('C0187', 0.8745045513757316), ('C0137', 0.81..."
6,C0007,"[('C0005', 0.9918864656974362), ('C0140', 0.85..."
7,C0008,"[('C0067', 0.8132364960146746), ('C0046', 0.79..."
8,C0009,"[('C0061', 0.8941300211715121), ('C0119', 0.88..."
9,C0010,"[('C0062', 0.8165966708570689), ('C0135', 0.77..."


In [33]:
# Save results to Lookalike.csv in the current working directory.
# index=False keeps the DataFrame's row index out of the file, so the CSV has
# exactly the two columns CustomerID and Lookalikes.
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike recommendations have been saved to Lookalike.csv")

Lookalike recommendations have been saved to Lookalike.csv


In [36]:
# Re-display the similarity table (same object as shown after it was computed).
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,-0.078342,0.442355,0.601137,0.213813,0.137702,0.231037,-0.307783,-0.209478,-0.410813,...,0.245993,0.840122,-0.218339,-0.323372,0.170110,-0.217942,0.224473,0.046660,0.096102,-0.488874
C0002,-0.078342,1.000000,-0.074810,-0.108672,0.751679,-0.232674,0.793133,-0.048254,-0.058810,0.104696,...,-0.564501,-0.149402,0.198709,-0.344265,-0.020123,0.300867,-0.090541,0.215151,0.187141,0.702551
C0003,0.442355,-0.074810,1.000000,0.316279,-0.167979,0.518162,-0.139245,0.193646,-0.210577,-0.215548,...,0.058818,0.282931,-0.656086,-0.048248,0.938023,-0.273026,-0.084574,-0.336276,-0.025268,-0.181406
C0004,0.601137,-0.108672,0.316279,1.000000,-0.112122,0.286826,-0.030650,-0.225153,-0.611075,-0.564125,...,0.372190,0.339473,0.059772,-0.206347,0.309180,-0.077754,-0.120819,-0.389925,0.014180,-0.268242
C0005,0.213813,0.751679,-0.167979,-0.112122,1.000000,-0.491003,0.991886,-0.281694,0.009318,-0.254759,...,-0.386610,0.078907,0.289882,-0.302842,-0.249919,-0.063511,0.205148,0.205551,0.201427,0.407377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,-0.217942,0.300867,-0.273026,-0.077754,-0.063511,-0.170903,-0.007954,-0.174388,0.576557,0.659860,...,-0.395166,-0.208105,-0.086673,-0.410446,-0.177709,1.000000,0.460384,0.748008,0.724437,0.022219
C0197,0.224473,-0.090541,-0.084574,-0.120819,0.205148,-0.577432,0.204946,-0.208645,0.698912,0.178930,...,-0.376541,0.004934,-0.379691,-0.200723,-0.170270,0.460384,1.000000,0.675457,0.861629,-0.480337
C0198,0.046660,0.215151,-0.336276,-0.389925,0.205551,-0.310606,0.192315,-0.340958,0.844741,0.626354,...,-0.323724,0.178915,-0.098057,-0.424795,-0.445250,0.748008,0.675457,1.000000,0.681451,-0.186321
C0199,0.096102,0.187141,-0.025268,0.014180,0.201427,-0.440443,0.249200,-0.189240,0.632327,0.195613,...,-0.479218,-0.145424,-0.357427,-0.246054,0.001192,0.724437,0.861629,0.681451,1.000000,-0.333889


In [37]:
# Inspect the raw lookalike mapping: customer ID -> [(lookalike_id, score), ...].
lookalikes

{'C0001': [('C0184', 0.8984872945344419),
  ('C0120', 0.8945133695232633),
  ('C0168', 0.856149518280859)],
 'C0002': [('C0159', 0.9791649363227918),
  ('C0134', 0.9045664151273745),
  ('C0106', 0.8846502971897854)],
 'C0003': [('C0195', 0.9380228042679383),
  ('C0031', 0.9264525269784128),
  ('C0039', 0.8851972430413743)],
 'C0004': [('C0148', 0.9083726875591586),
  ('C0113', 0.9081421155797567),
  ('C0012', 0.7475380685496319)],
 'C0005': [('C0007', 0.9918864656974362),
  ('C0140', 0.9093175950954265),
  ('C0128', 0.8426363132770903)],
 'C0006': [('C0187', 0.8745045513757316),
  ('C0137', 0.8168088348224718),
  ('C0048', 0.7778708045253804)],
 'C0007': [('C0005', 0.9918864656974362),
  ('C0140', 0.8549305376803197),
  ('C0159', 0.8337626698570737)],
 'C0008': [('C0067', 0.8132364960146746),
  ('C0046', 0.7981585957778714),
  ('C0059', 0.7718260186462986)],
 'C0009': [('C0061', 0.8941300211715121),
  ('C0119', 0.888377250795355),
  ('C0014', 0.8821315549321366)],
 'C0010': [('C0062', 

In [38]:
def evaluate_similarity(lookalike_map=None, similarity=None):
    """Round-trip check of stored lookalike scores against a similarity table.

    For each customer, the scores recorded in the lookalike mapping are
    compared (mean squared error) with the values re-read from the similarity
    table for the same customer pairs; the per-customer errors are averaged.

    NOTE: when the mapping was built from the very same similarity table, the
    result is 0.0 by construction. This verifies that the stored scores match
    the table; it does not measure how good the recommendations are.

    Parameters
    ----------
    lookalike_map : dict, optional
        Maps customer ID -> list of (other_customer_id, score) pairs.
        Defaults to the notebook-level ``lookalikes``.
    similarity : pandas.DataFrame, optional
        Customer-by-customer similarity table supporting
        ``.loc[customer_id, [other_ids]]``. Defaults to the notebook-level
        ``similarity_df``.

    Returns
    -------
    float
        Mean of the per-customer mean squared errors, or NaN when there is
        nothing to evaluate (previously this raised ZeroDivisionError).
    """
    # Fall back to the notebook globals so existing zero-argument calls keep working.
    if lookalike_map is None:
        lookalike_map = lookalikes
    if similarity is None:
        similarity = similarity_df

    errors = []
    for customer_id, similar_list in lookalike_map.items():
        if not similar_list:
            # No stored pairs for this customer: nothing to compare.
            continue
        neighbour_ids = [neighbour_id for neighbour_id, _ in similar_list]
        predicted_similarities = [score for _, score in similar_list]
        actual_similarities = similarity.loc[customer_id, neighbour_ids].values
        errors.append(mean_squared_error(actual_similarities, predicted_similarities))

    if not errors:
        return float("nan")
    return sum(errors) / len(errors)

# Run the check and report it. The scores stored in `lookalikes` are compared
# with the same entries re-read from `similarity_df`, so this number reflects
# consistency between the two objects rather than recommendation quality.
mse = evaluate_similarity()
print(f"Mean Squared Error of similarity scores: {mse}")

Mean Squared Error of similarity scores: 0.0
