In [1]:
import pandas as pd


In [6]:
customer_df = pd.read_csv('Customers.csv')
product_df = pd.read_csv('Products.csv')


In [7]:
region_encoded = pd.get_dummies(customer_df['Region'], prefix='Region', dtype=int)

In [8]:
from datetime import datetime
customer_df['SignupDate'] = pd.to_datetime(customer_df['SignupDate'])
customer_df['CustomerTime'] = (datetime.now() - customer_df['SignupDate']).dt.days

In [9]:
customer_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,CustomerTime
0,C0001,Lawrence Carroll,South America,2022-07-10,932
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1079
2,C0003,Michael Rivera,South America,2024-03-07,326
3,C0004,Kathleen Rodriguez,South America,2022-10-09,841
4,C0005,Laura Weber,Asia,2022-08-15,896


In [12]:
avg_price_per_category = product_df.groupby('Category')['Price'].mean().reset_index()
avg_price_per_category.columns = ['Category', 'AvgPrice']

In [13]:
category_encoded = pd.get_dummies(product_df['Category'], prefix='Category').mean().to_frame().T

In [14]:
customer_features = pd.concat([customer_df[['CustomerID', 'CustomerTime']], region_encoded], axis=1)
product_features = pd.concat([category_encoded] * len(customer_df), ignore_index=True)
final_features = pd.concat([customer_features.reset_index(drop=True), product_features], axis=1)
final_features.head()

Unnamed: 0,CustomerID,CustomerTime,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,932,0,0,0,1,0.26,0.25,0.26,0.23
1,C0002,1079,1,0,0,0,0.26,0.25,0.26,0.23
2,C0003,326,0,0,0,1,0.26,0.25,0.26,0.23
3,C0004,841,0,0,0,1,0.26,0.25,0.26,0.23
4,C0005,896,1,0,0,0,0.26,0.25,0.26,0.23


In [15]:
#Building the similarity model

from sklearn.metrics.pairwise import cosine_similarity

customer_ids = final_features['CustomerID']
feature_matrix = final_features.drop(columns=['CustomerID'])


I will use cosine similarity for comparing the similarity of the different customers


In [16]:
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

In [17]:
similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,0.999999,0.999998,1.0,0.999999,0.999999,0.999999,0.999996,0.999998,0.999999,...,0.999997,1.0,0.999998,0.999992,0.999974,0.999999,0.999998,0.999999,0.999999,0.999998
C0002,0.999999,1.0,0.999994,0.999999,1.0,0.999996,1.0,0.999996,0.999998,0.999999,...,0.999993,0.999999,1.0,0.999992,0.999965,0.999999,0.999998,0.999999,0.999999,1.0
C0003,0.999998,0.999994,1.0,0.999998,0.999994,1.0,0.999994,0.999992,0.999993,0.999994,...,1.0,0.999998,0.999994,0.999988,0.999987,0.999994,0.999994,0.999994,0.999994,0.999994
C0004,1.0,0.999999,0.999998,1.0,0.999999,0.999999,0.999999,0.999996,0.999997,0.999998,...,0.999997,1.0,0.999998,0.999992,0.999974,0.999999,0.999998,0.999999,0.999998,0.999998
C0005,0.999999,1.0,0.999994,0.999999,1.0,0.999996,1.0,0.999996,0.999998,0.999999,...,0.999993,0.999999,1.0,0.999992,0.999965,0.999999,0.999998,0.999999,0.999999,1.0


In [18]:
def get_top_n_similar_customers(customer_id, similarity_df, top_n=3):
    similar_scores = similarity_df.loc[customer_id].sort_values(ascending=False)
    similar_scores = similar_scores.drop(customer_id)
    return similar_scores.head(top_n)

lookalike_map = {}

for customer_id in customer_ids[:20]:
    similar_customers = get_top_n_similar_customers(customer_id, similarity_df)
    lookalike_map[customer_id] = list(zip(similar_customers.index, similar_customers.values))
    

In [19]:
lookalike_map

{'C0001': [('C0112', 0.9999999999925892),
  ('C0025', 0.9999999999705456),
  ('C0071', 0.9999999999341492)],
 'C0002': [('C0134', 0.9999999999832064),
  ('C0045', 0.9999999999320677),
  ('C0040', 0.9999999997673304)],
 'C0003': [('C0052', 0.9999999871225099),
  ('C0126', 0.9999999871225099),
  ('C0076', 0.9999999701523943)],
 'C0004': [('C0108', 0.9999999998526328),
  ('C0102', 0.999999999373497),
  ('C0192', 0.9999999993194105)],
 'C0005': [('C0159', 0.9999999999229701),
  ('C0106', 0.9999999984722103),
  ('C0007', 0.9999999971210749)],
 'C0006': [('C0076', 0.9999999582719943),
  ('C0181', 0.9999999408484918),
  ('C0052', 0.9999999305361762)],
 'C0007': [('C0159', 0.9999999979858796),
  ('C0175', 0.999999997363488),
  ('C0005', 0.9999999971210749)],
 'C0008': [('C0189', 0.9999999974516989),
  ('C0016', 0.9999999971530179),
  ('C0183', 0.9999999968369206)],
 'C0009': [('C0121', 0.9999999965159265),
  ('C0170', 0.9999999921096518),
  ('C0164', 0.9999999670557241)],
 'C0010': [('C0062', 

In [20]:
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0112, 0.9999999999925892), (C0025, 0.999999..."
1,C0002,"[(C0134, 0.9999999999832064), (C0045, 0.999999..."
2,C0003,"[(C0052, 0.9999999871225099), (C0126, 0.999999..."
3,C0004,"[(C0108, 0.9999999998526328), (C0102, 0.999999..."
4,C0005,"[(C0159, 0.9999999999229701), (C0106, 0.999999..."


In [21]:
lookalike_df.to_csv('Lookalike.csv', index=False)