In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import csv

In [115]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [117]:
full_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [119]:
full_data.drop('Price_x', axis=1, inplace=True)
full_data.rename(columns={'Price_y': 'ProductPrice'}, inplace=True)

In [121]:
full_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,ProductPrice
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [123]:
customer_features = full_data.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    NumTransactions=('TransactionID', 'count'),
    AvgSpending=('TotalValue', 'mean'),
    DistinctCategories=('Category', 'nunique'),
    MostPurchasedCategory=('Category', lambda x: x.mode()[0])
).reset_index()

In [135]:
customer_features.head()

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,AvgSpending,DistinctCategories,MostPurchasedCategory
0,C0001,3354.52,5,670.904,3,Electronics
1,C0002,1862.74,4,465.685,2,Clothing
2,C0003,2725.38,4,681.345,3,Home Decor
3,C0004,5354.88,8,669.36,3,Books
4,C0005,2034.24,3,678.08,2,Electronics


In [127]:
final_data = pd.merge(customer_features, full_data[['CustomerID', 'Region']].drop_duplicates(), on='CustomerID')

In [129]:
final_data.head()

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,AvgSpending,DistinctCategories,MostPurchasedCategory,Region
0,C0001,3354.52,5,670.904,3,Electronics,South America
1,C0002,1862.74,4,465.685,2,Clothing,Asia
2,C0003,2725.38,4,681.345,3,Home Decor,South America
3,C0004,5354.88,8,669.36,3,Books,South America
4,C0005,2034.24,3,678.08,2,Electronics,Asia


Cosine Similarity is used for this because we have both numerical columns and categorical columns (which we will change into numeric using one hot encoding).

All the numerical columns must be on same scale before performing cosine similarity.

#### Scaling Numerical Data

In [141]:
numerical_columns = ['TotalSpending', 'NumTransactions', 'AvgSpending', 'DistinctCategories']
scaler = MinMaxScaler()
final_data[numerical_columns] = scaler.fit_transform(final_data[numerical_columns])

In [143]:
final_data.head()

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,AvgSpending,DistinctCategories,MostPurchasedCategory,Region
0,C0001,0.308942,0.4,0.474336,0.666667,Electronics,South America
1,C0002,0.168095,0.3,0.30894,0.333333,Clothing,Asia
2,C0003,0.249541,0.3,0.482751,0.666667,Home Decor,South America
3,C0004,0.497806,0.7,0.473092,0.666667,Books,South America
4,C0005,0.184287,0.2,0.48012,0.333333,Electronics,Asia


#### OneHotEncoding for Categorical Data

In [145]:
encoded_data = pd.get_dummies(final_data, columns=['Region', 'MostPurchasedCategory'], drop_first=True)
encoded_data.head()

Unnamed: 0,CustomerID,TotalSpending,NumTransactions,AvgSpending,DistinctCategories,Region_Europe,Region_North America,Region_South America,MostPurchasedCategory_Clothing,MostPurchasedCategory_Electronics,MostPurchasedCategory_Home Decor
0,C0001,0.308942,0.4,0.474336,0.666667,False,False,True,False,True,False
1,C0002,0.168095,0.3,0.30894,0.333333,False,False,False,True,False,False
2,C0003,0.249541,0.3,0.482751,0.666667,False,False,True,False,False,True
3,C0004,0.497806,0.7,0.473092,0.666667,False,False,True,False,False,False
4,C0005,0.184287,0.2,0.48012,0.333333,False,False,False,False,True,False


In [147]:
feature_matrix = encoded_data.drop(columns=['CustomerID'])   # Excluding CustomerID for applying similarity matrix
feature_matrix.head()

Unnamed: 0,TotalSpending,NumTransactions,AvgSpending,DistinctCategories,Region_Europe,Region_North America,Region_South America,MostPurchasedCategory_Clothing,MostPurchasedCategory_Electronics,MostPurchasedCategory_Home Decor
0,0.308942,0.4,0.474336,0.666667,False,False,True,False,True,False
1,0.168095,0.3,0.30894,0.333333,False,False,False,True,False,False
2,0.249541,0.3,0.482751,0.666667,False,False,True,False,False,True
3,0.497806,0.7,0.473092,0.666667,False,False,True,False,False,False
4,0.184287,0.2,0.48012,0.333333,False,False,False,False,True,False


### Cosine Similarity Matrix

In [149]:
similarity_matrix = cosine_similarity(feature_matrix)
print(similarity_matrix[:5, :5])

[[1.         0.27467642 0.65018055 0.79260495 0.77987551]
 [0.27467642 1.         0.25994632 0.3708215  0.25588222]
 [0.65018055 0.25994632 1.         0.76918279 0.27979194]
 [0.79260495 0.3708215  0.76918279 1.         0.36905385]
 [0.77987551 0.25588222 0.27979194 0.36905385 1.        ]]


In [151]:
top_n = 3
lookalike_dict = {}

In [153]:
for i, customer_id in enumerate(final_data['CustomerID']):
    scores = similarity_matrix[i]
    scores[i] = -1

    top_indices = np.argsort(scores)[-top_n:][::-1]
 
    similar_customers = [(final_data['CustomerID'].iloc[idx], scores[idx]) for idx in top_indices]

    lookalike_dict[customer_id] = similar_customers

In [155]:
import pprint

In [157]:
pprint.pprint(dict(list(lookalike_dict.items())[:5]))

{'C0001': [('C0091', 0.9957792287495667),
           ('C0192', 0.9937621096989102),
           ('C0048', 0.986847042579986)],
 'C0002': [('C0134', 0.9943913144716523),
           ('C0092', 0.9808070911980618),
           ('C0106', 0.9685020818148578)],
 'C0003': [('C0152', 0.9977430428187105),
           ('C0031', 0.9957034978817269),
           ('C0158', 0.9929884473690259)],
 'C0004': [('C0155', 0.9966143151617873),
           ('C0165', 0.9965374289564791),
           ('C0126', 0.9901485237060832)],
 'C0005': [('C0186', 0.9981721462615398),
           ('C0007', 0.9938444546637496),
           ('C0140', 0.9840746983621196)]}


In [159]:
lookalike_list = [{'cust_id': customer_id, 'similar_customers': similar_customers} for customer_id, similar_customers in lookalike_dict.items()]

In [161]:
lookalike_list[:20]

[{'cust_id': 'C0001',
  'similar_customers': [('C0091', 0.9957792287495667),
   ('C0192', 0.9937621096989102),
   ('C0048', 0.986847042579986)]},
 {'cust_id': 'C0002',
  'similar_customers': [('C0134', 0.9943913144716523),
   ('C0092', 0.9808070911980618),
   ('C0106', 0.9685020818148578)]},
 {'cust_id': 'C0003',
  'similar_customers': [('C0152', 0.9977430428187105),
   ('C0031', 0.9957034978817269),
   ('C0158', 0.9929884473690259)]},
 {'cust_id': 'C0004',
  'similar_customers': [('C0155', 0.9966143151617873),
   ('C0165', 0.9965374289564791),
   ('C0126', 0.9901485237060832)]},
 {'cust_id': 'C0005',
  'similar_customers': [('C0186', 0.9981721462615398),
   ('C0007', 0.9938444546637496),
   ('C0140', 0.9840746983621196)]},
 {'cust_id': 'C0006',
  'similar_customers': [('C0187', 0.9950161911712916),
   ('C0085', 0.9901594443666802),
   ('C0011', 0.9849276543239699)]},
 {'cust_id': 'C0007',
  'similar_customers': [('C0115', 0.9950528575588955),
   ('C0140', 0.9942158186456764),
   ('C00

### Saving the First 20 Customer Lookalikes into an csv file

In [163]:
with open('NagaSaiThanmai_Pati_Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['cust_id', 'similar_customers'])
    for row in lookalike_list[:20]:
        writer.writerow([row['cust_id'], row['similar_customers']])