In [2]:
import pandas as pb;
from sklearn.preprocessing import MinMaxScaler;
from sklearn.metrics.pairwise import cosine_similarity;

In [3]:
Customers = pb.read_csv('Customers.csv')
Products = pb.read_csv('Products.csv')
Transactions = pb.read_csv('Transactions.csv')

In [4]:
Transactions['TransactionDate'] = pb.to_datetime(Transactions['TransactionDate'])
Customers['SignupDate'] = pb.to_datetime(Customers['SignupDate'])

In [5]:
CustomerTransactions = Transactions.merge(Customers,on='CustomerID',how='inner')
print(CustomerTransactions.head(5))

  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue   Price     CustomerName         Region SignupDate  
0      300.68  300.68   Andrea Jenkins         Europe 2022-12-03  
1      300.68  300.68  Brittany Harvey           Asia 2024-09-04  
2      300.68  300.68  Kathryn Stevens         Europe 2024-04-04  
3      601.36  300.68  Travis Campbell  South America 2024-04-11  
4      902.04  300.68    Timothy Perez         Europe 2022-03-15  


In [6]:
CustomerTransactions = Transactions.merge(Products,on='ProductID',how='left')
print(CustomerTransactions.head(5))

  TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0        T00001      C0199      P067 2024-08-25 12:38:23         1   
1        T00112      C0146      P067 2024-05-27 22:23:54         1   
2        T00166      C0127      P067 2024-04-25 07:38:55         1   
3        T00272      C0087      P067 2024-03-26 22:55:37         2   
4        T00363      C0070      P067 2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  


In [7]:
TotalSpending = CustomerTransactions.groupby('CustomerID')['TotalValue'].sum()
AverageTransaction = CustomerTransactions.groupby('CustomerID')['TotalValue'].mean().rename('AverageTransaction')
NumberOfTransactions = CustomerTransactions.groupby('CustomerID')['TransactionID'].count()

MostPurchasedCategory = (CustomerTransactions.groupby(['CustomerID', 'Category'])['TransactionID'].count().reset_index().sort_values(['CustomerID', 'TransactionID'], ascending=[True, False]).groupby('CustomerID').first()['Category']).rename('MostPurchasedCategory')

In [8]:
CustomerFeatures = pb.concat([TotalSpending, AverageTransaction, NumberOfTransactions, MostPurchasedCategory], axis=1)
print(CustomerFeatures.head())

            TotalValue  AverageTransaction  TransactionID  \
CustomerID                                                  
C0001          3354.52             670.904              5   
C0002          1862.74             465.685              4   
C0003          2725.38             681.345              4   
C0004          5354.88             669.360              8   
C0005          2034.24             678.080              3   

           MostPurchasedCategory  
CustomerID                        
C0001                Electronics  
C0002                   Clothing  
C0003                 Home Decor  
C0004                      Books  
C0005                Electronics  


In [9]:
NumericalColumns = ['TotalValue', 'AverageTransaction', 'TransactionID']
scaler = MinMaxScaler()

In [10]:
NormalizedData = scaler.fit_transform(CustomerFeatures[NumericalColumns])
NormalizedFeatures = pb.DataFrame(NormalizedData, columns=[f"Normalized_{col}" for col in NumericalColumns], index=CustomerFeatures.index)

CustomerFeaturesNormalized = CustomerFeatures.copy()
CustomerFeaturesNormalized = pb.concat([CustomerFeaturesNormalized,NormalizedFeatures], axis=1)

In [12]:
NumericalFeaturesNormalized = CustomerFeaturesNormalized.filter(like="Normalized_").values
SimilarityMatrix = cosine_similarity(NumericalFeaturesNormalized)

SimilarityDataFrame = pb.DataFrame(SimilarityMatrix, index=CustomerFeaturesNormalized.index, columns=CustomerFeaturesNormalized.index)

TopLookalikesFirst20 = {}
First20Customers = CustomerFeaturesNormalized.index[:20]

for CustomerId in First20Customers:
    SimilarCustomers = SimilarityDataFrame.loc[CustomerId].sort_values(ascending=False).iloc[1:4]
    TopLookalikesFirst20[CustomerId] = [{"CustomerID": OtherId, "Score": round(score, 2)} for OtherId, score in SimilarCustomers.items()]

LookalikeDataFirst20 = [{"CustomerID": Customer, "Lookalikes": str(TopLookalikesFirst20[Customer])} for Customer in TopLookalikesFirst20]
LookalikeDataFrameFirst20 = pb.DataFrame(LookalikeDataFirst20)

outputPathFirst20 = "Pratyush_Singh_Lookalike.csv"
LookalikeDataFrameFirst20.to_csv(outputPathFirst20, index=False)