In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')




In [10]:
# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [11]:
#Feature Engineering
# Aggregate features for each customer
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',   # Total spending
    'Quantity': 'sum',     # Total quantity purchased
    'Region': lambda x: x.mode()[0],  # Most frequent region
    'Category': lambda x: ' '.join(x)  # Categories purchased
}).reset_index()

# Encode categorical variables (Region and Category)
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
category_matrix = vectorizer.fit_transform(customer_features['Category'])
category_df = pd.DataFrame(category_matrix.toarray(), columns=vectorizer.get_feature_names_out())
customer_features = pd.concat([customer_features, category_df], axis=1).drop('Category', axis=1)

print(customer_features.head())

  CustomerID  TotalValue  Quantity  Region_Europe  Region_North America  \
0      C0001     3354.52        12          False                 False   
1      C0002     1862.74        10          False                 False   
2      C0003     2725.38        14          False                 False   
3      C0004     5354.88        23          False                 False   
4      C0005     2034.24         7          False                 False   

   Region_South America  books  clothing  decor  electronics  home  
0                  True      1         0      1            3     1  
1                 False      0         2      2            0     2  
2                  True      0         1      2            1     2  
3                  True      3         0      3            2     3  
4                 False      0         0      1            2     1  


In [12]:
#Calculating Similarity

# Prepare feature matrix
feature_matrix = customer_features.drop('CustomerID', axis=1)

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.999997  0.999998  0.999999  1.000000  0.999999   
C0002       0.999997  1.000000  0.999999  0.999998  0.999997  0.999996   
C0003       0.999998  0.999999  1.000000  0.999999  0.999998  0.999997   
C0004       0.999999  0.999998  0.999999  1.000000  0.999999  0.999999   
C0005       1.000000  0.999997  0.999998  0.999999  1.000000  0.999999   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       1.000000  0.999999  0.999997  0.999992  ...  1.000000  1.000000   
C0002       0.999996  0.999999  0.999995  0.999997  ...  0.999996  0.999997   
C0003       0.999998  1.000000  0.999995  0.999996  ...  0.999998  0.999998   
C0004       0.999999  1.000000  0.999996  0.999994  ...  0.999999  0.999999   
C0005  

In [14]:
# Generate recommendations for first 20 customers
recommendations = {}
for customer in customer_features['CustomerID'][:20]:
    similar_customers = similarity_df[customer].sort_values(ascending=False)[1:4]  # Top 3 excluding itself
    recommendations[customer] = [{'CustomerID': cust, 'Score': score} for cust, score in similar_customers.items()]

# Convert recommendations to DataFrame
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
recommendations_df.index.name = 'CustomerID'
recommendations_df.reset_index(inplace=True)

# Save to CSV
recommendations_df.to_csv('Sudeep_V_Lookalike.csv', index=False)
print(recommendations_df.head())

  CustomerID                                         Lookalike1  \
0      C0001  {'CustomerID': 'C0005', 'Score': 0.99999986048...   
1      C0002  {'CustomerID': 'C0134', 'Score': 0.99999970988...   
2      C0003  {'CustomerID': 'C0031', 'Score': 0.99999989325...   
3      C0004  {'CustomerID': 'C0113', 'Score': 0.99999992457...   
4      C0005  {'CustomerID': 'C0007', 'Score': 0.99999990974...   

                                          Lookalike2  \
0  {'CustomerID': 'C0045', 'Score': 0.99999985983...   
1  {'CustomerID': 'C0043', 'Score': 0.99999955323...   
2  {'CustomerID': 'C0113', 'Score': 0.99999971528...   
3  {'CustomerID': 'C0017', 'Score': 0.99999990773...   
4  {'CustomerID': 'C0127', 'Score': 0.99999988591...   

                                          Lookalike3  
0  {'CustomerID': 'C0146', 'Score': 0.99999982811...  
1  {'CustomerID': 'C0031', 'Score': 0.99999950729...  
2  {'CustomerID': 'C0086', 'Score': 0.99999971361...  
3  {'CustomerID': 'C0039', 'Score': 0.99