In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables (customers, products, transactions) from the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Join every transaction with its customer row and its product row for deeper analysis
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [4]:
# Per-customer features: total spend, number of distinct transactions, total units bought
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        TransactionID=('TransactionID', 'nunique'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)

In [5]:
# Standardize the three numeric feature columns so they share a common scale
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(
    customer_features[
        ['TotalValue', 'TransactionID', 'Quantity']
    ]
)

In [6]:
# Pairwise cosine similarity between every pair of customers' scaled feature vectors
similarity_matrix = cosine_similarity(
    X=customer_features_scaled,
)

In [7]:
# Function to get the top-N most similar customers for a given customer
def get_top_lookalikes(customer_id, similarity_matrix, top_n=3, features=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id :
        Value to look up in the ``CustomerID`` column of ``features``.
    similarity_matrix :
        Square, row-indexable matrix in which row/column ``i`` corresponds to
        the ``i``-th row (by position) of ``features``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Fewer are returned when there
        are not enough other customers; values <= 0 yield an empty list.
    features : pandas.DataFrame, optional
        Frame holding a ``CustomerID`` column aligned with
        ``similarity_matrix``. Defaults to the notebook-level
        ``customer_features`` frame.

    Returns
    -------
    list[tuple]
        ``(CustomerID, similarity)`` pairs, highest similarity first. The
        queried customer itself is never included.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``features['CustomerID']``.
    """
    if features is None:
        features = customer_features

    # Locate the customer by *position*: the similarity matrix is positional,
    # so a pandas index label is only correct for a default RangeIndex.
    matches = np.flatnonzero((features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in features")
    customer_pos = int(matches[0])

    similarities = np.asarray(similarity_matrix[customer_pos])

    # Drop the customer's own entry explicitly. Relying on it being the single
    # largest value fails when another customer ties (or, through rounding,
    # exceeds) the self-similarity, which would return the customer as its own
    # lookalike and discard a genuine match.
    candidates = np.delete(np.arange(similarities.shape[0]), customer_pos)

    # Stable sort on the negated scores: descending similarity, ties broken by
    # original row order so the result is deterministic.
    ranked = candidates[np.argsort(-similarities[candidates], kind='stable')]
    top_positions = ranked[:max(int(top_n), 0)]

    top_customers = features['CustomerID'].iloc[top_positions].tolist()
    top_scores = similarities[top_positions].tolist()
    return list(zip(top_customers, top_scores))

In [8]:
# Map each of the first 20 customers to its top 3 most similar customers
lookalike_results = {
    cid: get_top_lookalikes(cid, similarity_matrix)
    for cid in customer_features['CustomerID'].head(20)
}

In [12]:
# Emit the header followed by one line per customer in a single print call
print(
    "\nTop 3 similar Customers for the first 20 Customers:",
    *(f"Customer {cid}: {matches}" for cid, matches in lookalike_results.items()),
    sep="\n",
)


Top 3 similar Customers for the first 20 Customers:
Customer C0001: [('C0164', 0.9975983724345929), ('C0103', 0.9953941278099472), ('C0069', 0.9860733836106332)]
Customer C0002: [('C0029', 0.9997537292562625), ('C0031', 0.9989862771052472), ('C0077', 0.994312726345747)]
Customer C0003: [('C0176', 0.9029496971526534), ('C0027', 0.8751211681188946), ('C0010', 0.8329653020982932)]
Customer C0004: [('C0075', 0.9977891220712191), ('C0165', 0.9944415111703248), ('C0113', 0.993976488501688)]
Customer C0005: [('C0123', 0.9997808088942964), ('C0131', 0.9996277242039578), ('C0058', 0.9995605582180777)]
Customer C0006: [('C0079', 0.9998815163910635), ('C0117', 0.989524860418253), ('C0196', 0.9452522588894805)]
Customer C0007: [('C0125', 0.9980316823166858), ('C0140', 0.9979601864953596), ('C0092', 0.9979042974085808)]
Customer C0008: [('C0179', 0.9981992905794519), ('C0081', 0.9950291061097487), ('C0084', 0.9929308812562458)]
Customer C0009: [('C0192', 0.9987781413090145), ('C0083', 0.9957180831

In [9]:
# One row per customer, one column per lookalike rank; each cell is an (id, score) tuple
lookalike_df = pd.DataFrame(data=lookalike_results).transpose()
# Persist the table, with customer IDs as the index column
lookalike_df.to_csv(path_or_buf='Lookalike.csv')