In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import csv

# Data Preprocessing

In [28]:
# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [29]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Feature Engineering

In [3]:
customer_profiles = customers[['CustomerID', 'Region']]
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'])

In [30]:
current_date = transactions['TransactionDate'].max()
customers['SignupRecency'] = (current_date - customers['SignupDate']).dt.days
customer_profiles = customer_profiles.merge(customers[['CustomerID', 'SignupRecency']], on='CustomerID')

In [33]:
#Transaction features insights
customer_spend = transactions.groupby('CustomerID')['TotalValue'].sum().reset_index()
customer_spend.rename(columns={'TotalValue': 'TotalSpend'}, inplace=True)
customer_spend.head()

Unnamed: 0,CustomerID,TotalSpend
0,C0001,3354.52
1,C0002,1862.74
2,C0003,2725.38
3,C0004,5354.88
4,C0005,2034.24


In [34]:
customer_frequency = transactions.groupby('CustomerID').size().reset_index(name='PurchaseFrequency')
customer_frequency.head()

Unnamed: 0,CustomerID,PurchaseFrequency
0,C0001,5
1,C0002,4
2,C0003,4
3,C0004,8
4,C0005,3


In [35]:
customer_avg_order = transactions.groupby('CustomerID')['TotalValue'].mean().reset_index()
customer_avg_order.rename(columns={'TotalValue': 'AvgOrderValue'}, inplace=True)
customer_avg_order.head()

Unnamed: 0,CustomerID,AvgOrderValue
0,C0001,670.904
1,C0002,465.685
2,C0003,681.345
3,C0004,669.36
4,C0005,678.08


In [37]:
customer_categories = transactions_products.groupby(['CustomerID', 'Category']).size().unstack(fill_value=0)
customer_categories.reset_index(inplace=True)
customer_categories.head()

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,1,0,3,1
1,C0002,0,2,0,2
2,C0003,0,1,1,2
3,C0004,3,0,2,3
4,C0005,0,0,2,1


In [38]:
# Combine Features

customer_features = customer_profiles.copy()

# Merge transaction features
customer_features = customer_features.merge(customer_spend, on='CustomerID', how='left')
customer_features = customer_features.merge(customer_frequency, on='CustomerID', how='left')
customer_features = customer_features.merge(customer_avg_order, on='CustomerID', how='left')

# Merge categories
customer_features = customer_features.merge(customer_categories, on='CustomerID', how='left')

# Replace NaN values with zeros
customer_features.fillna(0, inplace=True)

# Set CustomerID as index
customer_features.set_index('CustomerID', inplace=True)

# Data Normalization

In [40]:
scaler = MinMaxScaler()
customer_features_scaled = pd.DataFrame(scaler.fit_transform(customer_features), columns=customer_features.columns, index=customer_features.index)

## Calculating cosine similarity between customers

In [47]:
# Compute the similarity matrix
similarity_matrix = pd.DataFrame(cosine_similarity(customer_features_scaled), index=customer_features_scaled.index, columns=customer_features_scaled.index)

## Get the top 3 Lookalikes 

In [42]:
# 6. Generate Top 3 Lookalikes for Each Customer
customer_ids = ['C{:04d}'.format(i) for i in range(1, 21)]
lookalike_map = {}

for cust_id in customer_ids:
    if cust_id in similarity_matrix.index:
        sim_scores = similarity_matrix.loc[cust_id]
        sim_scores = sim_scores.drop(index=cust_id)
        top_similars = sim_scores.nlargest(3)
        similars_list = list(top_similars.items())
        lookalike_map[cust_id] = similars_list
    else:
        lookalike_map[cust_id] = []

In [48]:
# Open the CSV file for writing
with open('Varun_Goyal_Lookalike.csv', 'w', newline='') as csvfile:
    fieldnames = ['CustomerID', 'Lookalikes']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for cust_id, similars in lookalike_map.items():
        # Convert list of tuples to string representation
        similars_str = str(similars)
        writer.writerow({'CustomerID': cust_id, 'Lookalikes': similars_str})

In [49]:
# Example: Get lookalikes for a specific customer For eg: C0001
cust_id = 'C0001'
if cust_id in lookalike_map:
    similars = lookalike_map[cust_id]
    print(f"Top 3 lookalikes for {cust_id}:")
    for similar_cust_id, score in similars:
        print(f"CustomerID: {similar_cust_id}, Similarity Score: {score:.4f}")
else:
    print(f"{cust_id} not found in the customer data.")

Top 3 lookalikes for C0001:
CustomerID: C0112, Similarity Score: 0.9837
CustomerID: C0192, Similarity Score: 0.9806
CustomerID: C0118, Similarity Score: 0.9780


In [50]:
# Compare profiles
customer_features.loc[[cust_id] + [s[0] for s in similars]]

Unnamed: 0_level_0,Region_Asia,Region_Europe,Region_North America,Region_South America,SignupRecency_x,SignupRecency_y,TotalSpend,PurchaseFrequency,AvgOrderValue,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
C0001,False,False,False,True,902,902,3354.52,5.0,670.904,1.0,0.0,3.0,1.0
C0112,False,False,False,True,905,905,1959.51,3.0,653.17,1.0,0.0,2.0,0.0
C0192,False,False,False,True,835,835,2072.72,4.0,518.18,1.0,1.0,2.0,0.0
C0118,False,False,False,True,1071,1071,3434.77,6.0,572.461667,2.0,1.0,2.0,1.0
