In [1]:
import os

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Load the customer, product, and transaction datasets

In [4]:
# Folder holding the input CSVs. Set the ZEOTAP_DATA_DIR environment variable to
# run the notebook on another machine; the original local folder stays the default
# so existing runs are unaffected.
data_dir = os.environ.get("ZEOTAP_DATA_DIR", r"C:\Users\rakeek mirza\OneDrive\Desktop\zephr\zephyrprojects\Zeotap")
# Customer master data, read from Customers.csv in that folder.
customers = pd.read_csv(os.path.join(data_dir, "Customers.csv"))
customers

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15
...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07
196,C0197,Christina Harvey,Europe,2023-03-21
197,C0198,Rebecca Ray,Europe,2022-02-27
198,C0199,Andrea Jenkins,Europe,2022-12-03


In [5]:
# Folder holding the input CSVs. Set the ZEOTAP_DATA_DIR environment variable to
# run the notebook on another machine; the original local folder stays the default
# so existing runs are unaffected.
data_dir = os.environ.get("ZEOTAP_DATA_DIR", r"C:\Users\rakeek mirza\OneDrive\Desktop\zephr\zephyrprojects\Zeotap")
# Product catalogue, read from Products.csv in that folder.
products = pd.read_csv(os.path.join(data_dir, "Products.csv"))
products

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.30
1,P002,ActiveWear Smartwatch,Electronics,346.30
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31
...,...,...,...,...
95,P096,SoundWave Headphones,Electronics,307.47
96,P097,BookWorld Cookbook,Books,319.34
97,P098,SoundWave Laptop,Electronics,299.93
98,P099,SoundWave Mystery Book,Books,354.29


In [6]:
# Folder holding the input CSVs. Set the ZEOTAP_DATA_DIR environment variable to
# run the notebook on another machine; the original local folder stays the default
# so existing runs are unaffected.
data_dir = os.environ.get("ZEOTAP_DATA_DIR", r"C:\Users\rakeek mirza\OneDrive\Desktop\zephr\zephyrprojects\Zeotap")
# Transaction log, read from Transactions.csv in that folder.
transactions = pd.read_csv(os.path.join(data_dir, "Transactions.csv"))
transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86


In [7]:
#merging datasets

In [8]:
# Enrich each transaction with its customer's attributes, then with its product's
# attributes. Both joins are inner joins (the pandas default), so a transaction
# whose CustomerID or ProductID has no match is dropped.
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='inner')
merged_data = transactions_with_customers.merge(products, on='ProductID', how='inner')

In [9]:
#feature engineering

In [10]:
def _join_categories(categories):
    """Concatenate one customer's purchased category names, separated by spaces."""
    return ' '.join(categories)


# How each merged column collapses to a single per-customer value:
# spend and quantity are totalled, categories are concatenated, and the
# customer-level attributes (Region, SignupDate) are taken from the first row.
aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Category': _join_categories,
    'Region': 'first',
    'SignupDate': 'first',
}

# One row per CustomerID, with CustomerID restored as an ordinary column.
customer_features = merged_data.groupby('CustomerID').agg(aggregations).reset_index()

In [None]:
# Encode Region with LabelEncoder and convert SignupDate to an ordinal day number

In [11]:
# Replace each Region string with the integer code LabelEncoder assigns to it.
# The codes are arbitrary identifiers and carry no ordering between regions.
encoder = LabelEncoder()
region_codes = encoder.fit_transform(customer_features['Region'])
customer_features['Region'] = region_codes

# Turn the signup date into a single number: the proleptic Gregorian ordinal
# (day count) of each parsed timestamp.
signup_timestamps = pd.to_datetime(customer_features['SignupDate'])
customer_features['SignupDate'] = signup_timestamps.map(pd.Timestamp.toordinal)

In [12]:
# Standardize the numeric features with StandardScaler

In [13]:
# Columns that are rescaled; Region is deliberately not in this list.
numeric_features = ['TotalValue', 'Quantity', 'SignupDate']

# Fit the scaler on these columns and overwrite them with the standardized values.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_features])
customer_features[numeric_features] = scaled_values

In [None]:
#Creating customer similarity matrix

In [14]:
# Columns that describe a customer for the lookalike comparison.
features = ['TotalValue', 'Quantity', 'Region', 'SignupDate']

# Region holds arbitrary integer codes, not a quantity. Feeding the raw code into
# cosine similarity would treat code 3 as "three times" code 1 and, being unscaled,
# let it outweigh the standardized columns. Expand it into one 0/1 indicator column
# per region so customers simply match or do not match on region.
feature_matrix = pd.get_dummies(customer_features[features], columns=['Region'], dtype=float)

# Pairwise cosine similarity between customers; row/column order follows customer_features.
similarity_matrix = cosine_similarity(feature_matrix)

In [None]:
#top 3 lookalikes for each customer.

In [16]:
# Build {customer_id: [(lookalike_id, score), ...]} with the three most similar
# other customers, best match first.
#
# The customer's own row is removed by position before ranking. Relying on it to
# sort first (and slicing it off) is unsafe: when another customer ties with the
# self-similarity score, or rounding puts a neighbour marginally above it, the
# customer would be reported as their own lookalike and a real match dropped.
#
# IDs are looked up by position so the result does not depend on the frame's index labels.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids):
    similarities = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # list.sort is stable, so equal scores keep their original customer order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    top_3 = [(customer_ids[other_idx], score) for other_idx, score in similarities[:3]]
    lookalike_results[customer_id] = top_3

In [17]:
# Restrict the results to the customers listed in the first 20 rows of `customers`.
first_20_ids = set(customers['CustomerID'][:20].values)
lookalike_subset = {
    cust_id: matches
    for cust_id, matches in lookalike_results.items()
    if cust_id in first_20_ids
}

In [None]:
# Save the lookalike results to Lookalike.csv

In [18]:
# Flatten {customer: [(lookalike, score), ...]} into one record per
# (customer, lookalike) pair.
output_data = [
    {'cust_id': cust_id, 'lookalike_cust_id': lookalike_id, 'score': score}
    for cust_id, lookalikes in lookalike_subset.items()
    for lookalike_id, score in lookalikes
]

# Write the records to Lookalike.csv in the working directory, without the index column.
lookalike_df = pd.DataFrame(output_data)
lookalike_df.to_csv('Lookalike.csv', index=False)