In [2]:
import pandas as pd

In [3]:
# Load the three source tables (customers, products, transactions).
# The data directory is defined once so the notebook can be pointed at another
# location by editing a single line instead of three hard-coded paths.
DATA_DIR = "F:/ZEOTAP_DataScience_Assignment"
customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [4]:
# Join every transaction to its customer and product attributes.
# validate="many_to_one" makes pandas raise a MergeError if CustomerID or ProductID
# is duplicated in its lookup table; without the check such duplicates would
# silently duplicate transaction rows and inflate any later TotalValue sums.
# The result carries both Price_x and Price_y (pandas' default suffixes for the
# Price column that overlaps in the product merge).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', validate='many_to_one')
    .merge(products, on='ProductID', validate='many_to_one')
)
print(merged_data.head())

  TransactionID CustomerID ProductID TransactionDate  Quantity  TotalValue  \
0        T00001      C0199      P067  25-08-24 12:38         1      300.68   
1        T00112      C0146      P067  27-05-24 22:23         1      300.68   
2        T00166      C0127      P067   25-04-24 7:38         1      300.68   
3        T00272      C0087      P067  26-03-24 22:55         2      601.36   
4        T00363      C0070      P067  21-03-24 15:10         3      902.04   

   Price_x     CustomerName         Region  SignupDate  \
0   300.68   Andrea Jenkins         Europe  2022-12-03   
1   300.68  Brittany Harvey           Asia  2024-09-04   
2   300.68  Kathryn Stevens         Europe  2024-04-04   
3   300.68  Travis Campbell  South America  2024-04-11   
4   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker  Electronics   300.68  
2  Co

In [18]:
# Total spend per (CustomerID, Region) pair, returned as a flat frame
customer_spending = (
    merged_data
    .groupby(['CustomerID', 'Region'], as_index=False)['TotalValue']
    .sum()
)
print("Customer Spending Before Normalizing: \n", customer_spending.head())

# Rescale TotalValue with min-max scaling, overwriting the raw totals in place
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(customer_spending[['TotalValue']])
customer_spending['TotalValue'] = scaler.transform(customer_spending[['TotalValue']])
print("Customer Spending After Normalizing: \n", customer_spending.head())

Customer Spending Before Normalizing: 
   CustomerID         Region  TotalValue
0      C0001  South America     3354.52
1      C0002           Asia     1862.74
2      C0003  South America     2725.38
3      C0004  South America     5354.88
4      C0005           Asia     2034.24
Customer Spending After Normalizing: 
   CustomerID         Region  TotalValue
0      C0001  South America    0.308942
1      C0002           Asia    0.168095
2      C0003  South America    0.249541
3      C0004  South America    0.497806
4      C0005           Asia    0.184287


In [17]:
from sklearn.neighbors import NearestNeighbors

# Use nearest neighbours on the TotalValue feature to find similar customers.
knn = NearestNeighbors(n_neighbors=3)
knn.fit(customer_spending[['TotalValue']])

# Call kneighbors() WITHOUT a query matrix: scikit-learn then returns the
# neighbours of each fitted row and does not count a row as its own neighbour.
# Querying with the training rows themselves (as before) returned every customer
# as their own zero-distance match, so Lookalike1 was effectively the customer
# and only two genuine lookalikes were reported.
distances, indices = knn.kneighbors()

# Turn distances into similarity scores in (0, 1]: 1.0 means identical spend and
# the score falls towards 0 as the distance grows. Raw distances run the other
# way (smaller = more similar), which contradicted the "Similarity" column names.
similarities = 1.0 / (1.0 + distances)

# Row i of `indices` holds the positions (within customer_spending) of the three
# customers closest to customer i, nearest first.
customer_ids = customer_spending['CustomerID'].to_numpy()
lookalike_df = pd.DataFrame({
    'CustomerID': customer_spending['CustomerID'],
    'Lookalike1': customer_ids[indices[:, 0]],
    'Lookalike2': customer_ids[indices[:, 1]],
    'Lookalike3': customer_ids[indices[:, 2]],
    'Similarity1': similarities[:, 0],
    'Similarity2': similarities[:, 1],
    'Similarity3': similarities[:, 2]
})

# Save the top 3 lookalikes with similarity scores for the first 20 customers
lookalike_df.head(20).to_csv('Soham_Nandy_Lookalike.csv', index=False)
