# Task 2: Using the KNN algorithm to find 3 lookalike customers for each of the first 20 customers

In [42]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [45]:
# Reading the relevant data according to the task.
# The dataset directory is resolved in one place instead of being repeated in every path.
# It defaults to the original author's folder; set the LOOKALIKE_DATA_DIR environment
# variable to point the notebook at Customers.csv / Transactions.csv on another machine.
DATA_DIR = Path(os.environ.get('LOOKALIKE_DATA_DIR', 'C:\\Users\\palla\\Desktop\\Zeotap_intern\\Datasets'))

customers_df = pd.read_csv(DATA_DIR / 'Customers.csv')
transactions_df = pd.read_csv(DATA_DIR / 'Transactions.csv')
customers_df.head()


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [46]:
# Join every transaction row to its customer's profile row (matched on CustomerID);
# the inner join keeps only transactions whose CustomerID also appears in customers_df.
merged_df = transactions_df.merge(customers_df, how='inner', on='CustomerID')

In [47]:
# Sanity check on the merge: print (rows, columns), then preview the first 10 merged rows
print(merged_df.shape)
merged_df.head(10)

(1000, 10)


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15
5,T00442,C0188,P067,2024-12-26 14:40:03,1,300.68,300.68,Anna Ball,South America,2022-05-17
6,T00490,C0195,P067,2024-11-24 11:49:48,3,902.04,300.68,Jeremy Mclaughlin,South America,2024-09-17
7,T00536,C0008,P067,2024-09-22 06:13:59,1,300.68,300.68,David Li,North America,2024-01-13
8,T00564,C0157,P067,2024-12-07 17:57:40,3,902.04,300.68,Miguel Wong,North America,2024-01-30
9,T00631,C0130,P067,2024-05-14 23:14:59,2,601.36,300.68,Robert Jones,South America,2023-04-19


In [48]:
# Collapsing the merged transactions to one row per CustomerID.
# Each entry maps an output column to the (source column, aggregation) that produces it.
spending_summary = {
    'total_spending': pd.NamedAgg(column='TotalValue', aggfunc='sum'),          # total amount spent
    'transaction_count': pd.NamedAgg(column='TransactionID', aggfunc='count'),  # number of transactions
    'avg_spending': pd.NamedAgg(column='TotalValue', aggfunc='mean'),           # mean value per transaction
}
per_customer = merged_df.groupby('CustomerID')
agg_data = per_customer.agg(**spending_summary).reset_index()

# Preview of the per-customer aggregates
agg_data.head()


Unnamed: 0,CustomerID,total_spending,transaction_count,avg_spending
0,C0001,3354.52,5,670.904
1,C0002,1862.74,4,465.685
2,C0003,2725.38,4,681.345
3,C0004,5354.88,8,669.36
4,C0005,2034.24,3,678.08


In [49]:
# Attach each customer's Region to the per-customer spending aggregates
customer_profile = customers_df.loc[:, ['Region', 'CustomerID']]
customer_data = agg_data.merge(customer_profile, how='inner', on='CustomerID')
customer_data.head()

Unnamed: 0,CustomerID,total_spending,transaction_count,avg_spending,Region
0,C0001,3354.52,5,670.904,South America
1,C0002,1862.74,4,465.685,Asia
2,C0003,2725.38,4,681.345,South America
3,C0004,5354.88,8,669.36,South America
4,C0005,2034.24,3,678.08,Asia


In [50]:
# Standardizing the numerical features so that each one contributes on a comparable scale
# to the distance calculations used by KNN
scaler = StandardScaler()


In [51]:
# Columns that feed the similarity calculation
numerical_features = ['total_spending', 'transaction_count', 'avg_spending']

# Fit the scaler on these columns and write the standardized values back over the raw ones.
# NOTE: customer_data is overwritten here, so the unscaled aggregates only survive in agg_data.
standardized_values = scaler.fit_transform(customer_data[numerical_features])
customer_data[numerical_features] = standardized_values
customer_data.head()

Unnamed: 0,CustomerID,total_spending,transaction_count,avg_spending,Region
0,C0001,-0.061701,-0.011458,-0.070263,South America
1,C0002,-0.877744,-0.467494,-0.934933,Asia
2,C0003,-0.405857,-0.467494,-0.026271,South America
3,C0004,1.032547,1.35665,-0.076769,South America
4,C0005,-0.783929,-0.92353,-0.040028,Asia


In [52]:
# Initializing the nearest-neighbours search with K=4 and Euclidean distance.
# The model is later queried with the same customers it was fitted on, so each customer
# normally comes back as its own closest match; asking for 4 neighbours leaves
# 3 genuine lookalikes once that self-match is discarded.
knn = NearestNeighbors(n_neighbors=4, metric='euclidean')  # K = 3 lookalikes + 1 slot for the customer itself

In [53]:
# Feature matrix for the neighbour search: one row per customer, same row order as customer_data
X = customer_data[numerical_features]  # Using only numerical features for similarity calculation (Region is not part of the distance)
knn.fit(X)

In [54]:
# Finding the nearest neighbours for each customer by querying the fitted model with X itself.
# The result is used downstream as a pair: neighbors[0] holds the distances and neighbors[1]
# the row indices. With n_neighbors=4 every row has 4 entries: normally the customer itself
# plus its 3 nearest other customers.
neighbors = knn.kneighbors(X)  

In [66]:
# Creating a Lookalike Map with CustomerID, Nearest Customer IDs, and Similarity Scores
# neighbors is a (distances, indices) pair; both arrays have one row per customer and
# one column per requested neighbour.
neighbor_distances, neighbor_indices = neighbors

# One of the requested columns is reserved for the customer itself; the rest are lookalikes.
n_lookalikes = neighbor_indices.shape[1] - 1

knn_lookalike_map = {}

for i, cust_id in enumerate(customer_data['CustomerID'][:20]): #Only for first 20 customers
    # Exclude the customer's own row by index rather than assuming it sits in column 0.
    # When several customers share identical feature values the zero-distance tie means the
    # customer is not guaranteed to be returned first, and blindly dropping column 0 could
    # discard a real lookalike while keeping the customer as its own match.
    not_self = neighbor_indices[i] != i
    nearest_indices = neighbor_indices[i][not_self][:n_lookalikes]
    nearest_distances = neighbor_distances[i][not_self][:n_lookalikes]

    # Neighbors indices are positional row indices in the customer_data DataFrame
    nearest_customers = customer_data.iloc[nearest_indices]

    # Calculating the similarity scores (converting distance to similarity: 1 / (1 + distance)),
    # so a distance of 0 maps to 1.0 and larger distances approach 0
    similarities = 1 / (1 + nearest_distances)

    # Storing the customer ID, the nearest customer IDs, and their corresponding similarity scores
    knn_lookalike_map[cust_id] = list(zip(nearest_customers['CustomerID'], similarities))


In [67]:
# Flatten the lookalike map into one [customer, lookalike, score] row per pair
# and persist it as 'Lookalike.csv'.
lookalike_list = [
    [cust_id, similar_cust_id, score]
    for cust_id, similar_customers in knn_lookalike_map.items()
    for similar_cust_id, score in similar_customers
]

# Tabular form of the flattened rows
output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_list, columns=output_columns)

# Write the table to disk without the positional index column
lookalike_df.to_csv('Lookalike.csv', index=False)

In [68]:
# Sample view of the rows written to Lookalike.csv (first 10)
lookalike_df.head(10)

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0137,0.978404
1,C0001,C0152,0.969475
2,C0001,C0056,0.852966
3,C0002,C0029,0.91639
4,C0002,C0199,0.878485
5,C0002,C0031,0.855211
6,C0003,C0178,0.980735
7,C0003,C0035,0.859459
8,C0003,C0146,0.84497
9,C0004,C0021,0.952936
