In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Load the three raw input tables from the working directory.
# Order of the file names must match the order of the target variables.
customers_df, products_df, transactions_df = [
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
]

In [5]:
# Attach product attributes to every transaction (rows without a matching
# ProductID are dropped by the inner join). Both inputs carry a Price column,
# so pandas keeps them apart with its default _x / _y suffixes.
transactions_products = transactions_df.merge(products_df, how='inner', on='ProductID')

# Attach customer attributes; transactions without a matching CustomerID are dropped.
full_data = transactions_products.merge(customers_df, how='inner', on='CustomerID')

full_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Customer-level aggregates, each a Series indexed by CustomerID.
# The TotalValue grouping is built once and reused for both statistics.
total_value_by_customer = full_data.groupby('CustomerID')['TotalValue']

# Total revenue: sum of TotalValue over the customer's rows
customer_revenue = total_value_by_customer.sum()

# Average order value (AOV): mean TotalValue over the customer's rows
customer_aov = total_value_by_customer.mean()

# Transaction count: number of rows recorded for the customer
customer_transactions = full_data.groupby('CustomerID').size()

In [8]:
# Summed Quantity per customer (rows) and product category (columns);
# customer/category pairs with no purchases are filled with 0.
category_preferences = full_data.pivot_table(
    index='CustomerID',
    columns='Category',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Divide each row by its own total so the category columns of a customer sum to 1.
quantity_per_customer = category_preferences.sum(axis=1)
category_proportions = category_preferences.div(quantity_per_customer, axis=0)

In [9]:
# Column labels for the combined table: the three aggregate names first,
# followed by one column per product category.
feature_labels = ['TotalRevenue', 'AverageOrderValue', 'TransactionCount']
feature_labels = feature_labels + list(category_preferences.columns)

# Place all per-customer features side by side, aligned on CustomerID.
feature_parts = [customer_revenue, customer_aov, customer_transactions, category_proportions]
customer_features = pd.concat(feature_parts, axis=1)
customer_features.columns = feature_labels

In [10]:
# Standardise every feature column with scikit-learn's StandardScaler so the
# features are on a comparable scale before similarity is computed.
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features)

# fit_transform returns a bare array here; re-attach the CustomerID index and
# the feature names so rows can be looked up by customer.
customer_features_scaled_df = pd.DataFrame(
    data=customer_features_scaled,
    columns=customer_features.columns,
    index=customer_features.index,
)

In [14]:
# Example input customer for similarity matching
example_customer_id = customer_features.index[1]
example_customer_vector = customer_features_scaled_df.loc[example_customer_id].values.reshape(1, -1)

# Cosine similarity between the example customer and every customer (itself included)
similarity_scores = cosine_similarity(example_customer_vector, customer_features_scaled_df).flatten()

# Rank all customers from most to least similar, then remove the example
# customer by identity instead of assuming it always sorts into the top slot.
# That assumption fails when another customer ties at the maximum score
# (e.g. an identical feature vector) or when the example's scaled vector is
# all zeros, and would leave the example customer among its own lookalikes.
ranked_indices = similarity_scores.argsort()[::-1]
is_example_customer = customer_features_scaled_df.index == example_customer_id

# Finding top 3 similar customers (positions into customer_features_scaled_df)
top_similar_indices = ranked_indices[~is_example_customer[ranked_indices]][:3]
top_similar_customers = customer_features_scaled_df.iloc[top_similar_indices]
similarity_scores[top_similar_indices]

array([0.9092192 , 0.90747837, 0.89558195])

In [18]:
import csv

# Subset for the first 20 customers (C0001 - C0020)
target_customer_ids = [f'C{i:04d}' for i in range(1, 21)]
selected_customers = customers_df[customers_df['CustomerID'].isin(target_customer_ids)]
selected_customer_ids = selected_customers['CustomerID']

# Maps each selected CustomerID to a list of (lookalike CustomerID, similarity score)
# pairs, best match first.
lookalike_map = {}

for customer_id in selected_customer_ids:
    # The feature table is built from inner joins on the transactions, so a
    # customer listed in customers_df but without any transaction has no
    # feature row. Record an empty lookalike list for them instead of letting
    # .loc raise KeyError and abort the whole export.
    if customer_id not in customer_features_scaled_df.index:
        lookalike_map[customer_id] = []
        continue

    # Get the vector for the current customer
    customer_vector = customer_features_scaled_df.loc[customer_id].values.reshape(1, -1)

    # Calculate cosine similarity against every customer (itself included)
    similarity_scores = cosine_similarity(customer_vector, customer_features_scaled_df).flatten()

    # Exclude the customer itself and find the top 3 most similar customers
    similarity_scores_df = pd.DataFrame({
        'CustomerID': customer_features_scaled_df.index,
        'SimilarityScore': similarity_scores
    }).set_index('CustomerID')
    similarity_scores_df = similarity_scores_df.drop(index=customer_id).sort_values(by='SimilarityScore', ascending=False)

    top_3_similar = similarity_scores_df.head(3)
    lookalike_map[customer_id] = list(zip(top_3_similar.index, top_3_similar['SimilarityScore']))

# Create "Lookalike.csv" file: one row per selected customer, with the
# lookalikes serialised as the string form of a list of {CustomerID: score} dicts.
lookalike_file_path = 'Lookalike.csv'
with open(lookalike_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalikes'])  # Header

    for cust_id, lookalikes in lookalike_map.items():
        # Convert to a built-in float before rounding: round() on a NumPy
        # scalar returns a NumPy scalar, and str() of a list uses repr(), which
        # under NumPy >= 2 would write "np.float64(0.9092)" into the CSV.
        lookalikes_str = str([{l_id: round(float(score), 4)} for l_id, score in lookalikes])
        writer.writerow([cust_id, lookalikes_str])

lookalike_file_path

'Lookalike.csv'