# Lookalike Model

This notebook outlines the process of building a lookalike model that takes user's information as input and recommends 3 similar customers based on their profile and transaction history. The model will use both customer and product information, and assign a similarity score to each recommended customer.

## Steps in the Process:
1. **Data Loading and Exploration**: Load and explore the dataset to understand customer behavior.
2. **Feature Engineering**: Create relevant features to define customer similarities.
3. **Compute Similarity**: Use cosine similarity to identify customers with similar profiles.
4. **Generate Recommendations**: For selected customers, generate a list of similar customer profiles.
5. **Save Recommendations**: Export the recommendations for further analysis or application.

In [1]:
#importing necessary libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import datetime as dt

## Load Datasets

In [2]:
# load datasets
customers = pd.read_csv('../datasets/customers.csv')
products = pd.read_csv('../datasets/products.csv')
transactions = pd.read_csv('../datasets/transactions.csv')

In [3]:
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


## Feature Engineering

In [6]:
# build content features
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
reference_date = pd.to_datetime(dt.date.today())
customers['Account_Age_Days'] = (reference_date - customers['SignupDate']).dt.days

In [7]:
# one-hot encode region features
region_dummies = pd.get_dummies(customers['Region'], prefix='Region')
customers = pd.concat([customers, region_dummies], axis=1)

customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Account_Age_Days,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,South America,2022-07-10,968,False,False,False,True
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1115,True,False,False,False
2,C0003,Michael Rivera,South America,2024-03-07,362,False,False,False,True
3,C0004,Kathleen Rodriguez,South America,2022-10-09,877,False,False,False,True
4,C0005,Laura Weber,Asia,2022-08-15,932,True,False,False,False


In [8]:
# aggregate basic transaction features per customer
agg_trans = transactions.groupby('CustomerID').agg(
    Total_Transactions = ('TransactionID', 'count'),
    Total_Items_Purchased = ('Quantity', 'sum'),
    Total_Spend = ('TotalValue', 'sum'),
    Avg_Transaction_Value = ('TotalValue', 'mean'),
    Last_Purchase = ('TransactionDate', 'max')
    ).reset_index()

In [9]:
# Calculate recency (days since last purchase)
agg_trans['Recency_Days'] = (reference_date - pd.to_datetime(agg_trans['Last_Purchase'])).dt.days
agg_trans.head()

Unnamed: 0,CustomerID,Total_Transactions,Total_Items_Purchased,Total_Spend,Avg_Transaction_Value,Last_Purchase,Recency_Days
0,C0001,5,12,3354.52,670.904,2024-11-02 17:04:16,121
1,C0002,4,10,1862.74,465.685,2024-12-03 1:41:41,90
2,C0003,4,14,2725.38,681.345,2024-08-24 18:54:04,191
3,C0004,8,23,5354.88,669.36,2024-12-23 14:13:52,70
4,C0005,3,7,2034.24,678.08,2024-11-04 0:30:22,119


The pivot table is created to summarize transaction data by product category. This allows us to see how many items each customer purchased in each category.

In [10]:
# Category preference features
trans_prod = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')
cat_pivot = pd.pivot_table(trans_prod, 
                           index='CustomerID', 
                           columns='Category', 
                           values='Quantity', 
                           aggfunc='sum', fill_value=0).reset_index()

cat_pivot.head()

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,2,0,7,3
1,C0002,0,4,0,6
2,C0003,0,4,4,6
3,C0004,8,0,6,9
4,C0005,0,0,4,3


In [11]:
# merge all features to create full customer feature set
content = customers.merge(agg_trans, on='CustomerID', how='left').merge(cat_pivot, on='CustomerID', how='left')
content.fillna(0, inplace=True)

content.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Account_Age_Days,Region_Asia,Region_Europe,Region_North America,Region_South America,Total_Transactions,Total_Items_Purchased,Total_Spend,Avg_Transaction_Value,Last_Purchase,Recency_Days,Books,Clothing,Electronics,Home Decor
0,C0001,Lawrence Carroll,South America,2022-07-10,968,False,False,False,True,5.0,12.0,3354.52,670.904,2024-11-02 17:04:16,121.0,2.0,0.0,7.0,3.0
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1115,True,False,False,False,4.0,10.0,1862.74,465.685,2024-12-03 1:41:41,90.0,0.0,4.0,0.0,6.0
2,C0003,Michael Rivera,South America,2024-03-07,362,False,False,False,True,4.0,14.0,2725.38,681.345,2024-08-24 18:54:04,191.0,0.0,4.0,4.0,6.0
3,C0004,Kathleen Rodriguez,South America,2022-10-09,877,False,False,False,True,8.0,23.0,5354.88,669.36,2024-12-23 14:13:52,70.0,8.0,0.0,6.0,9.0
4,C0005,Laura Weber,Asia,2022-08-15,932,True,False,False,False,3.0,7.0,2034.24,678.08,2024-11-04 0:30:22,119.0,0.0,0.0,4.0,3.0


In [12]:
# select feature columns
base_features = ['Account_Age_Days', 'Total_Transactions', 'Total_Items_Purchased', 'Total_Spend', 'Recency_Days']

region_features = [col for col in content.columns if col.startswith('Region_')]

category_features = list(cat_pivot.columns.drop('CustomerID'))

feature_cols = base_features + region_features + category_features
content_features = content[['CustomerID'] + feature_cols].copy()

content_features.head()

Unnamed: 0,CustomerID,Account_Age_Days,Total_Transactions,Total_Items_Purchased,Total_Spend,Recency_Days,Region_Asia,Region_Europe,Region_North America,Region_South America,Books,Clothing,Electronics,Home Decor
0,C0001,968,5.0,12.0,3354.52,121.0,False,False,False,True,2.0,0.0,7.0,3.0
1,C0002,1115,4.0,10.0,1862.74,90.0,True,False,False,False,0.0,4.0,0.0,6.0
2,C0003,362,4.0,14.0,2725.38,191.0,False,False,False,True,0.0,4.0,4.0,6.0
3,C0004,877,8.0,23.0,5354.88,70.0,False,False,False,True,8.0,0.0,6.0,9.0
4,C0005,932,3.0,7.0,2034.24,119.0,True,False,False,False,0.0,0.0,4.0,3.0


## Compute Similarity
Content-based filtering is used to generate recommendations based on the features of items or users rather than relying on user interactions or preferences

In [13]:
# compute content-based similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

scaler = StandardScaler()
feat = content_features.columns.drop('CustomerID')
scaled_features = scaler.fit_transform(content_features[feat])
content_sim = cosine_similarity(scaled_features)
content_sim_df = pd.DataFrame(content_sim, index=content_features['CustomerID'], columns=content_features['CustomerID'])

content_sim_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.044462,0.384633,0.480158,0.211053,0.127723,0.197302,-0.219682,-0.134063,-0.369266,...,0.264188,0.679828,-0.229182,-0.297421,0.167069,-0.170623,0.230117,0.054886,0.106871,-0.430541
C0002,-0.044462,1.0,-0.071887,-0.185539,0.762813,-0.233982,0.715066,-0.12675,0.143762,0.201014,...,-0.427132,-0.07323,0.101223,-0.370771,-0.05324,0.264383,0.117255,0.328704,0.284132,0.561415
C0003,0.384633,-0.071887,1.0,0.116931,-0.102985,0.481478,-0.005804,-0.011779,-0.078553,-0.166792,...,-0.010351,0.368762,-0.386084,-0.118588,0.764169,-0.326624,-0.083621,-0.154726,0.013536,-0.252293
C0004,0.480158,-0.185539,0.116931,1.0,-0.318612,0.176074,-0.286132,0.231179,-0.724479,-0.510934,...,0.292608,-0.070273,-0.133837,0.027894,0.419264,0.016855,-0.233069,-0.596312,-0.168406,0.00094
C0005,0.211053,0.762813,-0.102985,-0.318612,1.0,-0.398185,0.939504,-0.432702,0.301751,-0.098091,...,-0.264274,0.206699,0.23983,-0.393832,-0.316686,-0.040077,0.376705,0.431425,0.334481,0.230058


## Get recommendations
Using the similarity matrix, we generate a list of recommendations for each target customer, identifying the top similar customers.

In [14]:
# generate recommendations
recommendations = {}
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)] # C0001, C0002, ... , C0020

for cust in target_customers:
    if cust in content_sim_df:
        sim_scores = content_sim_df.loc[cust].drop(cust)
        top_matches = sim_scores.sort_values(ascending=False).head(3)
        recommendations[cust] = [(match, round(score, 4)) for match, score in top_matches.items()]

## Save recommendations to Lookalike.csv

In [15]:
# Convert recommendations dict to a mapping of customer IDs to their list of (similar customer, score)
rec_map = {cust: matches for cust, matches in recommendations.items()}

rec_df = pd.DataFrame({
    'CustomerID': list(rec_map.keys()),
    'Recommendations': [str(rec_map[cust]) for cust in rec_map]
})

rec_df.to_csv('../Reports/Lookalike.csv', index=False)
