# Lookalike Model

## This notebook outlines the process of building a lookalike model that takes user's information as input and recommends 3 similar customers based on their profile and transaction history. The model will use both customer and product information, and assign a similarity score to each recommended customer.

In [1]:
#importing necessary libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

## Load Datasets

In [2]:
# load datasets
customers = pd.read_csv('../datasets/customers.csv')
products = pd.read_csv('../datasets/products.csv')
transactions = pd.read_csv('../datasets/transactions.csv')

In [3]:
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


## Feature Engineering

### Customer Profile features

In [6]:
# One-hot encode Region
region_dummies = pd.get_dummies(customers['Region'], prefix='Region')
customers_encoded = pd.concat([customers, region_dummies], axis=1)
customers_encoded.drop('Region', axis=1, inplace=True)
# customers_encoded.head()

In [7]:
# customers signupdate to datetime
customers_encoded['SignupDate'] = pd.to_datetime(customers_encoded['SignupDate'])
# extract feature
customers_encoded['Account_Age_Days'] = (pd.Timestamp.now() - customers_encoded['SignupDate']).dt.days
# customers_encoded.head()

### Transaction-based features

In [8]:
# aggregate transaction metrics
transaction_features = transactions.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'Quantity': ['sum', 'mean', 'std'],
    'TotalValue': ['sum', 'mean', 'std'],
    'TransactionDate': ['min', 'max']
}).round(2)

# flatten column names
transaction_features.columns = ['_'.join(col).strip() for col in transaction_features.columns.values]
# transaction_features.head()

# rename columns
transaction_features= transaction_features.rename(columns={
    'TransactionID_count': 'Total_Transactions',
    'Quantity_sum': 'Total_Items_Purchased',
    'Quantity_mean': 'Avg_Items_Per_Transaction',
    'Quantity_std': 'Std_Items_Per_Transaction',
    'TotalValue_sum': 'Total_Spend',
    'TotalValue_mean': 'Avg_Transaction_Value',
    'TotalValue_std': 'Std_Transaction_Value',
    'TransactionDate_min': 'First_Purchase_Date',
    'TransactionDate_max': 'Last_Purchase_Date'
})

In [9]:
# convert date columns to datetime
transaction_features['First_Purchase_Date'] = pd.to_datetime(transaction_features['First_Purchase_Date'])
transaction_features['Last_Purchase_Date'] = pd.to_datetime(transaction_features['Last_Purchase_Date'])

# calculate days between first and last purchase
transaction_features['Purchase_Timespan_Days'] = (transaction_features['Last_Purchase_Date'] - transaction_features['First_Purchase_Date']).dt.days

transaction_features.head()

Unnamed: 0_level_0,Total_Transactions,Total_Items_Purchased,Avg_Items_Per_Transaction,Std_Items_Per_Transaction,Total_Spend,Avg_Transaction_Value,Std_Transaction_Value,First_Purchase_Date,Last_Purchase_Date,Purchase_Timespan_Days
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C0001,5,12,2.4,0.55,3354.52,670.9,456.64,2024-01-19 03:12:55,2024-11-02 17:04:16,288
C0002,4,10,2.5,1.0,1862.74,465.68,219.52,2024-02-28 07:44:21,2024-12-03 01:41:41,278
C0003,4,14,3.5,0.58,2725.38,681.34,559.28,2024-02-18 02:50:37,2024-08-24 18:54:04,188
C0004,8,23,2.88,1.13,5354.88,669.36,325.39,2024-02-28 10:16:35,2024-12-23 14:13:52,299
C0005,3,7,2.33,0.58,2034.24,678.08,310.82,2024-03-15 04:08:59,2024-11-04 00:30:22,233


### Product Category Features

In [10]:
trans_products = transactions.merge(products, on='ProductID', how='left')
# trans_products.head()
trans_products.drop('Price_x', axis=1, inplace=True)

In [11]:
trans_products=trans_products.rename(columns={
    'Price_y': 'Price'
})
# trans_products.head()

In [12]:
category_features = pd.get_dummies(trans_products['Category'])
category_features = category_features.mul(trans_products['Quantity'], axis=0)
category_features = category_features.groupby(trans_products['CustomerID']).sum()
category_features = category_features.add_prefix('Qty_Category_')
# category_features.head()

In [13]:
# merge all features
final_features = customers_encoded.merge(transaction_features, left_on='CustomerID', right_index=True, how='left')
final_features = final_features.merge(category_features, left_on='CustomerID', right_index=True, how='left')

# fill NaN values with 0 for customers with no transactions
final_features = final_features.fillna(0)

In [14]:
print("\nFinal Feature set shape: ", final_features.shape)
print("\nFeatures created: ", final_features.columns.tolist())
final_features.head()


Final Feature set shape:  (200, 22)

Features created:  ['CustomerID', 'CustomerName', 'SignupDate', 'Region_Asia', 'Region_Europe', 'Region_North America', 'Region_South America', 'Account_Age_Days', 'Total_Transactions', 'Total_Items_Purchased', 'Avg_Items_Per_Transaction', 'Std_Items_Per_Transaction', 'Total_Spend', 'Avg_Transaction_Value', 'Std_Transaction_Value', 'First_Purchase_Date', 'Last_Purchase_Date', 'Purchase_Timespan_Days', 'Qty_Category_Books', 'Qty_Category_Clothing', 'Qty_Category_Electronics', 'Qty_Category_Home Decor']


Unnamed: 0,CustomerID,CustomerName,SignupDate,Region_Asia,Region_Europe,Region_North America,Region_South America,Account_Age_Days,Total_Transactions,Total_Items_Purchased,...,Total_Spend,Avg_Transaction_Value,Std_Transaction_Value,First_Purchase_Date,Last_Purchase_Date,Purchase_Timespan_Days,Qty_Category_Books,Qty_Category_Clothing,Qty_Category_Electronics,Qty_Category_Home Decor
0,C0001,Lawrence Carroll,2022-07-10,False,False,False,True,965,5.0,12.0,...,3354.52,670.9,456.64,2024-01-19 03:12:55,2024-11-02 17:04:16,288.0,2.0,0.0,7.0,3.0
1,C0002,Elizabeth Lutz,2022-02-13,True,False,False,False,1112,4.0,10.0,...,1862.74,465.68,219.52,2024-02-28 07:44:21,2024-12-03 01:41:41,278.0,0.0,4.0,0.0,6.0
2,C0003,Michael Rivera,2024-03-07,False,False,False,True,359,4.0,14.0,...,2725.38,681.34,559.28,2024-02-18 02:50:37,2024-08-24 18:54:04,188.0,0.0,4.0,4.0,6.0
3,C0004,Kathleen Rodriguez,2022-10-09,False,False,False,True,874,8.0,23.0,...,5354.88,669.36,325.39,2024-02-28 10:16:35,2024-12-23 14:13:52,299.0,8.0,0.0,6.0,9.0
4,C0005,Laura Weber,2022-08-15,True,False,False,False,929,3.0,7.0,...,2034.24,678.08,310.82,2024-03-15 04:08:59,2024-11-04 00:30:22,233.0,0.0,0.0,4.0,3.0
