# Lookalike Model

## This notebook outlines the process of building a lookalike model that takes user's information as input and recommends 3 similar customers based on their profile and transaction history. The model will use both customer and product information, and assign a similarity score to each recommended customer.

In [1]:
#importing necessary libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

## Load Datasets

In [2]:
# load datasets
customers = pd.read_csv('../datasets/customers.csv')
products = pd.read_csv('../datasets/products.csv')
transactions = pd.read_csv('../datasets/transactions.csv')

## Feature Engineering

In [3]:
# One-hot encode the low-dimensional categorical features in customers DataFrame
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(customers[['Region']])
region_encoded_df = pd.DataFrame(region_encoded, columns=encoder.get_feature_names_out(['Region']))
customers_encoded = pd.concat([customers[['CustomerID', 'Region']], region_encoded_df], axis=1)
customers_encoded.drop(['Region'], axis=1, inplace=True)
# print(customers_encoded)

In [4]:
# Aggregate transaction data
transaction_summary = transactions.groupby('CustomerID').agg(
    TotalProductsPurchased = ('Quantity', 'sum'),
    TotalValue = ('TotalValue', 'sum'),
    TransactionsCount = ('CustomerID', 'count')
).reset_index()
# print(transaction_summary)

# Merge transaction summary with customers_encoded
customers_df = pd.merge(customers_encoded, transaction_summary, on='CustomerID', how='left')
customers_df.head()

Unnamed: 0,CustomerID,Region_Asia,Region_Europe,Region_North America,Region_South America,TotalProductsPurchased,TotalValue,TransactionsCount
0,C0001,0.0,0.0,0.0,1.0,12.0,3354.52,5.0
1,C0002,1.0,0.0,0.0,0.0,10.0,1862.74,4.0
2,C0003,0.0,0.0,0.0,1.0,14.0,2725.38,4.0
3,C0004,0.0,0.0,0.0,1.0,23.0,5354.88,8.0
4,C0005,1.0,0.0,0.0,0.0,7.0,2034.24,3.0


In [5]:
# Merge products data with transactions
products_transactions = pd.merge(transactions, products, on='ProductID', how='left')
# products_transactions.head()

product_summary = products_transactions.groupby('CustomerID').agg(
    MostCommonProduct = ('ProductID', lambda x: x.mode()[0]),
    UniqueCategoriesPurchased = ('Category', lambda x: x.unique())
).reset_index()
product_summary.head()

Unnamed: 0,CustomerID,MostCommonProduct,UniqueCategoriesPurchased
0,C0001,P022,"[Books, Home Decor, Electronics]"
1,C0002,P004,"[Home Decor, Clothing]"
2,C0003,P002,"[Home Decor, Clothing, Electronics]"
3,C0004,P008,"[Books, Home Decor, Electronics]"
4,C0005,P012,"[Home Decor, Electronics]"


In [6]:
# label ecoding
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['MostCommonProduct', 'UniqueCategoriesPurchased']
label_encoders = {}

le = LabelEncoder()
# encode MostCommonProduct
product_summary['MostCommonProduct'] = le.fit_transform(product_summary['MostCommonProduct'])
label_encoders['MostCommonProduct'] = le

print(label_encoders['MostCommonProduct'])

# convert object data type to iterable data type
product_summary['UniqueCategoriesPurchased'] = product_summary['UniqueCategoriesPurchased'].apply(np.array)
product_summary['UniqueCategoriesPurchased'] = product_summary['UniqueCategoriesPurchased'].apply(lambda x: str(x))

# encode UniqueCategoriesPurchased
product_summary["UniqueCategoriesPurchased"] = le.fit_transform(product_summary["UniqueCategoriesPurchased"])
label_encoders['UniqueCategoriesPurchased'] = le
product_summary.head()

LabelEncoder()


Unnamed: 0,CustomerID,MostCommonProduct,UniqueCategoriesPurchased
0,C0001,21,13
1,C0002,3,51
2,C0003,1,50
3,C0004,7,13
4,C0005,11,55


In [7]:
# merge product_summary with customers_df
df = pd.merge(customers_df, product_summary, on='CustomerID', how='left')  
df.head()

Unnamed: 0,CustomerID,Region_Asia,Region_Europe,Region_North America,Region_South America,TotalProductsPurchased,TotalValue,TransactionsCount,MostCommonProduct,UniqueCategoriesPurchased
0,C0001,0.0,0.0,0.0,1.0,12.0,3354.52,5.0,21.0,13.0
1,C0002,1.0,0.0,0.0,0.0,10.0,1862.74,4.0,3.0,51.0
2,C0003,0.0,0.0,0.0,1.0,14.0,2725.38,4.0,1.0,50.0
3,C0004,0.0,0.0,0.0,1.0,23.0,5354.88,8.0,7.0,13.0
4,C0005,1.0,0.0,0.0,0.0,7.0,2034.24,3.0,11.0,55.0


## Embedding Model 

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


# convert the dataframe to tensor
customer_features = df.drop(['CustomerID'], axis=1).values
customer_tensor = torch.tensor(customer_features, dtype=torch.float32)

# define autoencoder model
class SimpleAutoencoder(nn.Module):  
    def __init__(self, input_dim, embedding_dim):  
        super(SimpleAutoencoder, self).__init__()  
        self.encoder = nn.Linear(input_dim, embedding_dim)  
        self.decoder = nn.Linear(embedding_dim, input_dim)  

    def forward(self, x):  
        encoded = self.encoder(x)  
        decoded = self.decoder(encoded)  
        return encoded, decoded 

In [12]:
# initialize the model
embedding_dim = 128
autoencoder_model = SimpleAutoencoder(input_dim=customer_features.shape[1]-1, embedding_dim=embedding_dim)  

## Indexing and search

In [11]:
import faiss

# extract customer features
customer_features = df.drop(['CustomerID'], axis=1).values
print(customer_features)

# # generate embeddings
# model.eval()

# with torch.no_grad():
#     customer_embeddings = model.tower1(customer_tensor).numpy()

# normalize embeddings to unit length
customer_embeddings = customer_embeddings / np.linalg.norm(customer_embeddings, axis=1, keepdims=True)

# build FAISS index
index = faiss.IndexFlatL2(embedding_dim)
index.add(customer_embeddings)



[[ 0.  0.  0. ...  5. 21. 13.]
 [ 1.  0.  0. ...  4.  3. 51.]
 [ 0.  0.  0. ...  4.  1. 50.]
 ...
 [ 0.  1.  0. ...  2. 54. 37.]
 [ 0.  1.  0. ...  4.  7. 42.]
 [ 1.  0.  0. ...  5. 32. 21.]]


NameError: name 'customer_embeddings' is not defined




• Model Training: Train the embedding model using the prepared features. The model should ingest multi-modal features from different data sources to improve quality. A feed-forward network composed of multiple fully connected ReLU layers can output a 128-dimensional embedding as a customer representation. 
• Lookalike Customer Search: Transform the input user's information into an embedding vector using the trained model. Search the customer embedding space using the pre-built FAISS index to find the nearest neighbours (lookalike customers).... 
• Ranking and Filtering: Rank the lookalike customers based on their similarity scores (cosine distance) to the input user12.... Filter the results based on business rules or a separate classification model if needed12. 
• Output: Recommend the top three most similar customers along with their corresponding similarity scores. Additional considerations:
 • Model Evaluation: Evaluate the model using metrics like F1-score and PR-AUC, which are more suitable for imbalanced datasets than accuracy or ROC-AUC17.
 • Interpretability: Ensure the similarity metrics are interpretable and meaningful for business partners The similarity metrics we define can be very useful in improving and explaining marketing campaign performance
- Use Jaccard similarity
- make sure you use customer data and products data, if needed transaction data as well
- Give me just the code, if explaination needed, keep it to a minimum
- There should be placeholder code in your code