In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

Load the datasets and merge them into a single DataFrame: join Transactions with Products on `ProductID`, then join the result with Customers on `CustomerID`.

In [None]:

# Read the three raw CSV files (customers, products, transactions) from the
# working directory, in that order, into one DataFrame each.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [None]:

# Print a heading followed by the first rows of each raw table as a quick sanity check.
for heading, frame in (
    ("Customers Data:", customers),
    ("\nProducts Data:", products),
    ("\nTransactions Data:", transactions),
):
    print(heading)
    print(frame.head())

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127 

In [None]:

# Step 1: enrich every transaction with the attributes of the product bought.
# Both tables carry a Price column, so pandas suffixes them Price_x / Price_y.
transaction_product_data = transactions.merge(products, on='ProductID')

# Step 2: attach the profile of the customer who made each transaction.
merged_data = transaction_product_data.merge(customers, on='CustomerID')


In [None]:
# Show the first five merged rows to confirm both joins brought in the expected columns.
merged_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


The `Price` column is present in both Transactions.csv and Products.csv, so the merge produces two columns: `Price_x` (from Transactions) and `Price_y` (from Products). We will drop `Price_y` and rename `Price_x` to `Price`.

In [None]:
# Discard the duplicate price column that came from the products table.
merged_data = merged_data.drop(columns=['Price_y'])


In [None]:
# Give the remaining (transaction-side) price column its plain name back.
merged_data = merged_data.rename({'Price_x': 'Price'}, axis='columns')


# Aggregate Features for Each Customer
First, we will calculate numerical features.
Lookalike customers will have similar spending power, so we compare customers on the total amount they spent, the total quantity of items they purchased, and the mean and standard deviation of the prices of the products they bought.

In [None]:

# One row per customer with four spending features:
#   TotalValue - total amount spent
#   Quantity   - total number of items purchased
#   AvgPrice   - mean price of the purchased products
#   PriceStd   - standard deviation of those prices
customer_numerical_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        AvgPrice=('Price', 'mean'),
        PriceStd=('Price', 'std'),
    )
    .reset_index()
)

# The standard deviation is undefined (NaN) when a customer has a single
# transaction; treat that as "no price variation".
customer_numerical_features['PriceStd'] = customer_numerical_features['PriceStd'].fillna(0)

Lookalike customers will be interested in similar categories. Hence, we calculate each customer's category distribution: the quantity purchased per category, normalized by the customer's total quantity to get proportions.

In [None]:
# Quantity purchased per customer (rows) and category (columns);
# customer/category pairs with no purchases become 0.
category_quantities = pd.crosstab(
    index=merged_data['CustomerID'],
    columns=merged_data['Category'],
    values=merged_data['Quantity'],
    aggfunc='sum',
).fillna(0)

# Turn each row into proportions of that customer's total quantity.
# A row that sums to 0 would divide to NaN, so those are reset to 0.
quantity_per_customer = category_quantities.sum(axis=1)
category_distribution = category_quantities.div(quantity_per_customer, axis=0).fillna(0)

Even among customers who buy from the same categories, some will have purchased exactly the same products. We give a bit more preference to such customers, as they also share product preferences.

In [None]:
# Customer-by-product count table: how many transactions each customer has
# for each product (0 when the customer never bought it).
product_matrix = pd.crosstab(
    index=merged_data['CustomerID'],
    columns=merged_data['ProductID'],
)

# Pairwise cosine similarity between customers' product-count rows;
# customers who bought the same products score higher.
product_similarity = cosine_similarity(X=product_matrix)

Calculate the number of days since each customer's last purchase, measured from the most recent transaction date in the data, as similar customers will have similar purchase recency.


In [None]:
# Parse the transaction timestamps so date arithmetic is possible.
merged_data['TransactionDate'] = pd.to_datetime(merged_data['TransactionDate'])

# Reference point: the most recent transaction anywhere in the data.
latest_transaction = merged_data['TransactionDate'].max()

# Recency = whole days between each customer's last purchase and that reference point.
last_purchase_date = merged_data.groupby('CustomerID')['TransactionDate'].max()
last_purchase = (
    (latest_transaction - last_purchase_date)
    .dt.days
    .rename('Recency')
    .reset_index()
)

Merge all these features.

In [None]:
# Build the per-customer feature table, indexed by CustomerID:
# category proportions, then the spending features, then recency.
customer_features = (
    category_distribution
    .join(customer_numerical_features.set_index('CustomerID'))
    .join(last_purchase.set_index('CustomerID'))
)

Standardize the numerical features (zero mean, unit variance) so that no single feature dominates because of its scale.


In [None]:
# Columns that are on arbitrary scales and need standardizing; the category
# proportions are already in [0, 1] and are left untouched.
numerical_cols = ['TotalValue', 'Quantity', 'AvgPrice', 'PriceStd', 'Recency']

# Fit the scaler on these columns and overwrite them with their standardized values.
scaler = StandardScaler()
scaler.fit(customer_features[numerical_cols])
customer_features[numerical_cols] = scaler.transform(customer_features[numerical_cols])

In [None]:
# Wrap the customer-by-customer product similarity matrix in a DataFrame.
# It is labelled with the index of product_matrix, the frame it was computed
# from, so every row/column is guaranteed to carry the right CustomerID.
# (The category crosstab is built with values/aggfunc and could, in principle,
# end up with a different set of customers.)
product_similarity_df = pd.DataFrame(
    product_similarity,
    index=product_matrix.index,
    columns=product_matrix.index,
)

# Append one similarity column per customer to the feature table
# (left join on the CustomerID index of customer_features).
customer_features = customer_features.join(product_similarity_df)

Assign Weights
Assign a weight to each feature group built in the steps above. Spending power (the numerical features) gets a total weight of 0.5. Category interests would get the other 0.5, but even among customers with similar category interests, those who bought the same products should count a bit more. So the category proportions get 0.4 and product overlap gets the remaining 0.1. Each group's weight is split equally among its columns.

In [None]:

# Number of columns in each feature group.
n_category_cols = len(category_distribution.columns)
n_numerical_cols = len(numerical_cols)
n_product_cols = product_similarity_df.shape[1]

# Group budgets (0.4 category, 0.5 numerical, 0.1 product overlap),
# each spread evenly over the columns of its group.
category_weight = 0.4 / n_category_cols
numerical_weight = 0.5 / n_numerical_cols
product_weight = 0.1 / n_product_cols

# One weight per feature column, in the same order as the columns of
# customer_features: categories, numerical features, product similarities.
weights = (
    [category_weight] * n_category_cols
    + [numerical_weight] * n_numerical_cols
    + [product_weight] * n_product_cols
)

In [None]:
# Raw feature matrix: one row per customer, one column per feature.
features = customer_features.to_numpy()

# Scale every column by its weight (the weight list broadcasts across rows).
weighted_features = features * weights

Calculate the cosine similarity

In [None]:
# Pairwise cosine similarity between customers' weighted feature rows;
# entry [i, j] is the similarity between customer i and customer j.
similarity_matrix = cosine_similarity(X=weighted_features)

Find the top 3 lookalikes for each of the first 20 customers and save them to a CSV file.

In [None]:

# Map every customer to its three most similar *other* customers,
# as a list of (CustomerID, similarity score) tuples.
customer_ids = customer_features.index
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # Rank all other customers by similarity, highest first.
    # The customer itself is excluded by position, not by score.
    ranked = sorted(
        ((j, score) for j, score in enumerate(similarity_matrix[i]) if j != i),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Cast scores to built-in float before rounding: NumPy scalars are written
    # to the CSV as "np.float64(0.1234)" under NumPy >= 2, which makes the
    # Lookalikes column hard to read and parse.
    lookalike_map[customer_id] = [
        (customer_ids[j], round(float(score), 4)) for j, score in ranked[:3]
    ]

# Only the first 20 customers (in index order) are exported.
exported_ids = list(customer_ids[:20])
lookalike_data = {
    "CustomerID": exported_ids,
    "Lookalikes": [lookalike_map[customer_id] for customer_id in exported_ids],
}

# One row per exported customer; the Lookalikes cell holds the list of tuples.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model results saved to Lookalike.csv")


Lookalike model results saved to Lookalike.csv
