In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Read the three raw tables (customers, products, transactions) from CSV
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach customer and product attributes to every transaction.
# Both joins are left joins, so a transaction is kept even when its
# CustomerID or ProductID has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Preview the merged table
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

**Feature Engineering**
We will build a profile for each customer that includes:

Demographics (region, signup date).

Transaction history (products purchased, spending, and purchase frequency).

In [None]:
# One-hot encode the region feature.
# Region is a nominal category, so each region gets its own 0/1 indicator
# column. Integer label codes (0, 1, 2, ...) would impose an arbitrary order
# and distance between regions on the similarity computation.
# `customers` itself is left unmodified so this cell can be re-run safely.
customer_demographics = customers.set_index('CustomerID')
region_features = pd.get_dummies(
    customer_demographics['Region'], prefix='Region', dtype=float
)

# Total amount spent by each customer in each product category
# (one column per category; categories a customer never bought are 0)
customer_spending = (
    merged_data
    .groupby(['CustomerID', 'Category'])['TotalValue']
    .sum()
    .unstack(fill_value=0)
)

# Merge demographics, region indicators and spending data on the CustomerID index.
# Customers with no rows in `customer_spending` get NaN in the spending columns.
customer_profiles = (
    customer_demographics
    .join(region_features)
    .join(customer_spending)
)

# Check customer profiles
print(customer_profiles.head())

                  CustomerName         Region  SignupDate  Region_encoded  \
CustomerID                                                                  
C0001         Lawrence Carroll  South America  2022-07-10               3   
C0002           Elizabeth Lutz           Asia  2022-02-13               0   
C0003           Michael Rivera  South America  2024-03-07               3   
C0004       Kathleen Rodriguez  South America  2022-10-09               3   
C0005              Laura Weber           Asia  2022-08-15               0   

              Books  Clothing  Electronics  Home Decor  
CustomerID                                              
C0001        114.60      0.00      2827.30      412.62  
C0002          0.00   1025.46         0.00      837.28  
C0003          0.00    122.36      1385.20     1217.82  
C0004       1888.48      0.00      1355.74     2110.66  
C0005          0.00      0.00      1180.38      853.86  


**Calculate Similarity Scores**

We'll use cosine similarity to calculate the similarity between customers based on their profiles.

In [None]:
# Replace every missing value in the profile table with 0
# (e.g. spending columns for customers absent from the spending table)
customer_profiles = customer_profiles.fillna(0)

# Descriptive columns that are not used as similarity features
non_feature_columns = ['CustomerName', 'SignupDate', 'Region']
feature_matrix = customer_profiles.drop(columns=non_feature_columns)

# Pairwise cosine similarity between all customer feature vectors,
# labelled on both axes by CustomerID
customer_ids = customer_profiles.index
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Inspect the similarity scores of one customer (C0001) as a sanity check
print(similarity_df['C0001'].head())

CustomerID
C0001    1.000000
C0002    0.091260
C0003    0.835992
C0004    0.548001
C0005    0.885670
Name: C0001, dtype: float64


**Generate Top 3 Lookalikes for Each Customer**

We’ll generate the top 3 most similar customers for each customer from C0001 to C0020.


In [None]:
# Generate the lookalike model for the first 20 customers.
# NOTE(review): `[:20]` takes the first 20 rows of `customers`; this equals
# C0001..C0020 only if the table is ordered by CustomerID - confirm.
lookalike_recommendations = defaultdict(list)

for customer_id in customers['CustomerID'][:20]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: another customer with an identical or proportional profile also
    # scores 1.0 (and rounding can reorder near-ties), in which case a
    # positional slice [1:4] could return the customer itself and drop a
    # genuine lookalike.
    other_customers = similarity_df[customer_id].drop(labels=customer_id)

    # Three highest similarity scores among the remaining customers
    top_3_similar_customers = other_customers.nlargest(3)

    # Store (lookalike id, score) pairs for this customer
    lookalike_recommendations[customer_id].extend(top_3_similar_customers.items())

# Flatten the recommendations into one row per (customer, lookalike) pair
lookalike_data = [
    [customer_id, similar_customer_id, score]
    for customer_id, recommendations in lookalike_recommendations.items()
    for similar_customer_id, score in recommendations
]

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Save the lookalike recommendations to a CSV
lookalike_df.to_csv('Lookalike.csv', index=False)