In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Read the three raw input tables from the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the top rows of every table, in load order
for table in (customers, products, transactions):
    print(table.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
# Attach each product's category to its transactions.
# Any 'Category' column left over from a previous run of this cell is dropped
# first so the cell is idempotent: merging twice would make pandas suffix the
# clashing columns (Category_x / Category_y) and the groupby below would then
# fail with a KeyError on 'Category'.
# validate='many_to_one' raises if a ProductID appears more than once in
# `products`, instead of silently duplicating transaction rows and inflating
# the per-customer totals.
transactions = transactions.drop(columns='Category', errors='ignore').merge(
    products[['ProductID', 'Category']],
    on='ProductID',
    validate='many_to_one',
)

# Aggregate spend and quantity per (customer, category) pair.
# Only total_spent feeds the profile below; total_quantity is kept for inspection.
customer_transactions = transactions.groupby(['CustomerID', 'Category']).agg(
    total_spent=('TotalValue', 'sum'),
    total_quantity=('Quantity', 'sum')
).reset_index()

# One row per customer, one column per product category, values = total spend.
# Categories a customer never bought from are filled with 0.
customer_profile = customer_transactions.pivot_table(
    index='CustomerID',
    columns='Category',
    values='total_spent',
    aggfunc='sum',
    fill_value=0
)

# Add the customer's Region to the profile (SignupDate is not included).
# reset_index() turns CustomerID into an ordinary column so the join key is
# explicit; validate='one_to_one' raises if `customers` lists a CustomerID more
# than once, which would otherwise duplicate profile rows.
customer_profile = customer_profile.reset_index().merge(
    customers[['CustomerID', 'Region']],
    on='CustomerID',
    validate='one_to_one',
)

In [3]:
# Standardize the per-category spend columns (numeric features only)
scaler = StandardScaler()
transaction_features = customer_profile.drop(['CustomerID', 'Region'], axis=1)
transaction_features_scaled = pd.DataFrame(
    scaler.fit_transform(transaction_features),
    # Reuse the source row labels: pd.concat(axis=1) below aligns on the index,
    # so a fresh 0..n-1 index would only line up when customer_profile happens
    # to carry a default RangeIndex.
    index=transaction_features.index,
    columns=transaction_features.columns,
)

# Carry CustomerID and Region alongside the scaled features.
# Region is kept for reference only: it is not encoded numerically here, so it
# does not contribute to any similarity computed from the scaled columns.
customer_profile_scaled = pd.concat([customer_profile[['CustomerID', 'Region']], transaction_features_scaled], axis=1)

In [4]:
# Axis labels for the similarity matrix: one CustomerID per profile row
customer_ids = customer_profile_scaled['CustomerID']

# Feature matrix = every column except the identifier and the region
customer_vectors = customer_profile_scaled.drop(columns=['CustomerID', 'Region'])

# Pairwise cosine similarity between all customer feature vectors
similarity_matrix = cosine_similarity(customer_vectors)

# Label both axes with CustomerID so scores can be looked up by ID
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Preview the first rows of the matrix
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.402215  0.648350  0.043313  0.661203 -0.960708   
C0002      -0.402215  1.000000  0.175482 -0.446094  0.257825  0.235584   
C0003       0.648350  0.175482  1.000000  0.328565  0.932178 -0.734670   
C0004       0.043313 -0.446094  0.328565  1.000000  0.092857 -0.005891   
C0005       0.661203  0.257825  0.932178  0.092857  1.000000 -0.814067   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.637812 -0.268011  0.171019 -0.381244  ... -0.059019  0.830892   
C0002       0.166689  0.470266  0.588281  0.703980  ... -0.527737 -0.050379   
C0003       0.996881  0.202597  0.198752 -0.372100  ... -0.448319  0.462407   
C0004       0.347577  0.112209 -0.725347 -0.913367  ...  0.068221 -0.451726   
C0005  

In [5]:
# Lookalike search parameters, kept together so they are easy to tune
N_TARGET_CUSTOMERS = 20  # targets are C0001 .. C0020
TOP_N_LOOKALIKES = 3     # number of lookalikes reported per target customer

# Target IDs are 'C' followed by a zero-padded 4-digit number, e.g. 'C0001'
target_customers = [f'C{number:04d}' for number in range(1, N_TARGET_CUSTOMERS + 1)]

# Map each target customer to a list of (lookalike_id, similarity_score) pairs
lookalike_map = defaultdict(list)

for customer_id in target_customers:
    # Rank every customer by similarity to the target, remove the target's own
    # entry, then keep the TOP_N_LOOKALIKES best remaining matches.
    similarities = similarity_df[customer_id].sort_values(ascending=False)
    top_similar_customers = similarities.drop(customer_id).head(TOP_N_LOOKALIKES)
    lookalike_map[customer_id].extend(top_similar_customers.items())

# Flatten the map into one [CustomerID, LookalikeID, SimilarityScore] row per pair
lookalike_data = [
    [customer_id, similar_customer_id, score]
    for customer_id, similar_customers in lookalike_map.items()
    for similar_customer_id, score in similar_customers
]

lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeID', 'SimilarityScore'])

# Inspect the results
print(lookalike_df.head())

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0091         0.988848
1      C0001       C0069         0.984344
2      C0001       C0184         0.978562
3      C0002       C0159         0.979511
4      C0002       C0036         0.956762


In [6]:
# Persist the lookalike table as CSV, leaving out the DataFrame's row index
lookalike_csv_path = 'Raman_Kumar_Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index=False)