# Customer Lookalike Model Analysis

This notebook implements a lookalike model to find similar customers based on their profiles and transaction history.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import json

In [2]:
# Read the three raw tables from the current working directory.
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Tenure feature: whole days elapsed between signup and a fixed reference date.
current_date = datetime(2025, 1, 27)
customers_df = customers_df.assign(SignupDate=pd.to_datetime(customers_df['SignupDate']))
customers_df = customers_df.assign(
    days_on_platform=lambda frame: (current_date - frame['SignupDate']).dt.days
)

# One-hot encode Region; the indicator columns are named 'region_<value>'.
region_dummies = pd.get_dummies(customers_df['Region'], prefix='region')
customers_df = pd.concat((customers_df, region_dummies), axis='columns')

# Parse transaction timestamps and attach product attributes to every transaction.
# The merge uses pandas' default inner join on ProductID.
transactions_df = transactions_df.assign(
    TransactionDate=pd.to_datetime(transactions_df['TransactionDate'])
)
trans_products = pd.merge(transactions_df, products_df, on='ProductID')

# Per-customer aggregates: transaction count, quantity and spend totals/means,
# plus the distinct product categories the customer has bought from.
customer_features = (
    trans_products
    .groupby('CustomerID')
    .agg(
        num_transactions=('TransactionID', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        total_spend=('TotalValue', 'sum'),
        avg_spend=('TotalValue', 'mean'),
        categories=('Category', lambda values: list(set(values))),
    )
    .reset_index()
)

# One 0/1 flag per category present in the product table
# (1 = the customer bought from that category at least once).
all_categories = products_df['Category'].unique()
for category in all_categories:
    customer_features[f'category_{category}'] = customer_features['categories'].apply(
        lambda bought: int(category in bought))

# The helper list column has served its purpose.
customer_features = customer_features.drop(columns='categories')

# Left-join so every customer is kept, then replace every NaN in the merged
# frame with 0 (customers without aggregated transactions get all-zero features).
# NOTE(review): fillna(0) is applied to all columns, not only the feature columns.
final_features = pd.merge(
    customers_df, customer_features, on='CustomerID', how='left'
).fillna(0)

In [3]:
# Assemble the model inputs: the six numeric features first, then every
# one-hot region column, then every category flag column.
feature_columns = [
    'days_on_platform',
    'num_transactions',
    'total_quantity',
    'avg_quantity',
    'total_spend',
    'avg_spend',
]
feature_columns += [name for name in final_features.columns if name.startswith('region_')]
feature_columns += [name for name in final_features.columns if name.startswith('category_')]

# Standardise each feature column so no single scale dominates the similarity.
scaler = StandardScaler()
scaler.fit(final_features[feature_columns])
features_scaled = scaler.transform(final_features[feature_columns])

# Pairwise cosine similarity between all customers (row i vs. row j).
similarity_matrix = cosine_similarity(features_scaled)

In [4]:
def generate_lookalike_map(final_features, similarity_matrix, target_customers, top_n=3):
    """Return the most similar customers for each requested target customer.

    Parameters
    ----------
    final_features : pandas.DataFrame
        Frame with a 'CustomerID' column. Row *position* i must correspond to
        row/column i of ``similarity_matrix``; the frame's index labels are
        ignored, so a filtered or re-indexed frame is handled correctly.
    similarity_matrix : array-like, shape (n_customers, n_customers)
        Pairwise similarity scores.
    target_customers : iterable
        CustomerID values to look up.
    top_n : int, default 3
        Number of lookalikes to return per target.

    Returns
    -------
    dict
        ``{target_id: [[customer_id, score], ...]}`` with at most ``top_n``
        entries per target, ordered by descending score. Scores are plain
        Python floats rounded to 4 decimals.

    Raises
    ------
    IndexError
        If a target id does not occur in ``final_features['CustomerID']``.
    """
    customer_ids = final_features['CustomerID'].to_numpy()
    scores = np.asarray(similarity_matrix, dtype=float)
    lookalike_map = {}

    for target_id in target_customers:
        # Positional lookup: matrix rows follow row order, not index labels.
        hits = np.flatnonzero(customer_ids == target_id)
        if hits.size == 0:
            raise IndexError(f"CustomerID {target_id!r} not found in final_features")
        target_pos = hits[0]
        similarities = scores[target_pos]

        # Stable descending sort, then drop the target explicitly. Relying on
        # "self is always ranked first" fails when another customer ties with
        # (or, through rounding, exceeds) the self-similarity.
        ranked = np.argsort(-similarities, kind='stable')
        neighbours = ranked[ranked != target_pos][:top_n]

        lookalike_map[target_id] = [
            [customer_ids[pos], round(float(similarities[pos]), 4)]
            for pos in neighbours
        ]

    return lookalike_map

# Targets are the first twenty customer IDs: C0001 .. C0020.
target_customers = [f'C{i:04d}' for i in range(1, 21)]

# Look up the nearest neighbours of every target customer.
lookalike_map = generate_lookalike_map(final_features, similarity_matrix, target_customers)

# Persist the map. NOTE(review): the file is named 'Lookalike.csv' but its
# content is a single JSON object, not comma-separated rows.
with open('Lookalike.csv', 'w') as f:
    json.dump(lookalike_map, f)

# Preview the first three targets and their lookalikes.
print("Sample of lookalike map:")
for customer_id in target_customers[:3]:
    print(f"\n{customer_id}: {lookalike_map[customer_id]}")

Sample of lookalike map:

C0001: [['C0174', 0.9786], ['C0152', 0.9675], ['C0004', 0.6992]]

C0002: [['C0159', 0.9489], ['C0134', 0.9094], ['C0106', 0.7741]]

C0003: [['C0129', 0.9211], ['C0195', 0.8585], ['C0091', 0.8527]]
