Task 2: Lookalike Model

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Reload the datasets
customers = pd.read_csv('/Users/aman_r/Documents/Data Science Assignment/DataSets/Customers.csv')
products = pd.read_csv('/Users/aman_r/Documents/Data Science Assignment/DataSets/Products.csv')
transactions = pd.read_csv('/Users/aman_r/Documents/Data Science Assignment/DataSets/Transactions.csv')

# Merge Customers and Transactions data on CustomerID
merged_data = transactions.merge(customers, on='CustomerID')

# Encode categorical variables for similarity calculation
label_encoder = LabelEncoder()
merged_data['Region_Encoded'] = label_encoder.fit_transform(merged_data['Region'])

# Aggregate features for each customer
customer_features = merged_data.groupby('CustomerID').agg({
    'Region_Encoded': 'first',  # Region as a categorical feature
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',  # Total quantity purchased
}).reset_index()

# Normalize numerical features
customer_features[['TotalValue', 'Quantity']] = customer_features[['TotalValue', 'Quantity']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# Create a feature matrix
feature_matrix = customer_features[['Region_Encoded', 'TotalValue', 'Quantity']].values

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

# Retrieve top 3 similar customers for CustomerIDs C0001–C0020
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

for idx, customer_id in enumerate(customer_ids[:20]):  # First 20 customers
    # Get similarity scores for the current customer
    scores = similarity_matrix[idx]
    # Get top 3 similar customers excluding itself
    top_indices = np.argsort(scores)[::-1][1:4]
    lookalike_results[customer_id] = [(customer_ids[i], scores[i]) for i in top_indices]

# Convert results to a DataFrame for easier export
lookalike_df = pd.DataFrame([{'CustomerID': key, 'Lookalikes': value} for key, value in lookalike_results.items()])

# Display the lookalike results for review
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0107, 0.9999653443811154), (C0137, 0.999944..."
1,C0002,"[(C0043, 0.9996660509784008), (C0136, 0.999606..."
2,C0003,"[(C0157, 0.9999852171328136), (C0029, 0.999982..."
3,C0004,"[(C0132, 0.9999482716616489), (C0113, 0.999822..."
4,C0005,"[(C0092, 0.9999995165903441), (C0022, 0.999997..."
