In [2]:
# Lookalike Model Development

## Importing Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

## Loading Data
# Read the three raw CSV exports: customer master data, product catalogue, and transactions.
customers = pd.read_csv('/Volumes/DATA/Assignment/Data/Customers - Customers.csv')
products = pd.read_csv('/Volumes/DATA/Assignment/Data/Products - Products.csv')
transactions = pd.read_csv('/Volumes/DATA/Assignment/Data/Transactions - Transactions.csv')

## Merging Datasets
# Enrich each transaction with its customer's attributes, then with its product's attributes.
# Both joins are inner joins, so transactions without a matching CustomerID / ProductID are dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)


In [3]:
# Inspect the merged schema. A Price column exists on both sides of the product merge, so
# pandas applies its default suffixes: Price_x is the left-hand (transaction-side) column and
# Price_y is the one contributed by `products`.
merged_data.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')

In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Lookalike model: build one feature vector per customer, compare customers with cosine
# similarity, and export the top matches for the first N customers.

# Tunable parameters for the recommendation step
N_TARGET_CUSTOMERS = 20   # lookalikes are generated for the first 20 customers
TOP_K = 3                 # number of lookalikes kept per customer
OUTPUT_PATH = "Subasini_K_Lookalike.csv"

# Load merged data
# Work on a copy so the feature engineering below never mutates `merged_data`
# (the frame that an earlier cell built and displayed).
data = merged_data.copy()

# Feature Engineering
# Calculate days since signup.
# The absolute value depends on the run date, but the column is mean-centred by the
# StandardScaler below, so a uniform shift of the reference date does not change the result.
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['DaysSinceSignup'] = (datetime.now() - data['SignupDate']).dt.days

# Aggregate features per customer
customer_features = data.groupby("CustomerID").agg({
    'Region': 'first',  # Region stays the same for a customer
    'TotalValue': 'mean',  # Average transaction value
    'Quantity': 'sum',  # Total quantity purchased
    'Category': lambda x: x.mode()[0],  # Most frequent product category (first mode on ties)
    'Price_y': 'mean',  # Average product price (product-side Price column)
    'DaysSinceSignup': 'first'  # Days since signup
}).reset_index()

# One-hot encode categorical features (Region and Category)
customer_features = pd.get_dummies(customer_features, columns=['Region', 'Category'], drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_cols = ['TotalValue', 'Quantity', 'Price_y', 'DaysSinceSignup']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Similarity Computation
customer_ids = customer_features['CustomerID']
# Plain Python list for positional lookups, so results never depend on the Series' index labels.
customer_id_list = customer_ids.tolist()
# Force a homogeneous float matrix: the dummy columns may be bool, which would otherwise
# produce an object-dtype array next to the float columns.
features = customer_features.drop(columns=['CustomerID']).to_numpy(dtype=float)
similarity_matrix = cosine_similarity(features)

# Generate Lookalike Recommendations
lookalike_map = {}

for i, cust_id in enumerate(customer_id_list[:N_TARGET_CUSTOMERS]):
    scores = similarity_matrix[i]
    # Rank every customer by descending similarity; the stable sort keeps the original
    # customer order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the customer itself by position. Assuming "self is always ranked first" breaks
    # when another customer ties with (or rounds above) the self-similarity, which would
    # list the customer as its own lookalike.
    top_matches = [j for j in ranked if j != i][:TOP_K]
    # Convert scores to built-in floats so the exported text contains plain numbers rather
    # than NumPy scalar reprs such as `np.float64(0.98)`.
    lookalike_map[cust_id] = [
        (customer_id_list[j], round(float(scores[j]), 4)) for j in top_matches
    ]

# Save Lookalike.csv
# One row per target customer; the Lookalikes column holds the list of (CustomerID, score) pairs.
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv(OUTPUT_PATH, index=False)

# Report the file that was actually written.
print(f"{OUTPUT_PATH} has been created.")


Shape of features array: (199, 10)
LookalikeRecommendations.xlsx has been created with different similarity metrics.
