Import all required libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

Load all required datasets

In [10]:
from google.colab import files

# Prompt for the three input CSVs one at a time, in this fixed order:
# customers, then transactions, then products. After each prompt, the first
# key of the returned mapping is passed to pandas as the path to read.
# NOTE(review): nothing here checks which file was picked at each prompt, so
# the upload order must match the order above.
uploaded = files.upload()
customers = pd.read_csv(list(uploaded)[0])

uploaded = files.upload()
transactions = pd.read_csv(list(uploaded)[0])

uploaded = files.upload()
products = pd.read_csv(list(uploaded)[0])


Saving Customers.csv to Customers (2).csv


Saving Transactions.csv to Transactions (2).csv


Saving Products.csv to Products (2).csv


Data Preprocessing

In [11]:
# Build a single transaction-level table by attaching product and customer
# attributes. Both joins are left joins, so every transaction row is retained
# even when its ProductID or CustomerID has no match in the lookup table.

# Step 1: transactions + products, matched on ProductID
transactions_products = transactions.merge(products, how='left', on='ProductID')

# Step 2: add the customer attributes, matched on CustomerID
merged_data = transactions_products.merge(customers, how='left', on='CustomerID')

Feature Engineering:

In [12]:
# Per-customer aggregates are all derived from the same grouping of the merged
# transaction table, so build it once and reuse it.
by_customer = merged_data.groupby('CustomerID')

# TotalSpent: sum of TotalValue over each customer's transactions
customer_spending = by_customer['TotalValue'].sum().reset_index(name='TotalSpent')

# AvgQuantity: mean Quantity over each customer's transactions
customer_quantity = by_customer['Quantity'].mean().reset_index(name='AvgQuantity')

# Favorite category: count transactions per (customer, category) pair, then
# keep the row with the highest count for each customer. idxmax returns the
# first maximal row, so a tie resolves to whichever category appears first in
# the counts table.
category_counts = (
    merged_data
    .groupby(['CustomerID', 'Category'])
    .size()
    .reset_index(name='Counts')
)
top_rows = category_counts.groupby('CustomerID')['Counts'].idxmax()
customer_favorite_category = category_counts.loc[top_rows, ['CustomerID', 'Category']]


Profile-based Features: Convert categorical variables such as Region into numerical form using encoding methods.

In [13]:
# One-hot encode the Region column of the customers table. drop_first=True
# omits the indicator for the first region level, leaving k-1 columns for k
# distinct regions.
# NOTE(review): customer_data is not referenced by any later cell visible in
# this notebook, so Region does not currently reach the similarity model.
region_columns = ['Region']
customer_data = pd.get_dummies(
    customers,
    columns=region_columns,
    drop_first=True,
)

Combine Features: Create a final dataset that includes all the engineered features for customers, merging them to form a comprehensive customer profile.

In [14]:
# Assemble one row per customer by joining the three engineered feature tables
# on CustomerID. No `how` is given, so pandas' default inner join applies and
# only customers present in all three tables are kept.
customer_profile = (
    customer_spending
    .merge(customer_quantity, on='CustomerID')
    .merge(customer_favorite_category, on='CustomerID')
)

Model Development – Similarity Calculation

In [15]:
# Standardize the numeric features before computing similarity.
# Only TotalSpent and AvgQuantity are passed to the scaler; the Category column
# of customer_profile is not part of the scaled matrix.
numeric_features = ['TotalSpent', 'AvgQuantity']
scaler = StandardScaler()
customer_profile_scaled = scaler.fit_transform(customer_profile[numeric_features])

In [16]:
# Pairwise cosine similarity between every pair of rows of the scaled feature
# matrix. Row and column positions follow the row order of customer_profile,
# because customer_profile_scaled was built from that frame.
similarity_matrix = cosine_similarity(X=customer_profile_scaled)

Recommend Similar Customers

In [17]:
# Lookalike Recommendations
#
# For every customer, record the `top_n` most similar *other* customers as a
# list of (CustomerID, similarity rounded to 4 decimals) tuples, ordered from
# most to least similar. The result is stored in `lookalike_map`, keyed by the
# customer's own CustomerID.
top_n = 3
lookalike_map = {}

# Row i of similarity_matrix corresponds to row i of customer_profile, so a
# plain list of IDs in profile order is all that is needed to translate matrix
# positions back to CustomerIDs (and avoids a per-row DataFrame lookup).
customer_ids = customer_profile['CustomerID'].tolist()

for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer's own entry by position, not by assuming it sorts
    # first: another customer can tie with (or, through floating-point
    # rounding, edge past) the self-similarity score, and a zero-norm feature
    # row has self-similarity 0. Slicing off position 0 after the sort would
    # then drop a genuine lookalike and keep the customer as their own match.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]

    # Highest similarity first; list.sort is stable, so tied scores keep
    # their original profile order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the top_n matches, mapped back to CustomerIDs.
    lookalike_map[customer_id] = [
        (customer_ids[other_idx], round(score, 4))
        for other_idx, score in candidates[:top_n]
    ]


Generate Lookalike CSV for the First 20 Customers

In [22]:
# Export the lookalike lists for the first 20 rows of customer_profile.
# Each output row pairs a CustomerID with its list of (CustomerID, score)
# tuples taken from lookalike_map.
first_twenty_ids = customer_profile['CustomerID'][:20]
lookalike_results = [
    [customer_id, lookalike_map[customer_id]]
    for customer_id in first_twenty_ids
]

lookalike_df = pd.DataFrame(lookalike_results, columns=['CustomerID', 'Lookalikes'])

# index=False keeps the DataFrame's row index out of the CSV.
lookalike_df.to_csv('Shreyas_Peherkar_Lookalike.csv', index=False)