In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

## Loading datasets


In [None]:
# Read the customer, product and transaction tables from CSV files addressed
# relative to the notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

># Top 3 products purchased by each customer
## Products purchased by each customer

In [None]:
# Total quantity bought for every (customer, product) pair, one row per pair.
prods_purchased = (
    transactions
    .groupby(['CustomerID', 'ProductID'], as_index=False)
    .agg(PurchaseCount=('Quantity', 'sum'))
)
#prods_purchased

## Products purchased by each customer, ordered by count (DESC)

In [None]:
# Order rows by customer, and within each customer from the most to the least
# purchased product.
top_products = prods_purchased.sort_values(
    by=['CustomerID', 'PurchaseCount'],
    ascending=[True, False],
)
#top_products

## Top 3 products purchased by each customer

In [None]:
# Keep the first three rows of each customer's block. top_products is already
# sorted by descending PurchaseCount, so these are the three most purchased
# products; row order and index labels are preserved.
top3_per_customer = top_products[top_products.groupby('CustomerID').cumcount() < 3]
#top3_per_customer

## Top 3 prods converted to set

In [None]:
# Collapse each customer's top products into a single Python set, giving one
# row per customer with the columns CustomerID and ProductID.
customer_top3_sets = (
    top3_per_customer
    .groupby('CustomerID')['ProductID']
    .apply(lambda product_ids: set(product_ids))
    .reset_index()
)
#customer_top3_sets

># Total Spending by each customer

In [None]:
# Sum of TotalValue over all of a customer's transactions.
total_spending = (
    transactions
    .groupby('CustomerID', as_index=False)
    .agg(TotalSpending=('TotalValue', 'sum'))
)
#total_spending

># Average Spending by each customer per transaction

In [None]:
# Mean TotalValue per transaction for each customer.
avg_spending = (
    transactions
    .groupby('CustomerID', as_index=False)
    .agg(AvgSpending=('TotalValue', 'mean'))
)
#avg_spending

># Number of transactions by each customer

In [None]:
# Number of non-null TransactionID values recorded for each customer.
transaction_frequency = (
    transactions
    .groupby('CustomerID', as_index=False)
    .agg(TransactionFrequency=('TransactionID', 'count'))
)
#transaction_frequency

># Number of days since last purchase

## Finding date of latest purchase

In [None]:
# Parse the date column into pandas datetimes; the column is overwritten in
# place on the transactions frame.
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)

# Most recent transaction date for each customer.
last_purchase = (
    transactions
    .groupby('CustomerID', as_index=False)
    .agg(LastPurchaseDate=('TransactionDate', 'max'))
)
#last_purchase

## Recency in days of latest purchase

In [None]:
# Recency = whole days between a customer's last purchase and the most recent
# purchase anywhere in the data set.
# The reference point is taken from the data rather than from the wall clock so
# that the feature (and everything derived from it) is identical on every run;
# measuring against "now" would shift the values each day the notebook is
# executed. The customer with the latest purchase therefore gets Recency == 0.
last_purchase['Recency'] = (
    last_purchase['LastPurchaseDate'].max() - last_purchase['LastPurchaseDate']
).dt.days
#last_purchase

># Weighted categories purchased by each customer

## Merge Transactions and Products

In [None]:
# Attach the product columns to every transaction (inner join on ProductID).
trans_prods = pd.merge(transactions, products, how='inner', on='ProductID')
#trans_prods

## Count of each category per customer

In [None]:
# Units bought per (customer, category), then ordered by customer and, within
# each customer, from the most to the least purchased category.
category_counts = (
    trans_prods
    .groupby(['CustomerID', 'Category'], as_index=False)
    .agg(Quantity=('Quantity', 'sum'))
    .sort_values(by=['CustomerID', 'Quantity'], ascending=[True, False])
)
#category_counts

## Total (of all categories) per customer

In [None]:
# Units bought per customer across all categories.
total_quantity_per_customer = (
    category_counts
    .groupby('CustomerID', as_index=False)
    .agg(TotalQuantity=('Quantity', 'sum'))
)
#total_quantity_per_customer

## Weight per each category

In [None]:
# Attach each customer's total quantity and express every category as a share
# of that total.
# Columns left behind by an earlier run of this cell are dropped first: merging
# a second time would otherwise produce TotalQuantity_x / TotalQuantity_y and
# the division below would raise a KeyError. On a first run the drop is a no-op.
category_counts = (
    category_counts
    .drop(columns=['TotalQuantity', 'CategoryWeight'], errors='ignore')
    .merge(total_quantity_per_customer, on='CustomerID')
)
category_counts['CategoryWeight'] = category_counts['Quantity'] / category_counts['TotalQuantity']
#category_counts

## Weight Matrix of prods category for each customer

In [None]:
# Reshape to one row per customer and one column per category holding the
# category weight; combinations that never occur become 0.
weighted_category_matrix = (
    category_counts
    .set_index(['CustomerID', 'Category'])['CategoryWeight']
    .unstack('Category')
    .fillna(0)
)
#weighted_category_matrix

># Merging features considered

In [None]:
# Combine the customer table with every engineered feature, keyed on CustomerID.
# The first four merges use the default inner join; the category weights are
# left-joined so customers absent from the weight matrix are kept.
# NOTE(review): the trailing fillna(0) applies to every column of the merged
# frame, not only to the category weights — confirm that this is intended.
customer_features = (
    customers
    .merge(total_spending, on='CustomerID')
    .merge(avg_spending, on='CustomerID')
    .merge(transaction_frequency, on='CustomerID')
    .merge(last_purchase[['CustomerID', 'Recency']], on='CustomerID')
    .merge(weighted_category_matrix, on='CustomerID', how='left')
    .fillna(0)
)
customer_features

## Check for missing values

In [None]:
# Count the missing entries in each column of the merged feature table.
customer_features.isna().sum()

## Check for duplicate values

In [None]:
# Boolean mask: True for every row whose CustomerID already appeared in an
# earlier row of customer_features.
duplc = customer_features.duplicated(subset='CustomerID')

# Show the number of duplicated CustomerIDs (0 means every customer is unique).
# The previous version sorted the mask without keeping the result and then
# displayed the full unsorted mask, in which duplicates are easy to miss.
duplc.sum()

In [None]:
# Display the merged feature table as this cell's output.
customer_features

># Feature Scaling

## One-hot Encoding for Categorical cols: 'Region'

In [None]:
# Replace the Region column with one indicator column per region value; the new
# columns are named 'Region_<value>'.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['Region'],
    prefix_sep='_',
)
customer_features

In [None]:
# Names of the indicator columns created for Region (those starting with 'Region_').
region_cols = customer_features.filter(regex=r'^Region_').columns.tolist()
region_cols

## Standardization of Numeric cols

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize; the scaled values overwrite the originals in
# customer_features.
numerical_features = ['TotalSpending', 'AvgSpending', 'TransactionFrequency', 'Recency']

scaler = StandardScaler()
scaler.fit(customer_features[numerical_features])
customer_features[numerical_features] = scaler.transform(customer_features[numerical_features])
#customer_features

># Calculations of Similarities using Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature columns used for the comparison: the scaled numeric features, the
# category weights and the Region indicator columns.
# The category column names are read from weighted_category_matrix instead of
# being spelled out, so a data set with a different set of categories neither
# raises a KeyError nor silently leaves a category out.
features = customer_features[
    numerical_features + list(weighted_category_matrix.columns) + region_cols
]

# Pairwise cosine similarity between customers; entry [i, j] compares row i and
# row j of `features`.
similarity_matrix = cosine_similarity(features)
similarity_matrix

## Setting CustomerID as index 

In [None]:
# Same table as customer_features, with CustomerID moved from a column to the
# row index.
feature_matrix = customer_features.set_index(keys='CustomerID', drop=True)

## DataFrame for similarities

In [None]:
# Label both axes of the similarity matrix with the customer IDs so scores can
# be looked up by ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=feature_matrix.index,
    columns=feature_matrix.index,
)
similarity_df

># Mapping top 3 lookalike customers for each customer

In [None]:
# Build {customer_id: [(lookalike_id, similarity), ...]} for the first 20 rows
# of feature_matrix (C0001 - C0020, assuming the rows follow the customer file's
# order — TODO confirm).
lookalikes = {}
for customer_id in feature_matrix.index[:20]:
    # Remove the customer's own entry by label instead of skipping position 0
    # of the sorted scores: with a tie at the top (or float rounding) another
    # customer can sort first, and the customer would then be listed as their
    # own lookalike while a real match is dropped.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .iloc[:3]  # Get top 3 similar customers
    )
    # float() converts the NumPy scalar to a plain Python float, so a later
    # str() of this list prints 0.1234 rather than np.float64(0.1234) on
    # NumPy >= 2.
    lookalikes[customer_id] = [
        (other_id, round(float(score), 4)) for other_id, score in similar_customers.items()
    ]

## DataFrame for lookalikes

In [None]:
# One row per customer: the ID and the string form of its list of
# (lookalike_id, score) tuples.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes),
    'LookalikeCustomers': list(map(str, lookalikes.values())),
})
lookalike_df

## Save lookalikes df to .csv file

In [None]:
# Write the lookalike table to Lookalike.csv in the working directory, without
# the row index.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)