In [1]:
#Data Preparation
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
#Loading the dataset
# All three input files live in the same folder. Point DATA_DIR at a different
# copy of the data instead of editing every read_csv call.
DATA_DIR = Path(r'C:\Users\Shraddha\Downloads\Zeotap Assignment')

customers = pd.read_csv(DATA_DIR / 'Customers.csv')
products = pd.read_csv(DATA_DIR / 'Products.csv')
transactions = pd.read_csv(DATA_DIR / 'Transactions.csv')

In [3]:
#Merging the datasets
# Two chained joins using pandas' default join type: every transaction row is
# extended with its customer's columns, then with its product's columns.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [4]:
#Feature Engineering
# Collapse the merged transaction rows to one row per customer. Named
# aggregation declares each output column next to the source column and
# reducer that produce it, so no separate column-renaming step is needed.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValueSum=('TotalValue', 'sum'),
        TotalValueMean=('TotalValue', 'mean'),
        TransactionCount=('TransactionID', 'count'),
        # mode() can return several values on a tie; [0] keeps the first one.
        TopCategory=('Category', lambda x: x.mode()[0]),
        Region=('Region', 'first'),
        SignupDate=('SignupDate', 'first'),
    )
    .reset_index()
)

In [5]:
#Normalizing
# Standardise the three numeric spend features and write the scaled values
# back into customer_features, replacing the raw ones.
numeric_cols = ['TotalValueSum', 'TotalValueMean', 'TransactionCount']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_cols])
customer_features[numeric_cols] = scaled_values

In [6]:
#Calculating similarity
# One-hot encode the two categorical columns, set aside the identifier and
# signup-date columns, then compare every customer against every other one.
encoded_features = pd.get_dummies(customer_features, columns=['Region', 'TopCategory'])
feature_matrix = encoded_features.drop(columns=['CustomerID', 'SignupDate'])
similarity_matrix = cosine_similarity(feature_matrix)

In [7]:
#Finding top 3 lookalikes
# Maps each CustomerID to a list of up to three (other CustomerID, similarity)
# tuples, most similar first.
#
# The customer's own row is removed by index rather than by assuming it sorts
# last: another customer with an identical feature vector (or floating-point
# rounding) can tie with or edge past the self-similarity, in which case
# slicing off "the last element" would drop a real match and keep the
# customer as their own lookalike.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    ranked_indices = similarity_matrix[i].argsort()[::-1]  # most similar first
    similar_indices = [idx for idx in ranked_indices if idx != i][:3]
    similar_customers = [
        # float() keeps NumPy scalar wrappers out of any later str()/repr() output.
        (customer_ids[idx], float(similarity_matrix[i, idx]))
        for idx in similar_indices
    ]
    lookalike_map[customer_id] = similar_customers

In [8]:
#Filtering first 20 customers
# Keep only the lookalike entries for the first 20 rows of customer_features,
# in row order.
first_customer_ids = customer_features['CustomerID'].head(20)
lookalike_subset = {cid: lookalike_map[cid] for cid in first_customer_ids}

In [9]:
#Saving to CSV
# One row per customer: the CustomerID and the string form of its
# [(lookalike_id, score), ...] list.
# Scores are cast to plain float before stringifying; under NumPy >= 2 the
# repr of a NumPy scalar is "np.float64(...)", which would otherwise be
# written into the CSV verbatim.
output_df = pd.DataFrame({
    'CustomerID': list(lookalike_subset),
    'Lookalikes': [
        str([(lookalike_id, float(score)) for lookalike_id, score in matches])
        for matches in lookalike_subset.values()
    ]
})
output_df.to_csv('Shraddha_Harihar_Lookalike.csv', index=False)