In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Read the three source tables from disk
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Enrich each transaction row with its customer and product attributes.
# Left joins keep every transaction even when a lookup finds no match.
transactions = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Feature engineering: aggregate the transactions per customer
by_customer = transactions.groupby('CustomerID')

# Total amount spent by each customer
total_spending = by_customer['TotalValue'].sum()

# Number of distinct products each customer has purchased
product_frequency = by_customer['ProductID'].nunique()

# One row per customer, with CustomerID turned back into a regular column
customer_profile = (
    total_spending
    .to_frame('TotalSpending')
    .assign(ProductFrequency=product_frequency)
    .reset_index()
)

# Standardize both features in place so neither dominates by raw magnitude
feature_columns = ['TotalSpending', 'ProductFrequency']
scaler = StandardScaler()
customer_profile[feature_columns] = scaler.fit_transform(
    customer_profile[feature_columns]
)

# Pairwise cosine similarity between all customer profiles;
# row/column i corresponds to row i of customer_profile
profile_matrix = customer_profile[feature_columns].values
similarity_matrix = cosine_similarity(profile_matrix)

# Generating lookalikes for the first 20 customers in customer_profile.
# NOTE(review): these are C0001 to C0020 only if each of those customers has
# at least one transaction; customer_profile is built from the transactions,
# so customers without purchases are absent -- confirm against the data.
TOP_N = 3                 # lookalikes to keep per customer
N_TARGET_CUSTOMERS = 20   # number of leading customers to generate lookalikes for

lookalikes = defaultdict(list)
customer_ids = customer_profile['CustomerID'].to_numpy()
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    similarities = similarity_matrix[idx]

    # Rank every customer from most to least similar. The stable sort keeps
    # ties in row order, so the output is deterministic.
    ranked = np.argsort(-similarities, kind='stable')

    # Drop the customer itself by index instead of assuming its self-similarity
    # sorts last: with only two features, other customers can tie at ~1.0 and
    # floating-point rounding may place the self entry anywhere among them.
    similar_indices = ranked[ranked != idx][:TOP_N]

    # Stored most-similar first as (lookalike CustomerID, similarity score)
    for sim_idx in similar_indices:
        lookalikes[customer_id].append((customer_ids[sim_idx], similarities[sim_idx]))

# Flatten the {customer: [(lookalike, score), ...]} mapping into one row per
# (customer, lookalike) pair and write the result to CSV
lookalike_data = [
    [cust_id, sim_customer, score]
    for cust_id, similar_list in lookalikes.items()
    for sim_customer, score in similar_list
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['cust_id', 'lookalike_cust_id', 'similarity_score'],
)
lookalike_df.to_csv('Sudip_Bhunia_Lookalike.csv', index=False)



In [None]:
'''
Explanation of the Code:
Data Preprocessing:
    The Customers.csv, Products.csv, and Transactions.csv files are loaded 
    and merged based on common identifiers (CustomerID, ProductID).
    
Feature Engineering:
    We create two features: TotalSpending (total amount spent by each customer) 
    and ProductFrequency (the number of distinct products purchased by each customer).
    
Similarity Calculation:
    We standardize the two features (zero mean, unit variance) and then compute the pairwise cosine similarity between customer profiles.
    This gives us a measure of how similar two customers are in terms of spending and product frequency.
    
Generating Lookalikes:
    For each of the first 20 customers (C0001 - C0020), we find the 3 other customers that are most similar to them according to the similarity matrix.
    
Saving the Output:
    The final recommendations are saved in the Sudip_Bhunia_Lookalike.csv file, which contains the columns cust_id, lookalike_cust_id, and similarity_score.

'''