This implementation:
- Uses explainable features for similarity matching
- Handles both categorical and numerical data
- Provides similarity scores for transparency
- Follows best practices for recommendation systems
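- Computes a dense pairwise cosine-similarity matrix (memory grows quadratically with the number of customers), which suits small-to-medium datasets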

In [14]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the three input tables from CSV files in the working directory.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)

In [15]:
# Parse date columns
# Fixed reference date; customer tenure is measured against it.
current_date = pd.to_datetime('2025-01-27')

# Replace each raw date column with its parsed datetime equivalent, in place.
for frame, date_column in ((customers, 'SignupDate'),
                           (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])


In [16]:
# Join each transaction to its product row, then to its customer row.
# Both joins use pandas' default (inner) join on the shared key column.
transactions_with_products = pd.merge(transactions, products, on='ProductID')
merged = pd.merge(transactions_with_products, customers, on='CustomerID')

In [18]:
# Feature Engineering
# Every per-customer table built below is keyed by CustomerID.


def _first_mode(values):
    """Return the most frequent value; ties resolve to the first mode pandas reports."""
    return values.mode()[0]


# 1. Customer tenure: days since signup, floor-divided by 30 (approximate months).
days_since_signup = (current_date - customers['SignupDate']).dt.days
customers['Tenure'] = days_since_signup // 30

transactions_by_customer = merged.groupby('CustomerID')

# 2. Transaction behaviour: how often, how much, and in which category most.
transaction_features = transactions_by_customer.agg(
    Total_Transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    Total_Spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    Favorite_Category=pd.NamedAgg(column='Category', aggfunc=_first_mode),
).reset_index()

# 3. Product interaction: breadth of catalogue touched and mean price paid.
# 'Price_x' is the suffixed name pandas gives the left-hand 'Price' column when
# both merged tables carry one -- presumably the transaction-side price; verify
# against the CSV schemas.
product_features = transactions_by_customer.agg(
    Unique_Products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    Avg_Product_Price=pd.NamedAgg(column='Price_x', aggfunc='mean'),
).reset_index()

# Combine everything into one row per customer. These are inner joins, so a
# customer without any transaction does not appear in `features`.
features = pd.merge(customers, transaction_features, on='CustomerID')
features = pd.merge(features, product_features, on='CustomerID')


In [19]:
# Preprocessing
# One-hot encode the two categorical columns; pandas names each new column
# '<original column>_<value>'.
categorical_cols = ['Region', 'Favorite_Category']
features = pd.get_dummies(features, columns=categorical_cols)

# Model inputs: the numeric features followed by every one-hot column.
numeric_cols = ['Tenure', 'Total_Transactions', 'Total_Spent',
                'Unique_Products', 'Avg_Product_Price']
dummy_prefixes = ('Region_', 'Favorite_Category_')
dummy_cols = [name for name in features.columns
              if name.startswith(dummy_prefixes)]
feature_cols = numeric_cols + dummy_cols

In [20]:
# Normalize features
# Standardize every selected column with StandardScaler's default settings so
# that no single feature dominates the similarity computation by scale alone.
model_inputs = features[feature_cols]
scaler = StandardScaler().fit(model_inputs)
scaled_features = scaler.transform(model_inputs)


In [21]:
# Pairwise cosine similarity between all rows of `scaled_features`;
# entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(X=scaled_features)

In [22]:
# Generate recommendations
# For each of the first 20 customers (C0001..C0020), list the three most
# similar *other* customers together with their cosine-similarity score,
# formatted as "<CustomerID>(<score>)".
N_LOOKALIKES = 3
results = []
target_customers = [f'C{i:04d}' for i in range(1, 21)]

# Row k of `similarity_matrix` corresponds to the k-th row of `features` by
# position, so map each CustomerID to its *position* (not its index label).
# Reading the IDs once also avoids building a row Series per candidate.
customer_ids = features['CustomerID'].tolist()
position_of = {}
for pos, cid in enumerate(customer_ids):
    position_of.setdefault(cid, pos)  # keep the first occurrence

# Targets with no row in `features` (for example, customers dropped by an
# inner merge because they have no transactions) are skipped, not fatal.
missing_customers = []

for cust_id in target_customers:
    idx = position_of.get(cust_id)
    if idx is None:
        missing_customers.append(cust_id)
        continue

    scores = similarity_matrix[idx]
    # Stable sort on the negated scores = descending order, with ties kept in
    # their original row order.
    ranked = (-scores).argsort(kind='stable')

    # Walk the ranking, excluding the customer itself, and stop as soon as
    # enough matches have been collected.
    top_matches = []
    for i in ranked:
        if customer_ids[i] == cust_id:
            continue
        top_matches.append((customer_ids[i], round(scores[i], 4)))
        if len(top_matches) == N_LOOKALIKES:
            break

    # Pad with empty strings when fewer than three other customers exist.
    labels = [f"{match_id}({score})" for match_id, score in top_matches]
    labels += [''] * (N_LOOKALIKES - len(labels))

    results.append({
        'CustomerID': cust_id,
        'Lookalike_1': labels[0],
        'Lookalike_2': labels[1],
        'Lookalike_3': labels[2],
    })

if missing_customers:
    print(f"No feature row for: {', '.join(missing_customers)}; skipped.")


In [23]:
# Persist the lookalike table as CSV, omitting the DataFrame index column.
output_path = 'Ardhi_Dattatreya_Varma_Lookalike.csv'
lookalike_df = pd.DataFrame(results)
lookalike_df.to_csv(output_path, index=False)