In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import json

In [34]:
# Load data
# Read the three input tables, in this order, from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [43]:
# Step 1: Data Preprocessing
# Build one wide table with a row per transaction, enriched with the product
# and customer attributes.
#
# Both merges are lookups: many transactions point at one product / one
# customer. validate="many_to_one" makes pandas raise a MergeError if the
# right-hand table has duplicate keys, instead of silently duplicating
# transaction rows (which would inflate the per-customer sums computed later).
#
# 'Price' exists in both transactions and products, so the default merge
# suffixes produce 'Price_x' (from transactions) and 'Price_y' (from products).
transactions_products = pd.merge(
    transactions, products, on="ProductID", how="inner", validate="many_to_one"
)
merged_data = pd.merge(
    transactions_products, customers, on="CustomerID", how="inner", validate="many_to_one"
)

print("Merged Data is",merged_data.head())

Merged Data is   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00761      C0199      P022  2024-10-01 05:57:09         4   
2        T00626      C0199      P079  2024-08-17 12:06:08         2   
3        T00963      C0199      P008  2024-10-26 00:01:58         2   
4        T00112      C0146      P067  2024-05-27 22:23:54         1   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      550.16   137.54               HomeSense Wall Art   Home Decor   137.54   
2      834.74   417.37                   ActiveWear Rug   Home Decor   417.37   
3      293.70   146.85      BookWorld Bluetooth Speaker  Electronics   146.85   
4      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName  Region  SignupDate  
0   Andrea Jenkins  Europe  20

In [36]:
# Step 2: Feature Engineering
# Choose the price column to average: prefer 'Price_y', fall back to 'Price_x',
# and fail loudly if neither is present in the merged table.
price_column = next(
    (name for name in ("Price_y", "Price_x") if name in merged_data.columns),
    None,
)
if price_column is None:
    raise KeyError("No valid 'Price' column found in merged_data.")

# Collapse the transaction-level table to one row per customer. Named
# aggregation labels the averaged price "AveragePrice" directly.
customer_features = (
    merged_data.groupby("CustomerID")
    .agg(
        Quantity=("Quantity", "sum"),         # total units purchased
        AveragePrice=(price_column, "mean"),  # mean price over the customer's rows
        TotalValue=("TotalValue", "sum"),     # total transaction value
        Region=("Region", "first"),           # region taken from the first row
    )
    .reset_index()
)

In [37]:
# Encode categorical features (e.g., Region)
# One-hot encode Region into Region_<value> indicator columns.
# dtype=float keeps the indicators numeric (0.0 / 1.0). Recent pandas versions
# emit bool indicators by default, and mixing bool with the float feature
# columns makes DataFrame.values fall back to an object-dtype array.
customer_features = pd.get_dummies(customer_features, columns=["Region"], dtype=float)

In [38]:
# Normalize numerical features
# Columns to standardise; "AveragePrice" is the renamed price column.
numerical_cols = ["Quantity", "AveragePrice", "TotalValue"]

# Fit the scaler on these columns and write the scaled values back in place
# of the raw ones.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_cols])
customer_features[numerical_cols] = scaled_values

In [46]:
# Step 3: Compute Similarity
# Every column except the CustomerID identifier is used as a feature.
feature_matrix = customer_features.drop(columns="CustomerID").to_numpy()

# Pairwise cosine similarity between all customer rows; row order follows
# customer_features.
similarity_matrix = cosine_similarity(feature_matrix)

In [40]:
# Step 4: Find Top 3 Lookalikes for Each Customer
# Builds lookalike_map: customer ID -> list of up to three
# (other_customer_id, similarity_score) tuples, highest score first.

# Extract the IDs once as a plain list. Looking them up through .iloc inside
# the inner comprehension repeats a pandas scalar access for every pair.
customer_ids = customer_features["CustomerID"].tolist()

lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    # Row idx of the matrix holds this customer's similarity to every customer.
    similarity_scores = similarity_matrix[idx]

    # Pair each other customer with its score, skipping the customer itself
    # by position. float() converts the NumPy scalar to a built-in float so
    # the later json.dumps call does not depend on the matrix dtype.
    similar_customers = [
        (other_id, float(score))
        for other_idx, (other_id, score) in enumerate(zip(customer_ids, similarity_scores))
        if other_idx != idx
    ]

    # sorted() is stable, so equally similar customers keep their original
    # order; the slice simply returns fewer entries if fewer than three exist.
    top_3_similar = sorted(similar_customers, key=lambda x: x[1], reverse=True)[:3]

    lookalike_map[customer_id] = top_3_similar

In [41]:
# Step 5: Generate Lookalike.csv for the First 20 Customers
# Restrict the map to the first 20 customer IDs in customer_features order.
first_customer_ids = customer_features["CustomerID"].iloc[:20]
lookalike_subset = {cust_id: lookalike_map[cust_id] for cust_id in first_customer_ids}

# One output row per customer; the list of lookalikes is stored as a JSON
# string in a single column.
records = []
for cust_id, matches in lookalike_subset.items():
    records.append({"cust_id": cust_id, "lookalikes": json.dumps(matches)})

lookalike_df = pd.DataFrame(records)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created with top 3 lookalikes for the first 20 customers.")

Lookalike.csv has been created with top 3 lookalikes for the first 20 customers.
