In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import json

# Read the three raw input tables from the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Parse the signup timestamp and derive the signup year from it.
signup_dates = pd.to_datetime(customers["SignupDate"])
customers = customers.assign(
    SignupDate=signup_dates,
    SignupYear=signup_dates.dt.year,
)

# Parse the transaction timestamp, then derive the calendar month and the
# year-month period from it (new columns are appended in this order).
txn_dates = pd.to_datetime(transactions["TransactionDate"])
transactions = transactions.assign(
    TransactionDate=txn_dates,
    TransactionMonth=txn_dates.dt.month,
    YearMonth=txn_dates.dt.to_period("M"),
)

# Left-join product attributes and then customer attributes onto every
# transaction row, so each row carries its product and customer details.
transactions = (
    transactions
    .merge(products, on="ProductID", how="left")
    .merge(customers, on="CustomerID", how="left")
)

def _most_frequent(values):
    """Return the most common non-null value in *values*, or "Unknown" if there is none."""
    # Evaluate the mode once per group instead of once for the length check
    # and again for the lookup.
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else "Unknown"


# Build one row per customer from the merged transaction table:
#   total_spending     - sum of TotalValue
#   total_transactions - number of TransactionID entries
#   avg_quantity       - mean Quantity per transaction
#   favorite_category  - most frequent product Category ("Unknown" if none)
# NOTE: customers with no rows in `transactions` do not appear in the result.
customer_features = transactions.groupby("CustomerID").agg(
    total_spending=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_quantity=("Quantity", "mean"),
    favorite_category=("Category", _most_frequent),
).reset_index()

# One-hot encode the favorite category. Emitting float indicators keeps every
# feature column numeric regardless of the pandas version's default dummy
# dtype, so the frame converts to a plain float matrix downstream.
customer_features = pd.get_dummies(
    customer_features, columns=["favorite_category"], prefix="cat", dtype=float
)

# Attach the demographic columns (region and signup year) for each customer.
customer_features = customer_features.merge(
    customers[["CustomerID", "Region", "SignupYear"]], on="CustomerID", how="left"
)

# One-hot encode the Region column, again as float indicators.
customer_features = pd.get_dummies(
    customer_features, columns=["Region"], prefix="region", dtype=float
)

# Standardize the continuous features to zero mean / unit variance so that
# no single raw scale dominates the similarity computation.
scaler = StandardScaler()
numeric_features = ["total_spending", "total_transactions", "avg_quantity", "SignupYear"]
customer_features[numeric_features] = scaler.fit_transform(customer_features[numeric_features])

# Cosine similarity between every pair of customer feature vectors
# (all columns except the identifier).
feature_matrix = customer_features.drop(columns=["CustomerID"]).values
similarity_matrix = cosine_similarity(feature_matrix)

# For every customer, keep the three most similar *other* customers as
# (customer id, score rounded to 4 decimals) pairs.
customer_ids = customer_features["CustomerID"].tolist()
lookalikes = {}

for idx, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # A stable argsort on the negated scores ranks best-first and resolves
    # ties in favour of the lower row position, exactly like a stable
    # descending sort over (position, score) pairs.
    ranked = (-scores).argsort(kind="stable")
    # Drop the customer's own row, then take the top three.
    nearest = [pos for pos in ranked if pos != idx][:3]
    lookalikes[customer_id] = [
        (customer_ids[pos], round(scores[pos], 4)) for pos in nearest
    ]

# Restrict the export to the first 20 keys of the lookalike map
# (dictionary insertion order).
first_customers = list(lookalikes)[:20]

# Serialise each customer's matches as a JSON list of
# {"cust_id": ..., "score": ...} objects.
encoded_matches = []
for cust_id in first_customers:
    matches = [
        {"cust_id": other_id, "score": round(score, 4)}
        for other_id, score in lookalikes[cust_id]
    ]
    encoded_matches.append(json.dumps(matches))

lookalike_df = pd.DataFrame({
    "cust_id": first_customers,
    "lookalikes": encoded_matches,
})

# Write the result without the DataFrame index and report completion.
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created!")


Lookalike.csv has been created!
