In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables. The paths are absolute and machine-specific, so
# they must be adjusted when running this anywhere else.
transactions = pd.read_csv(r"C:\Users\ajayc\Downloads\Transactions.csv")
customers = pd.read_csv(r"C:\Users\ajayc\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\ajayc\Downloads\Products.csv")

# Build one wide table with a row per transaction: first attach the customer
# columns, then the product columns. `merge` uses its default join type, so
# transactions without a matching customer or product are dropped.
transactions_with_customers = transactions.merge(customers, on='CustomerID')
data = transactions_with_customers.merge(products, on='ProductID')

# Feature engineering: collapse the merged transaction rows into one row per
# customer with spend, activity and category-preference features.
def _most_frequent(values):
    """Return the most common non-null entry of *values*, or 'Unknown'.

    `Series.mode()` ignores missing values, so it comes back empty when every
    entry is missing. The emptiness of the mode result (not of the group) is
    what has to be checked before indexing into it.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else 'Unknown'


customer_features = data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_spend_per_transaction=('TotalValue', 'mean'),
    total_transactions=('TransactionID', 'nunique'),
    avg_quantity=('Quantity', 'mean'),
    most_frequent_category=('Category', _most_frequent),
).reset_index()

# One-hot encode the categorical feature. `drop_first=True` drops one
# category level, which then acts as the implicit baseline.
customer_features = pd.get_dummies(
    customer_features,
    columns=['most_frequent_category'],
    drop_first=True,
)

# Every column except the identifier is treated as a model feature.
feature_columns = [name for name in customer_features.columns if name != 'CustomerID']

# Standardize the features so that no single column dominates the similarity
# purely because of its scale.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[feature_columns])

# Keep CustomerID next to the scaled features by writing them into a copy.
customer_features_scaled = customer_features.copy()
customer_features_scaled[feature_columns] = scaled_values

# Pairwise cosine similarity between customers; row i / column j compares the
# i-th and j-th rows of customer_features_scaled.
similarity_matrix = cosine_similarity(customer_features_scaled[feature_columns])

# Get the top 3 lookalikes for the first 20 customers (fewer if the data set
# holds fewer than 20 customers).
customer_ids = customer_features_scaled['CustomerID'].tolist()
lookalike_map = {}
for i in range(min(20, len(customer_ids))):
    customer_id = customer_ids[i]
    # Exclude the customer itself by position instead of assuming that its
    # self-similarity sorts first: a duplicate feature vector or floating-point
    # rounding can place another customer ahead of it.
    candidates = [
        (j, score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # sorted() is stable, so equally similar customers keep their row order.
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    lookalikes = [(customer_ids[j], score) for j, score in top_matches]
    lookalike_map[customer_id] = lookalikes

# Save lookalikes to a CSV file: one row per customer, with the lookalike list
# serialized as text in the form [(customer_id, score), ...].
# Scores are converted to built-in floats after rounding; otherwise NumPy >= 2
# renders each one as "np.float64(...)" inside the stringified list.
lookalike_df = pd.DataFrame([
    {
        'cust_id': cust,
        'lookalikes': str([
            (match_id, float(round(score, 3))) for match_id, score in matches
        ]),
    }
    for cust, matches in lookalike_map.items()
])
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

# Preview the first rows of the lookalike table (head() shows five by default).
lookalike_df.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0055', 0.971), ('C0048', 0.97), ('C0072', ..."
1,C0002,"[('C0029', 0.999), ('C0030', 0.966), ('C0062',..."
2,C0003,"[('C0136', 0.897), ('C0110', 0.88), ('C0160', ..."
3,C0004,"[('C0075', 0.99), ('C0165', 0.962), ('C0017', ..."
4,C0005,"[('C0186', 0.986), ('C0095', 0.971), ('C0112',..."
