In [2]:
# Importing required libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
# Loading the datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Merging transactions and products data
transactions_products = pd.merge(transactions_df, products_df, on='ProductID')
# Creating customer-level features
customer_features = transactions_products.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    total_transactions=('TransactionID', 'count')
).reset_index()

# Adding preferred category (highest spend per customer)
preferred_category = transactions_products.groupby(['CustomerID', 'Category'])['TotalValue'].sum().reset_index()
preferred_category = preferred_category.loc[preferred_category.groupby('CustomerID')['TotalValue'].idxmax()]
customer_features = customer_features.merge(preferred_category[['CustomerID', 'Category']], on='CustomerID')

# One-hot encoding preferred category
customer_features_encoded = pd.get_dummies(customer_features, columns=['Category'])
# Normalizing features for similarity calculation
scaler = MinMaxScaler()
customer_features_encoded.iloc[:, 1:] = scaler.fit_transform(customer_features_encoded.iloc[:, 1:])

# Calculating cosine similarity
customer_similarity = cosine_similarity(customer_features_encoded.iloc[:, 1:])
# Finding top 3 lookalike customers for the first 20 customers
lookalike_results = {}
for idx in range(20):  # First 20 customers (C0001 to C0020)
    customer_id = customer_features_encoded.iloc[idx, 0]
    similarity_scores = list(enumerate(customer_similarity[idx]))
    top_3_similar = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]
    lookalike_results[customer_id] = [(customer_features_encoded.iloc[i[0], 0], i[1]) for i in top_3_similar]

# Displaying lookalike results
for customer, lookalikes in lookalike_results.items():
    print(f'Customer {customer}: {lookalikes}')
    # Exporting the lookalike results to a CSV file
lookalike_df = pd.DataFrame([
    {'CustomerID': customer, 'Lookalikes': str(lookalikes)}
    for customer, lookalikes in lookalike_results.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)
print('Lookalike results saved to Lookalike.csv')

Customer C0001: [('C0072', 0.9987768952588382), ('C0190', 0.9986570128515178), ('C0183', 0.997888611572081)]
Customer C0002: [('C0029', 0.9998906529958622), ('C0010', 0.9996066053826445), ('C0142', 0.9975354915923277)]
Customer C0003: [('C0125', 0.9991683543016404), ('C0001', 0.9958032792344125), ('C0005', 0.995146186471467)]
Customer C0004: [('C0173', 0.9992741842297721), ('C0012', 0.9965426263119812), ('C0124', 0.9963518227026218)]
Customer C0005: [('C0112', 0.9998592708958555), ('C0186', 0.9980459599013688), ('C0120', 0.9957493033121506)]
Customer C0006: [('C0171', 0.9957133679439155), ('C0115', 0.9949286282199495), ('C0074', 0.9946286419553937)]
Customer C0007: [('C0085', 0.9999935078160307), ('C0120', 0.9997639568358302), ('C0050', 0.9959923254926464)]
Customer C0008: [('C0109', 0.9967738566864514), ('C0098', 0.9919961120623849), ('C0093', 0.9911523692955455)]
Customer C0009: [('C0077', 0.9998716816894867), ('C0083', 0.997604642575368), ('C0010', 0.9893498885438232)]
Customer C001

 0.5 0.  0.7 0.5 0.5 0.6 0.3 0.3 0.3 0.7 0.3 0.4 0.3 0.2 0.  0.5 0.3 0.2
 0.4 0.4 0.5 0.2 0.6 0.2 0.2 0.2 0.6 0.6 0.6 0.4 0.7 0.2 0.6 0.3 0.5 0.7
 0.4 0.4 0.5 0.  0.7 0.  0.2 0.2 0.1 0.4 0.9 0.2 0.5 0.7 0.4 0.3 0.2 0.4
 0.2 0.2 0.8 0.4 0.2 0.  0.3 0.  0.5 0.6 0.2 0.8 0.2 0.4 0.6 0.4 0.1 0.6
 0.5 0.3 0.8 0.3 0.1 0.4 0.  0.7 0.7 0.5 0.7 0.7 0.4 0.7 0.5 0.4 0.4 0.6
 1.  0.  0.4 0.2 0.7 0.4 0.2 0.5 0.3 0.5 0.4 0.2 0.3 0.6 0.1 0.6 0.3 0.5
 0.5 0.1 0.2 0.  0.2 0.2 0.3 0.4 0.3 0.5 0.4 0.4 0.7 0.1 0.9 0.3 0.6 0.1
 0.7 0.3 0.8 0.4 0.4 0.  0.  0.4 0.5 0.5 0.6 1.  0.4 0.3 0.2 0.4 0.7 0.7
 0.5 0.4 0.8 0.3 0.2 0.3 0.5 0.4 0.4 0.5 0.7 0.4 0.9 0.2 0.3 0.3 0.5 0.5
 0.4 0.4 0.6 0.2 0.2 0.4 0.7 0.4 0.4 0.4 0.3 0.3 0.6 0.5 0.3 0.2 0.1 0.3
 0.4]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  customer_features_encoded.iloc[:, 1:] = scaler.fit_transform(customer_features_encoded.iloc[:, 1:])
 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
 