In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
# Read the three input tables (customers, products, transactions) from CSV
# files located in the current working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Enrich every transaction row in two steps: first attach the customer columns
# (joined on 'CustomerID'), then attach the product columns (joined on
# 'ProductID'). Both are left joins, so every transaction row is kept even
# when no matching customer or product row exists.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

In [4]:
# Build one feature row per customer from the merged transaction table.
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',      # Total spend across the customer's transactions
    'TransactionID': 'count', # Number of transactions (column keeps the name 'TransactionID')
    'Price_y': 'mean',        # Mean of 'Price_y' over the customer's transaction rows
                              # (presumably the product-table price, given pandas'
                              # default '_y' merge suffix - confirm)
    'Region': 'first'         # First Region value found for the customer
}).reset_index()

# Give the averaged price column a readable name. Reassigning instead of using
# inplace=True keeps the step chain-friendly and avoids in-place mutation.
customer_features = customer_features.rename(columns={'Price_y': 'AvgPrice'})

# Standardize the numeric features (zero mean, unit variance) so that no single
# feature dominates the similarity computation purely because of its scale.
numeric_cols = ['TotalValue', 'TransactionID', 'AvgPrice']
scaler = StandardScaler()
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

# One-hot encode Region. dtype=int is pinned explicitly: pandas >= 2.0 defaults
# to bool indicator columns, which would turn the 0/1 values into True/False
# and leave the feature frame with mixed float/bool dtypes.
# NOTE(review): drop_first=True gives customers in the dropped (baseline) region
# all-zero region indicators, so region contributes nothing to their cosine
# similarity - consider drop_first=False; kept as-is to preserve existing results.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region'],
    drop_first=True,
    dtype=int,
)

# Preview the customer feature dataframe
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,TransactionID,AvgPrice,Region_Europe,Region_North America,Region_South America
0,C0001,-0.061701,-0.011458,0.09467,0,0,1
1,C0002,-0.877744,-0.467494,-0.904016,0,0,0
2,C0003,-0.405857,-0.467494,-1.094109,0,0,1
3,C0004,1.032547,1.35665,-0.447702,0,0,1
4,C0005,-0.783929,-0.92353,0.285581,0,0,0


In [5]:
# Keep the customer identifiers aside; they label the similarity table below.
customer_ids = customer_features['CustomerID']

# Feature matrix: every column of customer_features except the identifier
features = customer_features.drop(columns=['CustomerID'])

# Pairwise cosine similarity between all customer feature rows
similarity_matrix = cosine_similarity(features)

# Wrap the matrix in a dataframe whose rows and columns are both labelled by
# CustomerID, so a score can be looked up as similarity_df[id_a][id_b].
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Preview the first rows of the similarity table
similarity_df.head()

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.019273,0.573288,0.430648,0.068648,0.720575,0.095237,-0.041681,0.066837,-0.042914,...,0.980336,0.78455,0.017605,-0.041312,0.726049,0.060314,0.000942,0.025553,0.018942,-0.012231
C0002,-0.019273,1.0,0.7244,-0.417015,0.515139,-0.472891,-0.077595,-0.264481,0.489901,0.861594,...,0.058391,0.384587,0.746666,-0.148183,0.009793,-0.712494,0.755758,0.755439,0.639816,-0.880862
C0003,0.573288,0.7244,1.0,0.134097,0.218808,0.042883,-0.244628,-0.144542,0.179759,0.68547,...,0.553534,0.597265,0.514992,-0.047422,0.6246,-0.597605,0.533077,0.474852,0.390723,-0.527962
C0004,0.430648,-0.417015,0.134097,1.0,-0.868342,0.137133,-0.725902,0.724172,-0.711974,-0.188871,...,0.304759,-0.186921,-0.762778,0.563717,0.847531,-0.140618,-0.524706,-0.680797,-0.470131,0.345768
C0005,0.068648,0.515139,0.218808,-0.868342,1.0,0.193939,0.81392,-0.81032,0.832071,0.24882,...,0.198899,0.644326,0.878887,-0.630979,-0.516037,0.121455,0.61914,0.798033,0.562187,-0.448341


In [6]:
# Customers to produce lookalikes for: the first 20 rows of customer_features
target_ids = customer_features['CustomerID'][:20]

# Map each target customer to its 3 most similar other customers as
# (other_customer_id, similarity_score) pairs, highest score first.
recommendations = {}
for cust_id in target_ids:
    # Drop the customer's own entry so it is never recommended to itself
    ranked = similarity_df[cust_id].drop(cust_id).sort_values(ascending=False)
    top_matches = ranked.head(3)
    recommendations[cust_id] = list(zip(top_matches.index, top_matches.values))

# Flatten the mapping into one [customer, lookalike, score] row per pair
recommendation_data = [
    [source_id, lookalike_id, similarity]
    for source_id, matches in recommendations.items()
    for lookalike_id, similarity in matches
]

# Tabulate the flattened rows
recommendations_df = pd.DataFrame(
    recommendation_data,
    columns=['CustomerID', 'Recommended_CustomerID', 'Similarity_Score'],
)

# Persist the lookalike table without the positional index column
recommendations_df.to_csv('Lookalike.csv', index=False)

# Show the recommendation rows that belong to the first 20 customers
is_target_row = recommendations_df['CustomerID'].isin(target_ids)
print(recommendations_df[is_target_row])

   CustomerID Recommended_CustomerID  Similarity_Score
0       C0001                  C0137          0.989662
1       C0001                  C0152          0.986726
2       C0001                  C0191          0.980336
3       C0002                  C0043          0.990620
4       C0002                  C0027          0.977795
5       C0002                  C0142          0.975448
6       C0003                  C0025          0.955482
7       C0003                  C0112          0.945261
8       C0003                  C0190          0.940840
9       C0004                  C0113          0.979107
10      C0004                  C0108          0.978967
11      C0004                  C0165          0.976775
12      C0005                  C0128          0.993710
13      C0005                  C0123          0.989538
14      C0005                  C0080          0.985969
15      C0006                  C0158          0.937285
16      C0006                  C0168          0.934104
17      C0