In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df_customers = pd.read_csv('/Customers.csv')
df_products = pd.read_csv('/Products (1).csv')
df_transactions = pd.read_csv('/Transactions.csv')

In [4]:
df_transactions = pd.merge(df_transactions, df_customers[['CustomerID', 'Region']], on='CustomerID', how='left')

# Merge transaction data with product data
df_transactions = pd.merge(df_transactions, df_products[['ProductID', 'Category']], on='ProductID', how='left')


In [5]:
customer_profile = df_transactions.groupby(['CustomerID', 'Region']).agg(
    total_spend=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

In [6]:
scaler = StandardScaler()
customer_profile[['total_spend', 'total_transactions', 'unique_products']] = scaler.fit_transform(
    customer_profile[['total_spend', 'total_transactions', 'unique_products']])

In [7]:
# Compute customer similarity based on profile features (numerical)
profile_features = customer_profile[['total_spend', 'total_transactions', 'unique_products']].values

# Calculate cosine similarity matrix for customer profiles
customer_similarity = cosine_similarity(profile_features)

# Display the similarity matrix (optional)
print(customer_similarity)


[[ 1.          0.44114968  0.15220261 ...  0.17039607  0.41545374
  -0.72029863]
 [ 0.44114968  1.          0.93017042 ...  0.94614583  0.99934692
  -0.8304382 ]
 [ 0.15220261  0.93017042  1.         ...  0.99813279  0.94282908
  -0.56923855]
 ...
 [ 0.17039607  0.94614583  0.99813279 ...  1.          0.95714768
  -0.61009195]
 [ 0.41545374  0.99934692  0.94282908 ...  0.95714768  1.
  -0.80989468]
 [-0.72029863 -0.8304382  -0.56923855 ... -0.61009195 -0.80989468
   1.        ]]


In [8]:
# Create a transaction matrix for customers by product category
transaction_matrix = df_transactions.pivot_table(index='CustomerID', columns='Category', values='Quantity', aggfunc='sum', fill_value=0)

# Normalize the transaction matrix (optional)
transaction_matrix_normalized = StandardScaler().fit_transform(transaction_matrix)

# Calculate cosine similarity based on transaction history
transaction_similarity = cosine_similarity(transaction_matrix_normalized)

# Display the transaction similarity matrix (optional)
print(transaction_similarity)


[[ 1.         -0.38916864  0.19281314 ...  0.00460424  0.46173566
  -0.89589775]
 [-0.38916864  1.          0.70287821 ...  0.29311796  0.57766211
   0.43262864]
 [ 0.19281314  0.70287821  1.         ... -0.00336114  0.66655233
   0.11836717]
 ...
 [ 0.00460424  0.29311796 -0.00336114 ...  1.          0.21086661
  -0.15370134]
 [ 0.46173566  0.57766211  0.66655233 ...  0.21086661  1.
  -0.45632394]
 [-0.89589775  0.43262864  0.11836717 ... -0.15370134 -0.45632394
   1.        ]]


In [9]:
# Weighting profile similarity and transaction similarity
profile_weight = 0.6
transaction_weight = 0.4

# Combined similarity
combined_similarity = profile_weight * customer_similarity + transaction_weight * transaction_similarity

# Display the combined similarity matrix
print(combined_similarity)


[[ 1.          0.10902235  0.16844682 ...  0.10407934  0.43396651
  -0.79053828]
 [ 0.10902235  1.          0.83925354 ...  0.68493468  0.83067299
  -0.32521147]
 [ 0.16844682  0.83925354  1.         ...  0.59753522  0.83231838
  -0.29419626]
 ...
 [ 0.10407934  0.68493468  0.59753522 ...  1.          0.65863525
  -0.42753571]
 [ 0.43396651  0.83067299  0.83231838 ...  0.65863525  1.
  -0.66846639]
 [-0.79053828 -0.32521147 -0.29419626 ... -0.42753571 -0.66846639
   1.        ]]


In [13]:
# For a given customer (e.g., CustomerID 123), find the 3 most similar customers
customer_id = 'C0001' # Changed this line to enclose C0001 in quotes

# Check if the customer exists in the DataFrame
if customer_id in customer_profile['CustomerID'].values:
    # Get the index of the given customer in the similarity matrix
    customer_idx = customer_profile[customer_profile['CustomerID'] == customer_id].index[0]

    # Get the similarity scores for the given customer
    similarities = combined_similarity[customer_idx]

    # Get the indices of the 3 most similar customers
    top_3_similar_customers = similarities.argsort()[-4:-1]  # Exclude the customer itself

    # Display the top 3 similar customers
    recommended_customers = customer_profile.iloc[top_3_similar_customers]
    print(recommended_customers[['CustomerID', 'Region', 'total_spend']])
else:
    print(f"Customer with ID {customer_id} not found in the dataset.")

    CustomerID         Region  total_spend
54       C0055  North America    -0.333486
156      C0157  North America    -0.870435
68       C0069         Europe    -0.321993


In [15]:
# Assuming combined similarity matrix
final_similarity = pd.DataFrame({
    'CustomerID': ['C0002','C0022','C0033','C0044'], # Enclose customer IDs in quotes to treat them as strings
    'SimilarityScore': [0.85, 0.78, 0.92, 0.65]
})

# Sort customers by similarity score (descending)
top_3_similar_customers = final_similarity.sort_values(by='SimilarityScore', ascending=False).head(3)
print(top_3_similar_customers)

  CustomerID  SimilarityScore
2      C0033             0.92
0      C0002             0.85
1      C0022             0.78
