In [37]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [38]:
# Load the three source tables (file names are relative to the working directory)
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [39]:
# Print the first rows of each table to check that the columns loaded as expected
for table in (customers_df, products_df, transactions_df):
    print(table.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [55]:
# Attach customer and product attributes to every transaction.
# Both joins use pandas' default inner join, so a transaction whose CustomerID
# or ProductID has no match in the lookup table is dropped.
merged_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)

In [56]:
# Feature engineering: one row per customer with the aggregates used for similarity.
# Note that the 'ProductID' column of the result holds a count, not an identifier.
aggregations = {
    'TotalValue': 'sum',       # summed TotalValue over the customer's transactions
    'Quantity': 'sum',         # summed Quantity over the customer's transactions
    'ProductID': 'nunique',    # number of unique products bought
}
customer_features = merged_df.groupby('CustomerID').agg(aggregations).reset_index()

In [42]:
# Merge additional customer demographic data.
# Any existing 'Region' column is dropped first so that re-running this cell
# does not merge it a second time and produce 'Region_x' / 'Region_y'.
customer_features = (
    customer_features
    .drop(columns='Region', errors='ignore')
    .merge(customers_df[['CustomerID', 'Region']], on='CustomerID')
)

In [43]:
# Print the first five rows of the per-customer feature table
print(customer_features.head(n=5))

  CustomerID  TotalValue  Quantity  ProductID         Region
0      C0001     3354.52        12          5  South America
1      C0002     1862.74        10          4           Asia
2      C0003     2725.38        14          4  South America
3      C0004     5354.88        23          8  South America
4      C0005     2034.24         7          3           Asia


In [44]:
# Standardise the three numeric features so that none dominates the similarity
# purely because of its scale. The scaled values overwrite the raw ones in
# customer_features.
feature_columns = ['TotalValue', 'Quantity', 'ProductID']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[feature_columns])
customer_features[feature_columns] = scaled_values

In [45]:
# Print the first five rows again, now holding the standardised values
print(customer_features.head(n=5))

  CustomerID  TotalValue  Quantity  ProductID         Region
0      C0001   -0.061701 -0.122033   0.050047  South America
1      C0002   -0.877744 -0.448000  -0.424204           Asia
2      C0003   -0.405857  0.203934  -0.424204  South America
3      C0004    1.032547  1.670787   1.472798  South America
4      C0005   -0.783929 -0.936951  -0.898455           Asia


In [46]:
# Pairwise cosine similarity over the scaled numeric features: entry [i, j]
# compares row i with row j of customer_features ('Region' is not included).
similarity_input = customer_features[['TotalValue', 'Quantity', 'ProductID']]
cosine_sim = cosine_similarity(similarity_input)

In [47]:
# Print the top-left 5x5 corner of the similarity matrix
top_left_corner = cosine_sim[:5, :5]
print(top_left_corner)

[[ 1.          0.5607095  -0.23287112 -0.54238771  0.53321413]
 [ 0.5607095   1.          0.66710809 -0.86548378  0.91517034]
 [-0.23287112  0.66710809  1.         -0.46082499  0.53923518]
 [-0.54238771 -0.86548378 -0.46082499  1.         -0.99336347]
 [ 0.53321413  0.91517034  0.53923518 -0.99336347  1.        ]]


In [48]:
# Function to get the top-N lookalikes (default 3) for a given customer
def get_top_lookalikes(customer_id, top_n=3, features=None, similarity=None):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in the ``'CustomerID'`` column of ``features``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 give an empty list.
    features : pandas.DataFrame, optional
        Table with a ``'CustomerID'`` column. Defaults to the notebook-level
        ``customer_features``.
    similarity : 2-D array-like, optional
        Similarity matrix whose row/column ``i`` corresponds to the ``i``-th
        row (by position) of ``features``. Defaults to the notebook-level
        ``cosine_sim``.

    Returns
    -------
    list of (CustomerID, score) tuples
        Sorted by descending score; ties keep the row order of ``features``.
        The queried customer itself is never included.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``features['CustomerID']``.
    """
    if features is None:
        features = customer_features
    if similarity is None:
        similarity = cosine_sim

    ids = features['CustomerID'].to_numpy()

    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, so this stays correct even if the DataFrame index is not 0..n-1.
    matches = np.flatnonzero((features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in the feature table")
    position = matches[0]

    scores = np.asarray(similarity[position])

    # Stable descending sort so tied scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the customer by position instead of dropping the first ranked
    # entry: another customer with a tied top score could be ranked ahead of
    # the customer itself, in which case dropping entry 0 would discard the
    # wrong row and return the customer as its own lookalike.
    ranked = ranked[ranked != position][:max(top_n, 0)]

    return [(ids[i], scores[i]) for i in ranked]

In [49]:
# Collect lookalikes for the first 20 rows of customer_features
# (these are C0001 - C0020 provided the table is ordered by CustomerID and
# each of those customers has a row in it)
lookalike_data = [
    {'CustomerID': customer_id, 'LookalikeID': match_id, 'SimilarityScore': score}
    for customer_id in customer_features['CustomerID'][:20]
    for match_id, score in get_top_lookalikes(customer_id)
]

In [50]:
# Turn the list of result records into a table, one row per (customer, lookalike) pair
lookalike_df = pd.DataFrame(data=lookalike_data)

In [51]:
# Print the first five lookalike recommendations
print(lookalike_df.head(n=5))

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0164         0.968410
1      C0001       C0137         0.962081
2      C0001       C0069         0.955071
3      C0002       C0029         0.999762
4      C0002       C0031         0.999013


In [52]:
# Write the lookalike recommendations to disk, leaving out the DataFrame index
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [53]:
# Preview the saved file by reading it back from disk, so the cell shows what
# was actually written rather than the in-memory DataFrame
pd.read_csv('Lookalike.csv').head()

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0164,0.96841
1,C0001,C0137,0.962081
2,C0001,C0069,0.955071
3,C0002,C0029,0.999762
4,C0002,C0031,0.999013
