In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler



In [2]:
# Load the customers and transaction data
customers = pd.read_csv('Customers.csv')  
transactions = pd.read_csv('Transactions.csv')  

In [3]:
# Preprocess data: Merge transaction and customer data
data = pd.merge(transactions, customers, on='CustomerID')
data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.70,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04
...,...,...,...,...,...,...,...,...,...,...
995,T00774,C0095,P056,2024-01-07 14:19:49,2,32.16,16.08,William Walker,South America,2023-03-04
996,T00823,C0095,P079,2024-09-30 10:45:06,3,1252.11,417.37,William Walker,South America,2023-03-04
997,T00369,C0151,P082,2024-12-24 11:40:24,4,223.96,55.99,Amber Gonzalez,South America,2024-11-22
998,T00809,C0078,P075,2024-12-09 11:44:44,2,995.52,497.76,Julia Palmer,Asia,2024-11-13


In [4]:
# Feature Engineering: Create customer profile
customer_profile = data.groupby('CustomerID').agg({'ProductID': lambda x: list(x),  # List of products purchased
                                                   'Quantity': 'sum',  # Total quantity purchased
                                                   'TotalValue': 'sum',  # Total spending
                                                   'TransactionDate': ['min', 'max'],  # First and last transaction date
                                                   }).reset_index()


customer_profile.columns = ['_'.join(col).strip() for col in customer_profile.columns.values] #Rename columns 

customer_profile = customer_profile.rename(columns={'ProductID_': 'Products_Purchased',
                                                    'Quantity_sum': 'Total_Quantity',
                                                    'TotalValue_sum': 'Total_Spend',
                                                    'TransactionDate_min': 'First_Transaction',
                                                    'TransactionDate_max': 'Last_Transaction'
                                                    })
customer_profile

Unnamed: 0,CustomerID_,ProductID_<lambda>,Total_Quantity,Total_Spend,First_Transaction,Last_Transaction
0,C0001,"[P054, P022, P096, P083, P029]",12,3354.52,2024-01-19 03:12:55,2024-11-02 17:04:16
1,C0002,"[P095, P004, P019, P071]",10,1862.74,2024-02-28 07:44:21,2024-12-03 01:41:41
2,C0003,"[P025, P006, P035, P002]",14,2725.38,2024-02-18 02:50:37,2024-08-24 18:54:04
3,C0004,"[P049, P053, P038, P025, P097, P024, P008, P077]",23,5354.88,2024-02-28 10:16:35,2024-12-23 14:13:52
4,C0005,"[P025, P039, P012]",7,2034.24,2024-03-15 04:08:59,2024-11-04 00:30:22
...,...,...,...,...,...,...
194,C0196,"[P018, P020, P079, P079]",12,4982.88,2024-08-06 14:37:15,2024-12-15 03:43:35
195,C0197,"[P084, P013, P027]",9,1928.65,2024-01-13 04:52:09,2024-12-27 18:20:31
196,C0198,"[P073, P064]",3,931.83,2024-09-29 16:14:59,2024-10-04 18:31:12
197,C0199,"[P067, P022, P079, P008]",9,1979.28,2024-08-17 12:06:08,2024-10-26 00:01:58


In [5]:
 
scaler = StandardScaler()# Normalize numerical data
customer_profile[['Total_Quantity', 'Total_Spend']] = scaler.fit_transform(customer_profile[['Total_Quantity', 'Total_Spend']])

# Create a customer-product matrix for similarity computation
product_matrix = data.groupby(['CustomerID', 'ProductID'])['Quantity'].sum().unstack(fill_value=0)

# Calculate cosine similarity between customers based on transaction and product features
cos_sim = cosine_similarity(product_matrix)

In [6]:
# Get the index of customer C0001
cust_id = 'C0001'
cust_idx = customer_profile[customer_profile['CustomerID_'] == cust_id].index[0]

# Get the similarity scores for customer C0001 with all other customers
similarity_scores = cos_sim[cust_idx]

# Sort the scores in descending order and pick the top 3 similar customers 
similar_customers_idx = similarity_scores.argsort()[-4:-1]  

# Store the top 3 recommendations and their similarity scores
recommendations = [(customer_profile.iloc[idx]['CustomerID_'], similarity_scores[idx]) for idx in similar_customers_idx]

# Display the recommendations for customer C0001
for rec in recommendations:
    print(f"Recommended Customer: {rec[0]}, Similarity Score: {rec[1]}")


Recommended Customer: C0199, Similarity Score: 0.4381780460041329
Recommended Customer: C0194, Similarity Score: 0.469668218313862
Recommended Customer: C0097, Similarity Score: 0.5477225575051661


In [7]:

lookalike_data = []

# Loop through the first 20 customers
for cust_id in ['C0001', 'C0002', 'C0003', 'C0004', 'C0005', 'C0006', 'C0007', 'C0008', 'C0009', 'C0010',
                'C0011', 'C0012', 'C0013', 'C0014', 'C0015', 'C0016', 'C0017', 'C0018', 'C0019', 'C0020']:

    
    cust_idx = customer_profile[customer_profile['CustomerID_'] == cust_id].index[0]
    similarity_scores = cos_sim[cust_idx]
    similar_customers_idx = similarity_scores.argsort()[-4:-1]  # Exclude the current customer itself
    recommendations = [(customer_profile.iloc[idx]['CustomerID_'], round(similarity_scores[idx], 2)) for idx in similar_customers_idx]

    # Create a list of lookalike customer IDs and similarity scores
    lookalike_customers = [rec[0] for rec in recommendations]
    similarity_scores_list = [str(rec[1]) for rec in recommendations]
    
    lookalike_data.append({'CustomerID': cust_id,
                           'Lookalike_Customers': ', '.join(lookalike_customers),
                           'Similarity_Scores': ', '.join(similarity_scores_list)
                           })


lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.head(20)  


Unnamed: 0,CustomerID,Lookalike_Customers,Similarity_Scores
0,C0001,"C0199, C0194, C0097","0.44, 0.47, 0.55"
1,C0002,"C0071, C0091, C0030","0.33, 0.33, 0.37"
2,C0003,"C0144, C0181, C0134","0.4, 0.52, 0.52"
3,C0004,"C0063, C0132, C0070","0.34, 0.44, 0.5"
4,C0005,"C0064, C0055, C0096","0.33, 0.51, 0.65"
5,C0006,"C0178, C0040, C0058","0.37, 0.63, 0.65"
6,C0007,"C0026, C0079, C0020","0.36, 0.5, 0.59"
7,C0008,"C0003, C0088, C0144","0.31, 0.34, 0.39"
8,C0009,"C0062, C0162, C0140","0.5, 0.51, 0.56"
9,C0010,"C0094, C0077, C0033","0.41, 0.42, 0.49"


In [8]:
lookalike_df.to_csv("Vishnupriya_Vijayan_Lookalike.csv", index=False)