In [1]:
import pandas as pd
import numpy as np

In [18]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [19]:
merged_data = transactions.merge(customers, on='CustomerID', how='left')
merged_data = merged_data.merge(products, on='ProductID', how='left')

In [20]:
merged_data.head(3)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [21]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['YearsSinceSignup'] = (pd.to_datetime('today') - customers['SignupDate']).dt.days / 365

In [22]:
customer_transactions = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': lambda x: list(set(x)),
    'Category': lambda x: list(set(x)),
}).reset_index()

In [23]:
region_dummies = pd.get_dummies(customers['Region'], prefix='Region')
customer_profiles = pd.concat([customers[['CustomerID', 'YearsSinceSignup']], region_dummies], axis=1)

In [24]:
final_data = customer_transactions.merge(customer_profiles, on='CustomerID', how='left')

In [25]:
final_data.head(3)

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,Category,YearsSinceSignup,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,12,"[P054, P083, P096, P029, P022]","[Electronics, Books, Home Decor]",2.553425,False,False,False,True
1,C0002,1862.74,10,"[P095, P071, P004, P019]","[Clothing, Home Decor]",2.956164,True,False,False,False
2,C0003,2725.38,14,"[P006, P002, P035, P025]","[Electronics, Clothing, Home Decor]",0.893151,False,False,False,True


In [11]:
from sklearn.preprocessing import StandardScaler

In [26]:
scaler = StandardScaler()
final_data[['TotalValue', 'Quantity', 'YearsSinceSignup']] = scaler.fit_transform(final_data[['TotalValue', 'Quantity', 'YearsSinceSignup']])

In [27]:
final_data.head(3)

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,Category,YearsSinceSignup,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,-0.061701,-0.122033,"[P054, P083, P096, P029, P022]","[Electronics, Books, Home Decor]",1.148752,False,False,False,True
1,C0002,-0.877744,-0.448,"[P095, P071, P004, P019]","[Clothing, Home Decor]",1.600431,True,False,False,False
2,C0003,-0.405857,0.203934,"[P006, P002, P035, P025]","[Electronics, Clothing, Home Decor]",-0.71327,False,False,False,True


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
def calculate_similarity(customer_data):
    transaction_features = customer_data[['TotalValue', 'Quantity', 'YearsSinceSignup']]
    
    region_features = customer_data.drop(columns=['CustomerID', 'TotalValue', 'Quantity', 'YearsSinceSignup'])

    for column in region_features.columns:
        if region_features[column].apply(lambda x: isinstance(x, list)).any():
            region_features[column] = region_features[column].apply(lambda x: str(x) if isinstance(x, list) else x)

    region_features = pd.get_dummies(region_features, drop_first=True)

    region_similarity = cosine_similarity(region_features)

    transaction_similarity = cosine_similarity(transaction_features)

    final_similarity = 0.7 * region_similarity + 0.3 * transaction_similarity

    return final_similarity


In [29]:
lookalike_results = {}

In [32]:
for i in range(len(final_data)):
    customer_id = final_data.iloc[i]['CustomerID']
    
    temp_data = final_data.drop(i)
    
    similarity_scores = calculate_similarity(pd.concat([final_data.iloc[i:i+1], temp_data], axis=0))
    
    top_similarities = np.argsort(similarity_scores[0])[-4:-1]  
    top_customers = final_data.iloc[top_similarities]['CustomerID'].values
    top_scores = similarity_scores[0][top_similarities]
   
    lookalike_results[customer_id] = list(zip(top_customers, top_scores))

In [33]:
lookalike_df = pd.DataFrame(lookalike_results.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)