In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler


# Load the three raw tables. The directory is named once so that pointing the
# notebook at a different export only requires changing this constant.
DATA_DIR = "D:/Users/Vipra Nagaich/Downloads"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

# Use one encoder per column. Refitting a single LabelEncoder on `Category`
# would overwrite the classes_ learned for `Region`, leaving Region_Encoded
# impossible to map back to region names.
region_encoder = LabelEncoder()
category_encoder = LabelEncoder()

customers['Region_Encoded'] = region_encoder.fit_transform(customers['Region'])
products['Category_Encoded'] = category_encoder.fit_transform(products['Category'])

# Backward-compatible alias: the original single `label_encoder` ended the
# cell fitted on `Category`, so keep that name pointing at the same state.
label_encoder = category_encoder

# Attach customer and product attributes to every transaction (pd.merge
# defaults to an inner join). Both `transactions` and `products` carry a
# `Price` column, which is why the result has `Price_x` and `Price_y`.
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

# Parse the date columns so they are datetimes rather than strings.
merged_data['SignupDate'] = pd.to_datetime(merged_data['SignupDate'])
merged_data['TransactionDate'] = pd.to_datetime(merged_data['TransactionDate'])

merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,Region_Encoded,ProductName,Category,Price_y,Category_Encoded
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,1,ComfortLiving Bluetooth Speaker,Electronics,300.68,2
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,0,ComfortLiving Bluetooth Speaker,Electronics,300.68,2
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,1,ComfortLiving Bluetooth Speaker,Electronics,300.68,2
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,3,ComfortLiving Bluetooth Speaker,Electronics,300.68,2
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,1,ComfortLiving Bluetooth Speaker,Electronics,300.68,2


In [None]:
# Features 1-3 are all aggregates over the same per-customer grouping.
by_customer = merged_data.groupby('CustomerID')

# Feature 1: total amount spent by each customer.
total_spending = by_customer['TotalValue'].sum().rename('TotalSpending').reset_index()

# Feature 2: number of transactions recorded for each customer.
num_transactions = by_customer['TransactionID'].count().rename('NumTransactions').reset_index()

# Feature 3: mean value of a customer's transactions.
avg_spending = by_customer['TotalValue'].mean().rename('AvgSpending').reset_index()

# Feature 4: the encoded category each customer bought from most often.
# First count transactions per (customer, category) pair ...
category_counts = (
    merged_data
    .groupby(['CustomerID', 'Category_Encoded'])
    .size()
    .reset_index(name='Count')
)
# ... then keep, for each customer, the row with the largest count. idxmax
# returns the first maximum, so a tie goes to the lowest category code.
top_rows = category_counts.groupby('CustomerID')['Count'].idxmax()
most_purchased_category = (
    category_counts
    .loc[top_rows, ['CustomerID', 'Category_Encoded']]
    .rename(columns={'Category_Encoded': 'MostPurchasedCategory'})
)

# Combine the four features into one row per customer, keyed by CustomerID.
customer_features = (
    total_spending
    .merge(num_transactions, on='CustomerID')
    .merge(avg_spending, on='CustomerID')
    .merge(most_purchased_category, on='CustomerID')
)

customer_features.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


# Every column after the first (the CustomerID key) is used as a feature.
# NOTE(review): this includes MostPurchasedCategory, which is a LabelEncoder
# code for a nominal category; scaling it as a number implies an ordering
# between categories - confirm this is intended or one-hot encode it instead.
feature_columns = customer_features.columns[1:]
customer_ids = customer_features['CustomerID']

# Standardise the features so no single column dominates the similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_columns])

# Pairwise cosine similarity between all customers (row i vs. row j).
similarity_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())

In [4]:
# Build {customer_id: [(lookalike_id, score), ...]} for the first customers in
# `customer_features`, listing each customer's most similar peers best-first.
N_TARGET_CUSTOMERS = 20  # how many customers (from the top of the table) get lookalikes
TOP_K = 3                # how many lookalikes to keep per customer

# Row i of `similarity_matrix` corresponds to row i of `customer_features`,
# so a positional array of IDs is enough to translate indices back to IDs.
customer_ids = customer_features['CustomerID'].to_numpy()

lookalike_map = {}
for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    similarity_scores = similarity_matrix[i]

    # Rank all customers from most to least similar. A stable sort keeps the
    # result deterministic when several customers share the same score.
    ranked_indices = np.argsort(-similarity_scores, kind='stable')

    # Drop the customer's own row explicitly. Relying on "self is always the
    # last element of an ascending sort" breaks when another customer ties at
    # the maximum or floating-point rounding reorders the top entries.
    top_indices = ranked_indices[ranked_indices != i][:TOP_K]

    # Store plain Python floats so the list's text form (what ends up in a
    # CSV cell) is a bare number rather than a NumPy scalar repr.
    lookalike_map[customer_id] = [
        (customer_ids[idx], float(similarity_scores[idx])) for idx in top_indices
    ]

In [5]:
# One row per customer: the ID and that customer's list of (lookalike, score)
# pairs, which is written to the CSV as the list's text representation.
lookalike_rows = [(customer_id, matches) for customer_id, matches in lookalike_map.items()]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

# Persist the result without the positional index column.
lookalike_df.to_csv('Lookalike.csv', index=False)