In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Load the three raw tables
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
# Price is removed from the transaction table up front, so the merged frame
# ends up with a single Price column (the one supplied by Products.csv)
# instead of suffixed Price_x / Price_y duplicates.
transactions = pd.read_csv('Transactions.csv').drop(columns='Price')

# One row per transaction, enriched with customer and product attributes.
# Both joins use pandas' default inner join.
data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [3]:
# Preview the first rows of the merged frame to sanity-check the joins
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [4]:
# Check column dtypes and non-null counts of the merged frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   CustomerName     1000 non-null   object 
 7   Region           1000 non-null   object 
 8   SignupDate       1000 non-null   object 
 9   ProductName      1000 non-null   object 
 10  Category         1000 non-null   object 
 11  Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [5]:
# Feature Engineering
# Aggregate purchase data into one row per customer. Only customers that
# appear in `data` (i.e. have at least one merged transaction) get a row.
customer_features = data.groupby('CustomerID').agg({
    # Total spending. TotalValue already reflects Quantity (e.g. 2 units at
    # 300.68 -> 601.36), whereas summing the unit Price would count a
    # multi-unit purchase the same as a single-unit one.
    'TotalValue': 'sum',
    'TransactionID': 'count',  # Purchase frequency
    # Favorite category. mode() returns its values sorted, so a tie resolves
    # to the alphabetically first category; .iloc[0] picks it by position.
    'Category': lambda x: x.mode().iloc[0]
}).reset_index()

In [6]:
# Attach each customer's Region, then one-hot encode the two categorical
# columns (Region and Category) in a single pass.
customer_features = pd.get_dummies(
    customer_features.merge(customers[['CustomerID', 'Region']], on='CustomerID'),
    columns=['Region', 'Category'],
)

In [7]:
# Standardize the feature columns
# CustomerID is only a key, so it is left out of the matrix that gets scaled.
scaler = StandardScaler()
feature_columns = customer_features.drop(columns='CustomerID')
normalized_features = scaler.fit_transform(feature_columns)

In [8]:
# Compute Cosine Similarity
# Pairwise similarity between every pair of rows in normalized_features:
# entry [i, j] compares the customer in row i with the customer in row j,
# where row positions follow the row order of customer_features.
similarity_matrix = cosine_similarity(normalized_features)

In [9]:
# Generate Lookalike Recommendations
# For every customer, keep the TOP_N most similar *other* customers as
# (CustomerID, similarity score) pairs, most similar first.
TOP_N = 3  # number of lookalikes kept per customer

# Pull the IDs out once; a per-row .iloc lookup inside the loop would build a
# whole row Series for every single ID access.
customer_ids = customer_features['CustomerID'].tolist()

lookalikes = {}
for idx, customer_id in enumerate(customer_ids):
    # Rank all customers by similarity to this one (stable sort: ties keep
    # their original row order).
    scores = sorted(enumerate(similarity_matrix[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the customer's own entry by index rather than by assuming it
    # sorts first: another customer with an identical feature vector (or
    # floating-point rounding) can tie with or outrank the self-similarity,
    # which would otherwise put the customer in their own lookalike list.
    # Scores are converted to built-in floats so they serialize as plain numbers.
    top_matches = [
        (customer_ids[i], float(score))
        for i, score in scores
        if i != idx
    ][:TOP_N]
    lookalikes[customer_id] = top_matches



In [10]:
# Save Lookalike List to CSV
# Each customer's lookalikes are stored as the string form of a list of
# (CustomerID, score) tuples. Scores are cast to built-in floats first:
# under NumPy >= 2 the repr of a NumPy scalar is "np.float64(...)", which
# would otherwise be written verbatim into the LookalikeList column.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()),
    'LookalikeList': [
        str([(cust_id, float(score)) for cust_id, score in matches])
        for matches in lookalikes.values()
    ]
})
lookalike_df.to_csv('Rishik_Suddapalli_Lookalike.csv', index=False)
