In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
from google.colab import drive
import pandas as pd

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

# Location of the combined dataset CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/Dataset_Zeotap/Combined_dataset.csv'

# Load the CSV into a DataFrame; the aggregation step further down groups it by CustomerID.
df = pd.read_csv(file_path)

Mounted at /content/drive


In [4]:
# Display the loaded dataset as the cell's output to sanity-check its columns.
df

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,signup year,signup month,signup day,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price,YearMonth,ProductName,Category,DayOfWeek
0,C0001,Lawrence Carroll,South America,2022-07-10,2022,7,10,T00015,P054,2024-01-19 03:12:55,2,114.60,57.30,2024-01,SoundWave Cookbook,Books,Friday
1,C0001,Lawrence Carroll,South America,2022-07-10,2022,7,10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54,2024-09,HomeSense Wall Art,Home Decor,Tuesday
2,C0001,Lawrence Carroll,South America,2022-07-10,2022,7,10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47,2024-04,SoundWave Headphones,Electronics,Monday
3,C0001,Lawrence Carroll,South America,2022-07-10,2022,7,10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72,2024-05,ActiveWear Smartwatch,Electronics,Tuesday
4,C0001,Lawrence Carroll,South America,2022-07-10,2022,7,10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64,2024-11,TechPro Headphones,Electronics,Saturday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,C0200,Kelly Cross,Asia,2023-06-11,2023,6,11,T00518,P034,2024-12-11 03:05:50,4,868.20,217.05,2024-12,HomeSense T-Shirt,Clothing,Wednesday
996,C0200,Kelly Cross,Asia,2023-06-11,2023,6,11,T00091,P057,2024-04-27 19:06:20,1,239.70,239.70,2024-04,ActiveWear Smartphone,Electronics,Saturday
997,C0200,Kelly Cross,Asia,2023-06-11,2023,6,11,T00731,P061,2024-07-15 20:36:28,4,627.84,156.96,2024-07,HomeSense Desk Lamp,Home Decor,Monday
998,C0200,Kelly Cross,Asia,2023-06-11,2023,6,11,T00771,P048,2024-09-10 09:50:48,4,1665.60,416.40,2024-09,TechPro Cookbook,Books,Tuesday


In [5]:
# Aggregate the transaction-level rows into one feature row per customer.
# Named aggregation is used so each output column gets an explicit, flat name
# (no MultiIndex columns to flatten by position) and the built-in 'nunique'
# replaces a Python-level lambda. The keyword order below fixes the column
# order of the result, after the leading CustomerID column.
customer_features = df.groupby('CustomerID').agg(
    # Profile attributes repeat on every row of a customer; keep the first one.
    Region=('Region', 'first'),
    SignupYear=('signup year', 'first'),
    SignupMonth=('signup month', 'first'),
    SignupDay=('signup day', 'first'),
    # Spend per transaction and overall.
    AvgTransactionValue=('TotalValue', 'mean'),
    TotalTransactionValue=('TotalValue', 'sum'),
    # Units bought overall and per transaction.
    TotalQuantity=('Quantity', 'sum'),
    AvgQuantity=('Quantity', 'mean'),
    # Number of unique products purchased.
    UniqueProducts=('ProductID', 'nunique'),
    # Frequency of transactions.
    TransactionFrequency=('TransactionID', 'count'),
).reset_index()

In [6]:
# Give the aggregated columns readable single-level names.
# Names are applied by position, so this list must stay in the same order as
# the aggregations that produced customer_features.
feature_column_names = [
    'CustomerID',
    'Region',
    'SignupYear',
    'SignupMonth',
    'SignupDay',
    'AvgTransactionValue',
    'TotalTransactionValue',
    'TotalQuantity',
    'AvgQuantity',
    'UniqueProducts',
    'TransactionFrequency',
]
customer_features.columns = feature_column_names

In [7]:
# One-hot encode the categorical Region column so it can be scaled and compared
# numerically. drop_first=True omits the first category's indicator column.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region'],
    drop_first=True,
)

In [8]:
# Standardise every feature column; CustomerID is an identifier, not a feature,
# so it is left out of the matrix handed to the scaler.
feature_matrix = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_matrix)

In [9]:
# Calculate Similarity
# Pairwise cosine similarity between the rows of scaled_features. Both axes are
# indexed by row position, so row idx holds one customer's scores against every
# customer (including themselves); the recommendation loop below reads it that way.
similarity_matrix = cosine_similarity(scaled_features)

In [10]:
# Generate Recommendations
# For every customer, rank all *other* customers by cosine similarity and keep
# the three best matches as (CustomerID, score) pairs.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer's own entry by index rather than assuming it sorts
    # first: a tie (or floating-point rounding) can place another customer
    # ahead of it, which would otherwise list the customer as their own lookalike.
    similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # The sort is stable, so tied scores keep their original customer order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the top 3 lookalikes; IDs come from the precomputed list instead of
    # a per-row DataFrame lookup.
    top_lookalikes = [
        (customer_ids[other_idx], score)
        for other_idx, score in similarity_scores[:3]
    ]
    lookalike_results[customer_id] = top_lookalikes

In [11]:
# Convert to DataFrame for the required format: one row per customer, with the
# lookalike list serialised as a string of (CustomerID, score) tuples.
# Scores are converted to built-in floats first so the text is the same on every
# NumPy version; NumPy >= 2 would otherwise render each score as
# "np.float64(...)" inside the string.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': [
        str([(lookalike_id, float(score)) for lookalike_id, score in matches])
        for matches in lookalike_results.values()
    ]
})

In [12]:
# Keep only the rows whose CustomerID is among the first 20 entries of
# customer_features.
first_twenty_ids = customer_features['CustomerID'][:20]
is_first_twenty = lookalike_df['CustomerID'].isin(first_twenty_ids)
lookalike_df = lookalike_df[is_first_twenty]

In [13]:
# Save the lookalike table as CSV (without the DataFrame index).
# The filename is defined once so the confirmation message always names the
# file that was actually written.
output_path = 'Pradeep_Kumar_Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

print(f"Lookalike model built and saved to '{output_path}'.")

Lookalike model built and saved to 'Lookalike.csv'.
