<a href="https://colab.research.google.com/github/Parthib-toppo/eCommerce-Transactions-Dataset/blob/main/Parthib_Toppo_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data Science Assignment: eCommerce Transactions Dataset

# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

# Remote locations of the three CSV files that make up the dataset
customers_url = 'https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE'
products_url = 'https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0'
transactions_url = 'https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF'

# Read every CSV into its own DataFrame, in the same order as the URLs above
customers, products, transactions = (
    pd.read_csv(url)
    for url in (customers_url, products_url, transactions_url)
)

# Print the first rows of each DataFrame as a quick sanity check
for title, frame in (
    ("Customers", customers),
    ("Products", products),
    ("Transactions", transactions),
):
    print(f"{title} Dataset:\n", frame.head(), "\n")


# Task 2: Lookalike Model
# Collapse the transaction log to one row per customer holding the summed
# TotalValue and Quantity, then attach the customer attributes to each row.
customer_transactions = transactions.groupby('CustomerID', as_index=False)[['TotalValue', 'Quantity']].sum()
customer_profiles = customer_transactions.merge(customers, on='CustomerID', how='left')

# One-hot encode Region; the first level is dropped and acts as the baseline
customer_profiles_encoded = pd.get_dummies(
    customer_profiles,
    columns=['Region'],
    drop_first=True,
)

# Identifier, name and signup-date columns are excluded from the feature set;
# every remaining column is standardized.
feature_frame = customer_profiles_encoded.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
scaler = StandardScaler()
customer_profiles_scaled = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between all customer feature rows
similarity_matrix = cosine_similarity(customer_profiles_scaled)

# Function to find top 3 similar customers for a given customer
def get_top_3_similar(customers_df, similarity_matrix, customer_index):
    """Return the three customers most similar to the one at ``customer_index``.

    Args:
        customers_df: DataFrame with a ``CustomerID`` column; row ``k`` is
            looked up positionally for column ``k`` of ``similarity_matrix``.
        similarity_matrix: Positionally indexable matrix of pairwise
            similarity scores (one row per customer).
        customer_index: Row position of the query customer.

    Returns:
        A list of up to three ``(CustomerID, score)`` tuples ordered from
        highest to lowest score. The query customer is never included.
    """
    row = similarity_matrix[customer_index]
    # Normalize a negative position so it can be compared with enumerate().
    self_position = customer_index + len(row) if customer_index < 0 else customer_index

    # Drop the query customer by position instead of assuming it sorts first:
    # the sort is stable, so an earlier customer with a tied top score (or a
    # self-similarity that is not the maximum) would otherwise push the query
    # customer into its own result list.
    candidates = [
        (position, score)
        for position, score in enumerate(row)
        if position != self_position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        (customers_df.iloc[position].CustomerID, score)
        for position, score in candidates[:3]
    ]

# Generating Lookalike.csv for the first 20 customers
# Cap the count at the number of available profiles so a smaller dataset
# does not raise IndexError on the positional lookups below.
num_targets = min(20, len(customer_profiles))

lookalike_results = {}
for i in range(num_targets):
    customer_id = customer_profiles.iloc[i].CustomerID
    top_3_similars = get_top_3_similar(customer_profiles, similarity_matrix, i)
    # Store scores as built-in floats: under NumPy >= 2 the repr of a NumPy
    # scalar is "np.float64(...)", which would otherwise end up verbatim in
    # the stringified 'Lookalikes' column of the CSV.
    lookalike_results[customer_id] = [
        (similar_id, float(score)) for similar_id, score in top_3_similars
    ]

# Saving lookalike results to CSV: one row per customer, with the list of
# (CustomerID, score) pairs serialized as a single string column.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results),
    'Lookalikes': [str(value) for value in lookalike_results.values()]
})

lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike model results saved to Lookalike.csv")

Customers Dataset:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15 

Products Dataset:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31 

Transactions Dataset:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T001