In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The CSV files read later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# Customer table (ID, name, region, signup date); preview the first five rows.
customers_path = '/content/drive/MyDrive/E-Commerce dataset/Customers.csv'
customers = pd.read_csv(customers_path)
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [None]:
# Product table (ID, name, category, price); preview the first five rows.
products_path = '/content/drive/MyDrive/E-Commerce dataset/Products.csv'
products = pd.read_csv(products_path)
products.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [None]:
# Transaction table (transaction/customer/product IDs, date, quantity,
# total value, price); preview the first five rows.
transactions_path = '/content/drive/MyDrive/E-Commerce dataset/Transactions.csv'
transactions = pd.read_csv(transactions_path)
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [None]:
# Build one unified table: customers -> their transactions -> product details.
# Both joins use the default inner join. `Price` exists in both the
# transactions and products tables, so the result carries it as
# `Price_x` (from transactions) and `Price_y` (from products).
ecomdata = (
    customers
    .merge(transactions, on='CustomerID')
    .merge(products, on='ProductID')
)

In [None]:
# Preview the first five rows of the merged table.
ecomdata.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [None]:
# Per-customer features computed from the merged transaction table:
#   num_transactions - number of transaction rows for the customer
#   total_spent      - sum of TotalValue (Quantity x Price per transaction);
#                      summing the unit price alone would ignore quantity
#   num_categories   - number of distinct product categories purchased
customer_features = ecomdata.groupby('CustomerID').agg(
    num_transactions=('TransactionID', 'count'),
    total_spent=('TotalValue', 'sum'),
    num_categories=('Category', 'nunique'),
)

# Attach the features to the customer table. This is the default inner join,
# so customers with no rows in `customer_features` are not carried forward.
customer_data = pd.merge(customers, customer_features, on = 'CustomerID')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the three numeric features and write the scaled values back
# into `customer_data`; the raw array is kept in `normalized_features`.
feature_columns = ['num_transactions', 'total_spent', 'num_categories']

scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_data[feature_columns])
customer_data[feature_columns] = normalized_features

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of customers' scaled features.
similarity_matrix = cosine_similarity(normalized_features)

# Label both axes with CustomerID so scores can be looked up by ID:
# similarity_df[a][b] is the similarity between customers a and b.
customer_ids = customer_data['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [None]:
# The lookalike report covers only the first 20 rows of `customer_data`.
first_20_customers = customer_data.iloc[:20]
first_20_customer_IDs = first_20_customers.loc[:, 'CustomerID'].tolist()

In [None]:
def find_top_three_lookalikes(customer_id):
  """Return the three customers most similar to ``customer_id``.

  Reads the ``customer_id`` column of the module-level ``similarity_df`` and
  returns a Series of at most three similarity scores, highest first, indexed
  by the other customers' IDs. The customer's own entry is never included.
  """
  scores = similarity_df[customer_id]

  # Take four candidates: one of them may be the customer's own entry,
  # which is dropped below, leaving three.
  candidates = scores.sort_values(ascending = False).iloc[:4]
  is_other_customer = candidates.index != customer_id

  return candidates[is_other_customer].iloc[:3]

# Map each of the first 20 customer IDs to its Series of lookalike scores.
top_3_lookalikes = {}
for customer_id in first_20_customer_IDs:
  top_3_lookalikes[customer_id] = find_top_three_lookalikes(customer_id)

# Print one labelled block per customer.
for customer_id, lookalikes in top_3_lookalikes.items():
  header = f"Top 3 lookalikes for Customer {customer_id}:"
  print(header)
  print(lookalikes, "\n")

Top 3 lookalikes for Customer C0001:
CustomerID
C0149    0.999420
C0200    0.995977
C0170    0.992889
Name: C0001, dtype: float64 

Top 3 lookalikes for Customer C0002:
CustomerID
C0142    0.998451
C0052    0.988320
C0199    0.985385
Name: C0002, dtype: float64 

Top 3 lookalikes for Customer C0003:
CustomerID
C0025    0.999398
C0029    0.999299
C0094    0.995689
Name: C0003, dtype: float64 

Top 3 lookalikes for Customer C0004:
CustomerID
C0047    0.997608
C0084    0.997486
C0108    0.995415
Name: C0004, dtype: float64 

Top 3 lookalikes for Customer C0005:
CustomerID
C0061    0.999999
C0009    0.999816
C0080    0.999221
Name: C0005, dtype: float64 

Top 3 lookalikes for Customer C0006:
CustomerID
C0146    0.998459
C0044    0.987627
C0026    0.982529
Name: C0006, dtype: float64 

Top 3 lookalikes for Customer C0007:
CustomerID
C0186    0.995670
C0078    0.994367
C0131    0.994114
Name: C0007, dtype: float64 

Top 3 lookalikes for Customer C0008:
CustomerID
C0147    0.995132
C0175    0

In [None]:
# prompt: create a csv file consisting top 3 lookalikes for the first 20 customers along with columns = 'Customer ID', 'Similar_customer_1', 'Score_1',  'Similar_customer_2', 'Score_2', 'Similar_customer_3', 'Score_3'

import pandas as pd

# Assumes the 'top_3_lookalikes' dictionary has already been populated by the code above.
# The list that stores the data for the CSV file is created in the next cell.



In [None]:
# Export the lookalike results to CSV, one row per customer:
# [customer ID, lookalike ID 1, score 1, lookalike ID 2, score 2, lookalike ID 3, score 3]
data = [
  # Flatten the (lookalike_id, score) pairs of each Series into a single row.
  [customer_id, *(field for pair in lookalikes.items() for field in pair)]
  for customer_id, lookalikes in top_3_lookalikes.items()
]

df = pd.DataFrame(
  data,
  columns=['Customer ID', 'Similar_customer_1', 'Score_1', 'Similar_customer_2', 'Score_2', 'Similar_customer_3', 'Score_3'],
)

# index=False keeps the DataFrame's row index out of the file.
df.to_csv('Vijay_Mohan_Lookalike.csv', index=False)

In [None]:
# Read the exported file back and preview the first five rows.
exported_path = 'Vijay_Mohan_Lookalike.csv'
top_csv = pd.read_csv(exported_path)
top_csv.head()

Unnamed: 0,Customer ID,Similar_customer_1,Score_1,Similar_customer_2,Score_2,Similar_customer_3,Score_3
0,C0001,C0149,0.99942,C0200,0.995977,C0170,0.992889
1,C0002,C0142,0.998451,C0052,0.98832,C0199,0.985385
2,C0003,C0025,0.999398,C0029,0.999299,C0094,0.995689
3,C0004,C0047,0.997608,C0084,0.997486,C0108,0.995415
4,C0005,C0061,0.999999,C0009,0.999816,C0080,0.999221
