# Lookalike Model
### Data Science Assignment: eCommerce Transactions Dataset
Author: Saniga Babu  
Date: 27-01-2025


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
customers = pd.read_csv(r"C:\Users\HP\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\HP\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\HP\Downloads\Transactions.csv")

# Preview data
print(customers.head())
print(products.head())
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [18]:
# Load datasets
data = pd.merge(transactions, products, on='ProductID')
data = pd.merge(data, customers, on='CustomerID')

# Preview merged data
data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


In [13]:
customer_profiles = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price_x': 'mean'  # Use 'Price_x' or 'Price_y' based on your inspection
}).reset_index()

print(customer_profiles.head())


  CustomerID  TotalValue  Quantity     Price_x
0      C0001     3354.52        12  278.334000
1      C0002     1862.74        10  208.920000
2      C0003     2725.38        14  195.707500
3      C0004     5354.88        23  240.636250
4      C0005     2034.24         7  291.603333


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
similarity_matrix = cosine_similarity(customer_profiles.iloc[:, 1:])

# Map customer IDs to indices
customer_ids = customer_profiles['CustomerID'].tolist()


In [15]:
# Generate top 3 lookalikes for each customer
lookalikes = {}
for i, customer_id in enumerate(customer_ids[:20]):
    similar_indices = similarity_matrix[i].argsort()[-4:-1][::-1]
    similar_customers = [(customer_ids[j], similarity_matrix[i][j]) for j in similar_indices]
    lookalikes[customer_id] = similar_customers

print(lookalikes)


{'C0001': [('C0024', 0.9999996421706187), ('C0189', 0.9999996375134669), ('C0107', 0.9999992989732556)], 'C0002': [('C0129', 0.9999988566886444), ('C0019', 0.9999978562049315), ('C0076', 0.9999967196606208)], 'C0003': [('C0179', 0.9999987423691975), ('C0190', 0.9999977883098434), ('C0064', 0.9999975694094638)], 'C0004': [('C0045', 0.9999997737504512), ('C0143', 0.9999996636664495), ('C0087', 0.9999995363353025)], 'C0005': [('C0132', 0.9999943210266752), ('C0089', 0.9999924734240622), ('C0192', 0.9999862999324752)], 'C0006': [('C0152', 0.9999999425199014), ('C0011', 0.9999997883205541), ('C0183', 0.9999993579540706)], 'C0007': [('C0085', 0.9999996094049464), ('C0061', 0.9999995470516185), ('C0192', 0.9999970571835672)], 'C0008': [('C0162', 0.9999995343196986), ('C0018', 0.9999991527968856), ('C0182', 0.9999991503182171)], 'C0009': [('C0020', 0.9999999960241247), ('C0080', 0.9999999388154599), ('C0015', 0.9997230840730107)], 'C0010': [('C0047', 0.9999968898355767), ('C0030', 0.9999965583

In [16]:
import csv

with open('FirstName_LastName_Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for key, value in lookalikes.items():
        writer.writerow([key, value])
