# Submitted By:- Sagar Purswani (purswanisagar60@gmail.com)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preparation

In [2]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
print("Customers:\n", customers.head())
print("Products:\n", products.head())
print("Transactions:\n", transactions.head())

Customers:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
Products:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
Transactions:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  202

In [4]:
transactions_products = pd.merge(transactions, products, on="ProductID", how="left")
combined_df = pd.merge(transactions_products, customers, on="CustomerID", how="left")
print(combined_df.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [5]:
combined_df.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')

## Feature Engineering

In [6]:
from datetime import datetime

total_spending = combined_df.groupby('CustomerID')['TotalValue'].sum().rename("TotalSpending")
avg_transaction_value = combined_df.groupby('CustomerID')['TotalValue'].mean().rename("AvgTransactionValue")
most_purchased_category = (
    combined_df.groupby(['CustomerID', 'Category'])['Quantity'].sum()
    .reset_index()
    .sort_values(['CustomerID', 'Quantity'], ascending=[True, False])
    .drop_duplicates('CustomerID')
    .set_index('CustomerID')['Category']
)
total_transactions = combined_df.groupby('CustomerID').size().rename("TotalTransactions")

customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['DaysSinceSignup'] = (datetime.now() - customers['SignupDate']).dt.days

customer_features = pd.DataFrame({
    'TotalSpending': total_spending,
    'AvgTransactionValue': avg_transaction_value,
    'MostPurchasedCategory': most_purchased_category,
    'TotalTransactions': total_transactions
}).reset_index()

customers = pd.merge(customers, customer_features, on="CustomerID", how="left")

print(customers.head())


  CustomerID        CustomerName         Region SignupDate  DaysSinceSignup  \
0      C0001    Lawrence Carroll  South America 2022-07-10              932   
1      C0002      Elizabeth Lutz           Asia 2022-02-13             1079   
2      C0003      Michael Rivera  South America 2024-03-07              326   
3      C0004  Kathleen Rodriguez  South America 2022-10-09              841   
4      C0005         Laura Weber           Asia 2022-08-15              896   

   TotalSpending  AvgTransactionValue MostPurchasedCategory  TotalTransactions  
0        3354.52              670.904           Electronics                5.0  
1        1862.74              465.685            Home Decor                4.0  
2        2725.38              681.345            Home Decor                4.0  
3        5354.88              669.360            Home Decor                8.0  
4        2034.24              678.080           Electronics                3.0  


## Encoding Categorical Variables

In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
customers['RegionEncoded'] = label_encoder.fit_transform(customers['Region'])
customers['MostPurchasedCategoryEncoded'] = label_encoder.fit_transform(customers['MostPurchasedCategory'].fillna("Unknown"))
print(customers[['Region', 'RegionEncoded', 'MostPurchasedCategory', 'MostPurchasedCategoryEncoded']].head())


          Region  RegionEncoded MostPurchasedCategory  \
0  South America              3           Electronics   
1           Asia              0            Home Decor   
2  South America              3            Home Decor   
3  South America              3            Home Decor   
4           Asia              0           Electronics   

   MostPurchasedCategoryEncoded  
0                             2  
1                             3  
2                             3  
3                             3  
4                             2  


In [8]:
customers

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,DaysSinceSignup,TotalSpending,AvgTransactionValue,MostPurchasedCategory,TotalTransactions,RegionEncoded,MostPurchasedCategoryEncoded
0,C0001,Lawrence Carroll,South America,2022-07-10,932,3354.52,670.904000,Electronics,5.0,3,2
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1079,1862.74,465.685000,Home Decor,4.0,0,3
2,C0003,Michael Rivera,South America,2024-03-07,326,2725.38,681.345000,Home Decor,4.0,3,3
3,C0004,Kathleen Rodriguez,South America,2022-10-09,841,5354.88,669.360000,Home Decor,8.0,3,3
4,C0005,Laura Weber,Asia,2022-08-15,896,2034.24,678.080000,Electronics,3.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,965,4982.88,1245.720000,Home Decor,4.0,1,3
196,C0197,Christina Harvey,Europe,2023-03-21,678,1928.65,642.883333,Electronics,3.0,1,2
197,C0198,Rebecca Ray,Europe,2022-02-27,1065,931.83,465.915000,Clothing,2.0,1,1
198,C0199,Andrea Jenkins,Europe,2022-12-03,786,1979.28,494.820000,Home Decor,4.0,1,3


# Defining Similarity Metrics
## Using cosine similarity for this task

In [9]:
features = [
    'TotalSpending', 
    'AvgTransactionValue', 
    'TotalTransactions', 
    'DaysSinceSignup', 
    'RegionEncoded', 
    'MostPurchasedCategoryEncoded'
]

customer_similarity_data = customers[features].fillna(0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
customer_similarity_data[['TotalSpending', 'AvgTransactionValue', 'TotalTransactions', 'DaysSinceSignup']] = scaler.fit_transform(
    customer_similarity_data[['TotalSpending', 'AvgTransactionValue', 'TotalTransactions', 'DaysSinceSignup']]
)

print(customer_similarity_data.head())


   TotalSpending  AvgTransactionValue  TotalTransactions  DaysSinceSignup  \
0       0.314274             0.507057           0.454545         0.842204   
1       0.174514             0.351956           0.363636         0.979458   
2       0.255332             0.514948           0.363636         0.276377   
3       0.501681             0.505890           0.727273         0.757236   
4       0.190581             0.512480           0.272727         0.808590   

   RegionEncoded  MostPurchasedCategoryEncoded  
0              3                             2  
1              0                             3  
2              3                             3  
3              3                             3  
4              0                             2  


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(customer_similarity_data)

similarity_df = pd.DataFrame(similarity_matrix, index=customers['CustomerID'], columns=customers['CustomerID'])
print(similarity_df.iloc[:5, :5])


CustomerID     C0001     C0002     C0003     C0004     C0005
CustomerID                                                  
C0001       1.000000  0.597356  0.967623  0.979271  0.605019
C0002       0.597356  1.000000  0.698636  0.724655  0.989953
C0003       0.967623  0.698636  1.000000  0.989270  0.687307
C0004       0.979271  0.724655  0.989270  1.000000  0.721685
C0005       0.605019  0.989953  0.687307  0.721685  1.000000


In [11]:
top_lookalikes = {}

for customer_id in similarity_df.index:
    
    similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False).iloc[1:4]
    
    top_lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))

for customer, lookalikes in list(top_lookalikes.items())[:5]:
    print(f"Customer {customer} -> Lookalikes: {lookalikes}")


Customer C0001 -> Lookalikes: [('C0192', 0.9986573092048231), ('C0184', 0.9980746798734935), ('C0091', 0.9974118668531504)]
Customer C0002 -> Lookalikes: [('C0159', 0.9968927539941458), ('C0005', 0.9899532246464636), ('C0128', 0.9886935117349448)]
Customer C0003 -> Lookalikes: [('C0052', 0.9999270893834681), ('C0076', 0.9990155565405792), ('C0181', 0.9985061293944394)]
Customer C0004 -> Lookalikes: [('C0108', 0.9997263581455648), ('C0113', 0.9995807567702569), ('C0104', 0.9981972373625077)]
Customer C0005 -> Lookalikes: [('C0007', 0.9979808177603792), ('C0159', 0.9902733274683567), ('C0002', 0.9899532246464636)]


# Saving the Lookalike Recommendations to a CSV File

In [16]:
top_lookalikes_for_first_20 = {key: value for key, value in top_lookalikes.items() if int(key[1:]) >= 1 and int(key[1:]) <= 20}
lookalike_data = []

for customer_id, lookalikes in top_lookalikes_for_first_20.items():
    lookalike_str = ";".join([f"{lookalike[0]}:{lookalike[1]:.4f}" for lookalike in lookalikes])
    lookalike_data.append([customer_id, lookalike_str])

with open('Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalikes']) 
    writer.writerows(lookalike_data)

print("Lookalike.csv file for first 20 customers has been saved.")


Lookalike.csv file for first 20 customers has been saved.


In [17]:
Result = pd.read_csv('Lookalike.csv')

In [18]:
Result

Unnamed: 0,CustomerID,Lookalikes
0,C0001,C0192:0.9987;C0184:0.9981;C0091:0.9974
1,C0002,C0159:0.9969;C0005:0.9900;C0128:0.9887
2,C0003,C0052:0.9999;C0076:0.9990;C0181:0.9985
3,C0004,C0108:0.9997;C0113:0.9996;C0104:0.9982
4,C0005,C0007:0.9980;C0159:0.9903;C0002:0.9900
5,C0006,C0187:0.9971;C0126:0.9957;C0137:0.9946
6,C0007,C0005:0.9980;C0140:0.9871;C0159:0.9817
7,C0008,C0098:0.9913;C0156:0.9892;C0034:0.9875
8,C0009,C0122:0.9943;C0104:0.9924;C0113:0.9911
9,C0010,C0062:0.9971;C0122:0.9883;C0009:0.9882
