<a href="https://colab.research.google.com/github/Swathi014/ZEOTAP-Data-Science-Assignment-eCommerce-Transactions/blob/main/Swathi_P_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the customer and transaction tables from one configurable dataset folder.
# DATA_DIR is the only place to edit when running outside this Colab/Drive layout.
# NOTE(review): the path lives under /content/drive, so Google Drive presumably has
# to be mounted before this cell runs; no mount cell is visible here.
DATA_DIR = '/content/drive/MyDrive/Internship/Zeotap/Dataset'
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [13]:
# Collapse the transaction log to one row per customer:
# total amount spent (TotalValue) and total units bought (Quantity).
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(TotalValue=('TotalValue', 'sum'), Quantity=('Quantity', 'sum'))
    .reset_index()  # turn the CustomerID group key back into a regular column
)

print(customer_transactions)

    CustomerID  TotalValue  Quantity
0        C0001     3354.52        12
1        C0002     1862.74        10
2        C0003     2725.38        14
3        C0004     5354.88        23
4        C0005     2034.24         7
..         ...         ...       ...
194      C0196     4982.88        12
195      C0197     1928.65         9
196      C0198      931.83         3
197      C0199     1979.28         9
198      C0200     4758.60        16

[199 rows x 3 columns]


In [12]:
# Merge customer and transaction data.
# The left join keeps every row of `customers`; a customer with no matching row in
# `customer_transactions` gets NaN in the aggregated columns, which is replaced by 0.
# Only TotalValue and Quantity are filled, so a missing profile value (for example
# Region) is not silently turned into the number 0.
customer_data = pd.merge(
    customers, customer_transactions, on='CustomerID', how='left'
).fillna({'TotalValue': 0, 'Quantity': 0})
print(customer_data)

    CustomerID        CustomerName         Region  SignupDate  TotalValue  \
0        C0001    Lawrence Carroll  South America  2022-07-10     3354.52   
1        C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74   
2        C0003      Michael Rivera  South America  2024-03-07     2725.38   
3        C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88   
4        C0005         Laura Weber           Asia  2022-08-15     2034.24   
..         ...                 ...            ...         ...         ...   
195      C0196         Laura Watts         Europe  2022-06-07     4982.88   
196      C0197    Christina Harvey         Europe  2023-03-21     1928.65   
197      C0198         Rebecca Ray         Europe  2022-02-27      931.83   
198      C0199      Andrea Jenkins         Europe  2022-12-03     1979.28   
199      C0200         Kelly Cross           Asia  2023-06-11     4758.60   

     Quantity  
0        12.0  
1        10.0  
2        14.0  
3        23

In [14]:
# Keep only the first 20 customers, i.e. CustomerID C0001 through C0020.
# NOTE(review): the feature, scaling and similarity cells below all read this
# filtered frame, so lookalikes are chosen only from among these 20 customers,
# not from the full customer base. Confirm that this is intended.
first_twenty_ids = [f'C{str(number).zfill(4)}' for number in range(1, 21)]
is_first_twenty = customer_data['CustomerID'].isin(first_twenty_ids)
customer_data = customer_data[is_first_twenty]
print(customer_data)

   CustomerID        CustomerName         Region  SignupDate  TotalValue  \
0       C0001    Lawrence Carroll  South America  2022-07-10     3354.52   
1       C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74   
2       C0003      Michael Rivera  South America  2024-03-07     2725.38   
3       C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88   
4       C0005         Laura Weber           Asia  2022-08-15     2034.24   
5       C0006     Brittany Palmer  South America  2024-01-07     4227.57   
6       C0007         Paul Graves           Asia  2022-06-18     2579.82   
7       C0008            David Li  North America  2024-01-13     4271.61   
8       C0009           Joy Clark         Europe  2023-08-14      896.50   
9       C0010           Aaron Cox         Europe  2022-12-15     1717.55   
10      C0011       Bryan Mathews  South America  2022-12-12     3730.00   
11      C0012           Kevin May  South America  2024-08-07     5231.26   
12      C001

In [16]:
# Build the feature table used for similarity: the two numeric spend columns
# plus Region expanded into one indicator column per region (one-hot encoding).
features = pd.get_dummies(
    customer_data[['Region', 'TotalValue', 'Quantity']],
    columns=['Region'],
)
print(features)

    TotalValue  Quantity  Region_Asia  Region_Europe  Region_North America  \
0      3354.52      12.0        False          False                 False   
1      1862.74      10.0         True          False                 False   
2      2725.38      14.0        False          False                 False   
3      5354.88      23.0        False          False                 False   
4      2034.24       7.0         True          False                 False   
5      4227.57      12.0        False          False                 False   
6      2579.82       8.0         True          False                 False   
7      4271.61      20.0        False          False                  True   
8       896.50       3.0        False           True                 False   
9      1717.55      12.0        False           True                 False   
10     3730.00      13.0        False          False                 False   
11     5231.26      19.0        False          False            

In [18]:
# Standardize every feature column so that no single column dominates the
# similarity. The one-hot Region indicators are in `features`, so they are
# scaled together with TotalValue and Quantity.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)
print(scaled_features)

[[ 0.15512667  0.         -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [-0.75072896 -0.32274861  2.38047614 -0.57735027 -0.57735027 -0.73379939]
 [-0.22690688  0.32274861 -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [ 1.36980802  1.77511737 -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [-0.64658877 -0.80687153  2.38047614 -0.57735027 -0.57735027 -0.73379939]
 [ 0.68527002  0.         -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [-0.31529548 -0.64549722  2.38047614 -0.57735027 -0.57735027 -0.73379939]
 [ 0.71201249  1.29099445 -0.42008403 -0.57735027  1.73205081 -0.73379939]
 [-1.3374602  -1.45236875 -0.42008403  1.73205081 -0.57735027 -0.73379939]
 [-0.83889288  0.         -0.42008403  1.73205081 -0.57735027 -0.73379939]
 [ 0.3831299   0.16137431 -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [ 1.29474208  1.12962014 -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [ 1.76188518  0.96824584 -0.42008403 -0.57735027 -0.57735027  1.36277029]
 [-1.68834278 -1.61374306

In [19]:
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i][j] is the similarity between customer i and customer j.
similarity_matrix = cosine_similarity(X=scaled_features)
print(similarity_matrix)

[[ 1.00000000e+00 -3.19890725e-01  9.55444206e-01  6.34820228e-01
  -3.08228512e-01  9.55049604e-01 -3.08101653e-01 -3.35124512e-01
  -3.64868545e-01 -4.50333384e-01  9.86408687e-01  7.39279905e-01
   6.93834488e-01 -3.44349774e-01 -3.80283662e-01 -4.10527786e-01
  -3.01961119e-01 -3.28558501e-01 -4.56203304e-01 -3.65306929e-01]
 [-3.19890725e-01  1.00000000e+00 -2.73075432e-01 -3.84460482e-01
   9.84490324e-01 -3.77970216e-01  9.80370803e-01 -3.03601496e-01
   4.45220302e-02 -8.32697978e-02 -3.59466299e-01 -4.08975922e-01
  -4.16364598e-01  7.79863430e-02  2.37135102e-02 -2.39869120e-01
  -3.19858708e-01 -3.27444242e-01 -1.14793634e-01  4.31096975e-02]
 [ 9.55444206e-01 -2.73075432e-01  1.00000000e+00  6.30581762e-01
  -3.03857263e-01  8.45778074e-01 -3.20051748e-01 -2.92997044e-01
  -3.47504323e-01 -3.52841102e-01  9.30501040e-01  6.89679757e-01
   5.95483506e-01 -3.12488175e-01 -3.63928137e-01 -4.81321875e-01
  -2.76163239e-01 -3.39666882e-01 -4.00404930e-01 -3.49174127e-01]
 [ 6.34

In [21]:
# Create a lookalike map: CustomerID -> [(lookalike CustomerID, similarity score), ...]
# holding the TOP_N most similar other customers, best match first.
TOP_N = 3

# Row i of similarity_matrix corresponds to the i-th row of customer_data, so the
# ID column is pulled out once as a positional array instead of per iteration.
customer_ids = customer_data['CustomerID'].to_numpy()

lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    # argsort is ascending; reverse it to rank from most to least similar.
    ranked_indices = similarity_matrix[i].argsort()[::-1]
    # Drop the customer's own row (self-similarity) before taking the top matches.
    similar_indices = [idx for idx in ranked_indices if idx != i][:TOP_N]
    # Store scores as built-in floats: the list is later written to CSV as text,
    # and NumPy scalars would be rendered as "np.float64(...)" under NumPy >= 2.
    lookalike_map[customer_id] = [
        (customer_ids[idx], float(similarity_matrix[i][idx]))
        for idx in similar_indices
    ]

print(lookalike_map['C0001'])

[('C0011', 0.9864086867552642), ('C0003', 0.9554442059718257), ('C0006', 0.9550496043142768)]


In [22]:
# Turn the lookalike map into a two-column table for CSV output:
# one row per customer, with that customer's list of (lookalike, score) pairs.
lookalike_rows = [
    (customer_id, lookalikes) for customer_id, lookalikes in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

In [23]:
# Write the lookalike table to disk, without the DataFrame's row index.
output_path = 'Swathi_P_Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)