In [1]:
# Mount Google Drive into the Colab filesystem so the CSV files under
# /content/drive/ can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Loading the datasets
# Single place to edit if the Drive folder holding the CSV files moves.
DATA_DIR = '/content/drive/MyDrive/Zeotap_Internship'

customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after each report.
print("Customers Dataset Info:")
customers.info()
print()  # blank separator line

print("Products Dataset Info:")
products.info()
print()

print("Transactions Dataset Info:")
transactions.info()
print()

Customers Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None 

Products Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None 

Transactions Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #  

In [5]:
# Parse the date columns (loaded as plain strings) into datetime64 values
for frame, date_column in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [6]:
# Merging datasets
# Each transaction row is enriched with its customer and product attributes.
# - how='inner' is the pandas default, spelled out here: transactions without a
#   matching customer/product are dropped, and customers with no transactions
#   do not appear in `data` at all.
# - validate='many_to_one' raises MergeError if CustomerID / ProductID is duplicated
#   in the lookup table, instead of silently duplicating transaction rows (which
#   would inflate every sum computed from `data`).
# - 'Price' exists in both transactions and products, so the default suffixes yield
#   Price_x (from transactions) and Price_y (from products).
data = (
    transactions
    .merge(customers, on='CustomerID', how='inner', validate='many_to_one')
    .merge(products, on='ProductID', how='inner', validate='many_to_one')
)

In [7]:
# Preview the first rows of the merged transactions/customers/products frame
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


# **Task 2: Lookalike Model**

In [8]:
# Build one profile row per customer from the merged transaction data
def most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most frequent value)."""
    return values.mode()[0]


transactions_by_customer = data.groupby('CustomerID')
customer_profiles = transactions_by_customer.agg(
    TotalValue=('TotalValue', 'sum'),     # total spending
    Quantity=('Quantity', 'sum'),         # total quantity purchased
    Price_y=('Price_y', 'mean'),          # average product price
    Category=('Category', most_frequent), # most purchased category
    Region=('Region', 'first'),           # customer region
).reset_index()

customer_profiles.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Price_y,Category,Region
0,C0001,3354.52,12,278.334,Electronics,South America
1,C0002,1862.74,10,208.92,Clothing,Asia
2,C0003,2725.38,14,195.7075,Home Decor,South America
3,C0004,5354.88,23,240.63625,Books,South America
4,C0005,2034.24,7,291.603333,Electronics,Asia


In [9]:
# Preprocessing setup

# Columns to standardise vs. columns to one-hot encode
numerical_features = ['TotalValue', 'Quantity', 'Price_y']
categorical_features = ['Category', 'Region']

# Each step is (name, transformer, columns it applies to)
scaling_step = ('num', StandardScaler(), numerical_features)
encoding_step = ('cat', OneHotEncoder(), categorical_features)

# Column-wise preprocessing pipeline combining both steps
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])



In [10]:
# Fit the preprocessor on the profile features and transform them
feature_columns = numerical_features + categorical_features
profile_features = customer_profiles[feature_columns]
print(profile_features)

processed_data = preprocessor.fit_transform(profile_features)
print(processed_data)

     TotalValue  Quantity     Price_y     Category         Region
0       3354.52        12  278.334000  Electronics  South America
1       1862.74        10  208.920000     Clothing           Asia
2       2725.38        14  195.707500   Home Decor  South America
3       5354.88        23  240.636250        Books  South America
4       2034.24         7  291.603333  Electronics           Asia
..          ...       ...         ...          ...            ...
194     4982.88        12  416.992500   Home Decor         Europe
195     1928.65         9  227.056667  Electronics         Europe
196      931.83         3  239.705000     Clothing         Europe
197     1979.28         9  250.610000  Electronics         Europe
198     4758.60        16  296.506000     Clothing           Asia

[199 rows x 5 columns]
[[-0.06170143 -0.12203296  0.09467022 ...  0.          0.
   1.        ]
 [-0.87774353 -0.44800021 -0.90401592 ...  0.          0.
   0.        ]
 [-0.40585722  0.20393428 -1.09410928 

In [11]:
# Calculating similarity matrix
# Pairwise cosine similarity between all rows of the processed feature matrix;
# row/column i corresponds to row i of customer_profiles.
similarity_matrix = cosine_similarity(processed_data)

# Sanity check: one row and one column per customer profile
assert similarity_matrix.shape == (len(customer_profiles), len(customer_profiles))

# Take the label from the data rather than hard-coding it, so the printed ID
# always matches the row that is actually displayed.
first_customer_id = customer_profiles['CustomerID'].iloc[0]
print(f"Similarity Matrix for {first_customer_id} : {similarity_matrix[0]}")

Similarity Matrix for C0001 : [ 1.          0.00838712  0.34129799  0.19687567  0.441957    0.43878575
  0.42680383 -0.08027951  0.08645164 -0.02541414  0.49021105  0.28721676
  0.27132972  0.0359193   0.09663267  0.06270498 -0.08712796 -0.06657444
  0.01503272  0.35970411 -0.03566181 -0.03262233  0.02440815  0.00261554
  0.31668249  0.48401383 -0.00291163  0.16255321  0.00866275 -0.01722565
  0.41842882  0.28637739  0.00789381 -0.06393862  0.44046967  0.03178808
 -0.00644276 -0.02962671  0.67035271  0.06228761 -0.08441993  0.09188664
  0.02796296  0.08348724  0.19043958 -0.05764204 -0.00380435  0.79214675
 -0.05969456  0.37197223  0.31656224  0.45717141 -0.06877472  0.14034352
  0.43332028  0.03440703 -0.04870382  0.10408883 -0.03988644  0.00222325
  0.07425855  0.02727029  0.04140514  0.00317251 -0.08278658  0.35429595
 -0.01244571  0.20131923  0.50121006  0.01877682  0.31074227  0.38290695
  0.02995856  0.07958139 -0.09886555  0.47692395  0.28153442  0.10165757
  0.03872884  0.09541

In [12]:
# Getting top 3 similar customers for each customer

TOP_N = 3  # number of lookalikes kept per customer

# Positional list of IDs: row i of similarity_matrix belongs to customer_ids[i].
# A plain list is used because Series[j] is a label lookup, which only matches
# the row position while customer_profiles keeps its default 0..n-1 index.
customer_ids = customer_profiles['CustomerID'].tolist()

# Maps each customer ID to a list of (similar customer ID, similarity score) tuples,
# ordered from most to least similar.
lookalike_map = {}
for i, customer_id in enumerate(customer_ids):
    similarity_scores = similarity_matrix[i]  # scores of this customer against everyone

    # Pair every other customer with their score, excluding the customer itself
    similar_customers = [
        (other_id, score)
        for j, (other_id, score) in enumerate(zip(customer_ids, similarity_scores))
        if j != i
    ]

    # Highest similarity first; list.sort is stable, so ties keep profile order
    similar_customers.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = similar_customers[:TOP_N]

print(lookalike_map)

{'C0001': [('C0181', 0.9346517961588328), ('C0120', 0.8975310829300924), ('C0184', 0.8640730287089359)], 'C0002': [('C0088', 0.9869774905277904), ('C0106', 0.9002251655992599), ('C0134', 0.8263011297911612)], 'C0003': [('C0031', 0.8639478762562338), ('C0025', 0.8621584309391507), ('C0052', 0.8448070519896925)], 'C0004': [('C0165', 0.9738080007914048), ('C0169', 0.9239005861682359), ('C0087', 0.9188289414286034)], 'C0005': [('C0140', 0.9906947166711583), ('C0186', 0.9809645644259003), ('C0146', 0.9271043756247463)], 'C0006': [('C0187', 0.9468086192488325), ('C0126', 0.9445779125594571), ('C0118', 0.9357052336996075)], 'C0007': [('C0146', 0.9964960009625495), ('C0115', 0.9682843726598253), ('C0186', 0.9481301672232154)], 'C0008': [('C0065', 0.8106787250085865), ('C0160', 0.78622738224074), ('C0109', 0.7782116140918641)], 'C0009': [('C0061', 0.9682324770160674), ('C0198', 0.9456198660166513), ('C0103', 0.9194743627131416)], 'C0010': [('C0111', 0.9602659479910558), ('C0062', 0.888374556480

In [13]:
# Write the lookalikes of the first 20 customers to Lookalike.csv,
# one row per (customer, similar customer) pair
first_twenty_ids = customer_profiles['CustomerID'][:20]
lookalike_data = [
    {'CustomerID': customer_id, 'SimilarCustomerID': similar_customer, 'SimilarityScore': score}
    for customer_id in first_twenty_ids
    for similar_customer, score in lookalike_map[customer_id]
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)


print("Top 3 similar customers for first 20 customers saved to 'Lookalike.csv'")
print(lookalike_df)

Top 3 similar customers for first 20 customers saved to 'Lookalike.csv'
   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0181         0.934652
1       C0001             C0120         0.897531
2       C0001             C0184         0.864073
3       C0002             C0088         0.986977
4       C0002             C0106         0.900225
5       C0002             C0134         0.826301
6       C0003             C0031         0.863948
7       C0003             C0025         0.862158
8       C0003             C0052         0.844807
9       C0004             C0165         0.973808
10      C0004             C0169         0.923901
11      C0004             C0087         0.918829
12      C0005             C0140         0.990695
13      C0005             C0186         0.980965
14      C0005             C0146         0.927104
15      C0006             C0187         0.946809
16      C0006             C0126         0.944578
17      C0006             C0118         0.9357