In [24]:
import pandas as pd
import numpy as np

In [25]:
customers=pd.read_csv("/content/Customers.csv")
products=pd.read_csv("/content/Products.csv")
transactions=pd.read_csv("/content/Transactions.csv")

In [26]:
customer_transactions=pd.merge(customers,transactions,on="CustomerID",how="inner")
data=pd.merge(customer_transactions,products,on="ProductID",how="inner")

In [27]:
data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [28]:
data.columns

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate', 'TransactionID',
       'ProductID', 'TransactionDate', 'Quantity', 'TotalValue', 'Price_x',
       'ProductName', 'Category', 'Price_y'],
      dtype='object')

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       1000 non-null   object 
 1   CustomerName     1000 non-null   object 
 2   Region           1000 non-null   object 
 3   SignupDate       1000 non-null   object 
 4   TransactionID    1000 non-null   object 
 5   ProductID        1000 non-null   object 
 6   TransactionDate  1000 non-null   object 
 7   Quantity         1000 non-null   int64  
 8   TotalValue       1000 non-null   float64
 9   Price_x          1000 non-null   float64
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


In [30]:
data.isnull().sum()

Unnamed: 0,0
CustomerID,0
CustomerName,0
Region,0
SignupDate,0
TransactionID,0
ProductID,0
TransactionDate,0
Quantity,0
TotalValue,0
Price_x,0


In [31]:
customer_profile = data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Price_x": "mean",    # Average transaction price
    "Price_y": "mean",    # Average product price
}).reset_index()

In [32]:
customer_profile.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Price_x,Price_y
0,C0001,3354.52,12,278.334,278.334
1,C0002,1862.74,10,208.92,208.92
2,C0003,2725.38,14,195.7075,195.7075
3,C0004,5354.88,23,240.63625,240.63625
4,C0005,2034.24,7,291.603333,291.603333


In [33]:
customer_profile.shape

(199, 5)

In [34]:
from sklearn.preprocessing import StandardScaler

In [35]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile.iloc[:, 1:])

In [36]:
scaled_features

array([[-6.17014282e-02, -1.22032964e-01,  9.46702248e-02,
         9.46702248e-02],
       [-8.77743532e-01, -4.48000209e-01, -9.04015921e-01,
        -9.04015921e-01],
       [-4.05857221e-01,  2.03934282e-01, -1.09410928e+00,
        -1.09410928e+00],
       [ 1.03254704e+00,  1.67078689e+00, -4.47701928e-01,
        -4.47701928e-01],
       [-7.83928612e-01, -9.36951078e-01,  2.85581271e-01,
         2.85581271e-01],
       [ 4.15879421e-01, -1.22032964e-01,  8.67287344e-01,
         8.67287344e-01],
       [-4.85482293e-01, -7.73967455e-01,  1.12604452e+00,
         1.12604452e+00],
       [ 4.39970436e-01,  1.18183602e+00, -5.84040413e-01,
        -5.84040413e-01],
       [-1.40630171e+00, -1.58888557e+00,  3.89602086e-01,
         3.89602086e-01],
       [-9.57166204e-01, -1.22032964e-01, -1.70467406e+00,
        -1.70467406e+00],
       [ 1.43695808e-01,  4.09506590e-02,  3.00323548e-01,
         3.00323548e-01],
       [ 9.64923716e-01,  1.01885240e+00, -1.37449092e-02,
      

In [37]:
from sklearn.neighbors import NearestNeighbors
import time
start_time = time.time()

In [38]:
knn = NearestNeighbors(n_neighbors=4, metric="euclidean")
knn.fit(scaled_features)

In [39]:
lookalike_results = {}
for idx, customer_id in enumerate(customer_profile["CustomerID"]):
    distances,indices = knn.kneighbors([scaled_features[idx]])
    lookalikes = [
        (customer_profile.iloc[indices[0][i]]["CustomerID"], round(distances[0][i], 2))
        for i in range(1, len(indices[0]))
    ]
    lookalike_results[customer_id] = lookalikes

In [40]:
lookalike_df = pd.DataFrame([
    {"CustomerID": customer_id, "Lookalikes": lookalikes}
    for customer_id, lookalikes in lookalike_results.items()
])

end_time = time.time()
print(f"Lookalike Model Runtime: {end_time - start_time:.2f} seconds")

Lookalike Model Runtime: 0.25 seconds


In [41]:
print(lookalike_df.head(20))

   CustomerID                                     Lookalikes
0       C0001  [(C0070, 0.13), (C0137, 0.27), (C0191, 0.27)]
1       C0002  [(C0029, 0.05), (C0172, 0.19), (C0157, 0.26)]
2       C0003  [(C0038, 0.19), (C0072, 0.35), (C0176, 0.39)]
3       C0004  [(C0068, 0.49), (C0041, 0.52), (C0113, 0.53)]
4       C0005   [(C0061, 0.17), (C0192, 0.2), (C0167, 0.23)]
5       C0006    [(C0026, 0.5), (C0184, 0.5), (C0048, 0.53)]
6       C0007  [(C0085, 0.16), (C0089, 0.21), (C0146, 0.27)]
7       C0008   [(C0084, 0.2), (C0017, 0.33), (C0039, 0.44)]
8       C0009  [(C0020, 0.18), (C0128, 0.18), (C0130, 0.23)]
9       C0010  [(C0094, 0.58), (C0144, 0.72), (C0142, 0.87)]
10      C0011    [(C0064, 0.17), (C0139, 0.2), (C0024, 0.2)]
11      C0012  [(C0093, 0.05), (C0101, 0.24), (C0182, 0.35)]
12      C0013  [(C0114, 0.23), (C0051, 0.39), (C0148, 0.42)]
13      C0014  [(C0097, 0.48), (C0032, 0.69), (C0083, 0.82)]
14      C0015  [(C0080, 0.45), (C0089, 0.91), (C0085, 0.93)]
15      C0016   [(C0040,

In [42]:
lookalike_df=lookalike_df.head(20)
lookalike_df.to_csv("Veerendra_Rokkam_Lookalike.csv", index=False)

In [43]:
lookalike_data=pd.read_csv("/content/FirstName_LastName_Lookalike.csv")

In [44]:
lookalike_data.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0070', 0.13), ('C0137', 0.27), ('C0191', 0..."
1,C0002,"[('C0029', 0.05), ('C0172', 0.19), ('C0157', 0..."
2,C0003,"[('C0038', 0.19), ('C0072', 0.35), ('C0176', 0..."
3,C0004,"[('C0068', 0.49), ('C0041', 0.52), ('C0113', 0..."
4,C0005,"[('C0061', 0.17), ('C0192', 0.2), ('C0167', 0...."


In [45]:
lookalike_data.shape

(20, 2)

In [46]:
lookalike_data.iloc[3]

Unnamed: 0,3
CustomerID,C0004
Lookalikes,"[('C0068', 0.49), ('C0041', 0.52), ('C0113', 0..."
