# Importing libraries

In [29]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Loading Datasets

In [30]:
# Directory holding the three input CSVs. Raw strings keep the Windows
# backslashes literal: in a normal string, sequences such as "\D", "\Z" and
# "\T" are invalid escapes that newer Python versions warn about.
# NOTE(review): this is still a machine-specific absolute path — point
# DATA_DIR at your own copy of the data before running.
DATA_DIR = r"D:\DS with Py\Zeotap Task"

Customers = pd.read_csv(rf"{DATA_DIR}\Customers.csv")
Products = pd.read_csv(rf"{DATA_DIR}\Products.csv")
Transactions = pd.read_csv(rf"{DATA_DIR}\Transactions.csv")

# Merge Datasets

In [31]:
# Join every transaction to its customer row, then to its product row.
# Inner joins keep only transactions whose CustomerID and ProductID both
# have a match in the lookup tables.
merged_data = (
    Transactions
    .merge(Customers, how='inner', on='CustomerID')
    .merge(Products, how='inner', on='ProductID')
)

In [32]:
# Inspect the joined table. Price exists in both Transactions and Products,
# so the merge suffixes the two copies as Price_x and Price_y.
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00630,C0031,P093,2024-10-08 23:58:14,2,609.88,304.94,Tina Miller,South America,2024-04-11,TechPro Vase,Home Decor,304.94
996,T00672,C0165,P044,2024-07-28 00:09:49,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
997,T00711,C0165,P044,2024-06-11 15:51:14,4,75.28,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82
998,T00878,C0165,P044,2024-09-24 21:15:21,3,56.46,18.82,Juan Mcdaniel,South America,2022-04-09,ActiveWear Running Shoes,Clothing,18.82


In [33]:
# The merged frame carries Price_x / Price_y but may have no plain 'Price'
# column; when it is absent, derive a per-unit price from the line totals.
price_missing = 'Price' not in merged_data.columns
if price_missing:
    merged_data['Price'] = merged_data['TotalValue'].div(merged_data['Quantity'])

# Feature Engineering

In [34]:
# Aggregate transaction data per customer

In [35]:
# Collapse the transaction-level rows into one feature row per customer:
#   TotalValue -> total spend
#   Quantity   -> total units bought
#   Price      -> mean unit price across the customer's transactions
#   Category   -> first entry of Series.mode() over the purchased categories
Customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price=('Price', 'mean'),
        Category=('Category', lambda purchases: purchases.mode()[0]),
    )
    .reset_index()
)

In [36]:
# Preview the per-customer aggregates (one row per CustomerID).
Customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Price,Category
0,C0001,3354.52,12,278.334000,Electronics
1,C0002,1862.74,10,208.920000,Clothing
2,C0003,2725.38,14,195.707500,Home Decor
3,C0004,5354.88,23,240.636250,Books
4,C0005,2034.24,7,291.603333,Electronics
...,...,...,...,...,...
194,C0196,4982.88,12,416.992500,Home Decor
195,C0197,1928.65,9,227.056667,Electronics
196,C0198,931.83,3,239.705000,Clothing
197,C0199,1979.28,9,250.610000,Electronics


In [37]:
# Attach each customer's Region, then one-hot encode the two categorical
# columns (Region and the most frequent Category) into indicator columns.
# This cell rebinds Customer_features and consumes its 'Category' column, so
# re-run the aggregation cell first if this cell needs to be executed again.
region_lookup = Customers[['CustomerID', 'Region']]
Customer_features = pd.get_dummies(
    Customer_features.merge(region_lookup, on='CustomerID'),
    columns=['Region', 'Category'],
)

In [38]:
# Preview the feature table after one-hot encoding Region and Category.
Customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Price,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,12,278.334000,False,False,False,True,False,False,True,False
1,C0002,1862.74,10,208.920000,True,False,False,False,False,True,False,False
2,C0003,2725.38,14,195.707500,False,False,False,True,False,False,False,True
3,C0004,5354.88,23,240.636250,False,False,False,True,True,False,False,False
4,C0005,2034.24,7,291.603333,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,12,416.992500,False,True,False,False,False,False,False,True
195,C0197,1928.65,9,227.056667,False,True,False,False,False,False,True,False
196,C0198,931.83,3,239.705000,False,True,False,False,False,True,False,False
197,C0199,1979.28,9,250.610000,False,True,False,False,False,False,True,False


In [40]:
# Standardise every feature column -- the numeric aggregates and the one-hot
# indicator columns alike -- before computing cosine similarity. CustomerID is
# an identifier rather than a feature, so it is dropped first.
feature_columns = Customer_features.drop(columns='CustomerID')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_columns)

In [41]:
# Inspect the standardised feature matrix (rows follow Customer_features order).
scaled_features

array([[-0.06170143, -0.12203296,  0.09467022, ..., -0.54056248,
         1.84992492, -0.51721942],
       [-0.87774353, -0.44800021, -0.90401592, ...,  1.84992492,
        -0.54056248, -0.51721942],
       [-0.40585722,  0.20393428, -1.09410928, ..., -0.54056248,
        -0.54056248,  1.93341543],
       ...,
       [-1.38697529, -1.58888557, -0.46110018, ...,  1.84992492,
        -0.54056248, -0.51721942],
       [-0.81399315, -0.61098383, -0.30420572, ..., -0.54056248,
         1.84992492, -0.51721942],
       [ 0.70636652,  0.52990153,  0.35611784, ...,  1.84992492,
        -0.54056248, -0.51721942]])

In [43]:
# Pairwise cosine similarity between all customers' scaled feature vectors.
# Entry [i, j] is the similarity of row i to row j of scaled_features, so the
# diagonal is each customer compared with itself.
similarity_matrix = cosine_similarity(scaled_features)
similarity_matrix

array([[ 1.        , -0.27858075,  0.23062831, ..., -0.2373934 ,
         0.3660242 , -0.30170922],
       [-0.27858075,  1.        , -0.12880515, ...,  0.46534525,
        -0.12139046,  0.75812147],
       [ 0.23062831, -0.12880515,  1.        , ..., -0.16736918,
        -0.2171534 , -0.32129318],
       ...,
       [-0.2373934 ,  0.46534525, -0.16736918, ...,  1.        ,
         0.44524922,  0.10707049],
       [ 0.3660242 , -0.12139046, -0.2171534 , ...,  0.44524922,
         1.        , -0.35857678],
       [-0.30170922,  0.75812147, -0.32129318, ...,  0.10707049,
        -0.35857678,  1.        ]])

In [45]:
# Customer IDs as a plain list in Customer_features row order, plus an empty
# dict that will map each target customer to its list of lookalikes.
customer_ids = Customer_features['CustomerID'].to_list()
lookalike_map = dict()

In [46]:
# Display the ID list; position k corresponds to row/column k of similarity_matrix.
customer_ids

['C0001',
 'C0002',
 'C0003',
 'C0004',
 'C0005',
 'C0006',
 'C0007',
 'C0008',
 'C0009',
 'C0010',
 'C0011',
 'C0012',
 'C0013',
 'C0014',
 'C0015',
 'C0016',
 'C0017',
 'C0018',
 'C0019',
 'C0020',
 'C0021',
 'C0022',
 'C0023',
 'C0024',
 'C0025',
 'C0026',
 'C0027',
 'C0028',
 'C0029',
 'C0030',
 'C0031',
 'C0032',
 'C0033',
 'C0034',
 'C0035',
 'C0036',
 'C0037',
 'C0038',
 'C0039',
 'C0040',
 'C0041',
 'C0042',
 'C0043',
 'C0044',
 'C0045',
 'C0046',
 'C0047',
 'C0048',
 'C0049',
 'C0050',
 'C0051',
 'C0052',
 'C0053',
 'C0054',
 'C0055',
 'C0056',
 'C0057',
 'C0058',
 'C0059',
 'C0060',
 'C0061',
 'C0062',
 'C0063',
 'C0064',
 'C0065',
 'C0066',
 'C0067',
 'C0068',
 'C0069',
 'C0070',
 'C0071',
 'C0072',
 'C0073',
 'C0074',
 'C0075',
 'C0076',
 'C0077',
 'C0078',
 'C0079',
 'C0080',
 'C0081',
 'C0082',
 'C0083',
 'C0084',
 'C0085',
 'C0086',
 'C0087',
 'C0088',
 'C0089',
 'C0090',
 'C0091',
 'C0092',
 'C0093',
 'C0094',
 'C0095',
 'C0096',
 'C0097',
 'C0098',
 'C0099',
 'C0100',


In [47]:
# Build the top-3 lookalikes for the first 20 customers (rows 0 to 19).
N_TARGETS = 20  # how many leading customers get recommendations
TOP_N = 3       # lookalikes kept per customer

# min() avoids an IndexError when fewer than N_TARGETS customers have features.
for i in range(min(N_TARGETS, len(customer_ids))):
    customer_id = customer_ids[i]
    # Exclude the customer's own column by index instead of assuming it sorts
    # first: another customer with an identical feature vector also scores
    # ~1.0, and the stable sort would place a lower-indexed duplicate ahead of
    # self, so a positional [1:4] slice could return the customer as their own
    # lookalike. float() turns NumPy scalars into plain Python floats so the
    # tuples written to the CSV read as bare numbers on any NumPy version.
    similarities = [
        (j, float(score))
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    similarities.sort(key=lambda pair: pair[1], reverse=True)  # Highest similarity first
    top_3_similar = [(customer_ids[j], round(score, 4)) for j, score in similarities[:TOP_N]]
    lookalike_map[customer_id] = top_3_similar

In [48]:
# Turn the {customer -> lookalikes} mapping into a two-column table, one row
# per target customer, and write it to Lookalike.csv without the index column.
records = [
    {'CustomerID': cust_id, 'Lookalikes': matches}
    for cust_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(records)
lookalike_df.to_csv('Lookalike.csv', index=False)

In [52]:
# Accuracy on a small hand-entered example.
# NOTE(review): y_true and y_pred are hard-coded five-element literals; nothing
# in this cell reads similarity_matrix or lookalike_map, so the printed
# percentage does not measure the lookalike recommendations built earlier.

from sklearn.metrics import accuracy_score

y_true = [1, 0, 1, 1, 0]  # Hand-entered reference labels (1 for similar, 0 for not similar)
y_pred = [1, 0, 1, 0, 0]  # Hand-entered predicted labels
accuracy = accuracy_score(y_true, y_pred)  # share of positions where the two lists agree
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 80.00%


In [53]:
# Precision on the same hand-entered y_true / y_pred lists used for accuracy.
# NOTE(review): those lists are literals, not outputs of the similarity model,
# so this figure says nothing about the quality of the lookalikes.

from sklearn.metrics import precision_score

precision = precision_score(y_true, y_pred)
print(f"Precision: {precision:.2f}")


Precision: 1.00


In [54]:
# Write the lookalike table to Lookalike.csv and confirm. The cell that builds
# lookalike_df issues the same to_csv call, so this overwrites that file.
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike.csv file created successfully.")

Lookalike.csv file created successfully.


In [49]:
# Show the first five rows of the lookalike table.
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0181, 0.9826), (C0120, 0.9674), (C0184, 0.9..."
1,C0002,"[(C0088, 0.9946), (C0106, 0.9644), (C0134, 0.9..."
2,C0003,"[(C0031, 0.9525), (C0052, 0.9473), (C0195, 0.9..."
3,C0004,"[(C0165, 0.9639), (C0169, 0.9554), (C0087, 0.9..."
4,C0005,"[(C0140, 0.9954), (C0186, 0.9925), (C0146, 0.9..."
