In [9]:
%pip install pandas numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets to form a comprehensive dataset
merged_data = transactions.merge(customers, on="CustomerID", how="left").merge(products, on="ProductID", how="left")

# Display the merged data structure
print("Merged Data:")
print(merged_data.head())

Merged Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Blue

In [16]:
# Aggregate transaction and customer-level features
customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",  
    "Quantity": "sum",   
    "Price_x": "mean",      
    "Category": lambda x: x.mode()[0] if not x.mode().empty else "Unknown",  
    "Region": "first"     
}).reset_index()

customer_features = pd.get_dummies(customer_features, columns=["Category", "Region"], drop_first=True)

scaler = StandardScaler()
numerical_cols = ["TotalValue", "Quantity", "Price_x"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

print("Customer Features (after processing):")
print(customer_features.head())

Customer Features (after processing):
  CustomerID  TotalValue  Quantity   Price_x  Category_Clothing  \
0      C0001   -0.061701 -0.122033  0.094670              False   
1      C0002   -0.877744 -0.448000 -0.904016               True   
2      C0003   -0.405857  0.203934 -1.094109              False   
3      C0004    1.032547  1.670787 -0.447702              False   
4      C0005   -0.783929 -0.936951  0.285581              False   

   Category_Electronics  Category_Home Decor  Region_Europe  \
0                  True                False          False   
1                 False                False          False   
2                 False                 True          False   
3                 False                False          False   
4                  True                False          False   

   Region_North America  Region_South America  
0                 False                  True  
1                 False                 False  
2                 False             

In [17]:

similarity_matrix = cosine_similarity(customer_features.drop("CustomerID", axis=1))

similarity_df = pd.DataFrame(similarity_matrix, index=customer_features["CustomerID"], columns=customer_features["CustomerID"])

print("Similarity Matrix:")
print(similarity_df.head())

Similarity Matrix:
CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.009776  0.341298  0.215460  0.520779  0.540162   
C0002       0.009776  1.000000  0.407057 -0.332868  0.317156 -0.470541   
C0003       0.341298  0.407057  1.000000  0.340197 -0.062630 -0.055488   
C0004       0.215460 -0.332868  0.340197  1.000000 -0.693617  0.267270   
C0005       0.520779  0.317156 -0.062630 -0.693617  1.000000  0.016110   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.490787 -0.080280  0.086452 -0.025414  ...  0.690983  0.845277   
C0002      -0.083328 -0.117085  0.602212  0.851661  ...  0.112950  0.276806   
C0003      -0.367070  0.465148 -0.037727  0.500034  ...  0.433279  0.279602   
C0004      -0.580216  0.603312 -0.737951 -0.078958  ...  0.075942 -

In [18]:

target_customers = customers.loc[customers["CustomerID"].isin([f"C{str(i).zfill(4)}" for i in range(1, 21)]), "CustomerID"]

lookalike_map = {}
for customer in target_customers:
    similar_customers = similarity_df[customer].sort_values(ascending=False).iloc[1:4]  
    lookalike_map[customer] = list(zip(similar_customers.index, similar_customers.values))

lookalike_data = []
for cust_id, lookalikes in lookalike_map.items():
    for similar_cust, score in lookalikes:
        lookalike_data.append({"cust_id": cust_id, "similar_cust_id": similar_cust, "score": score})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Data:")
print(lookalike_df.head())

Lookalike Data:
  cust_id similar_cust_id     score
0   C0001           C0181  0.934652
1   C0001           C0120  0.897531
2   C0001           C0184  0.864073
3   C0002           C0088  0.984787
4   C0002           C0077  0.900746
