In [1]:
# Install the third-party packages this notebook imports (pandas, numpy, scikit-learn).
%pip install pandas numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 24.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the three raw tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Left-join from transactions so every transaction row is kept, first attaching
# the customer attributes and then the product attributes.
# Both sides of the second join carry a "Price" column, which pandas suffixes
# as Price_x (transaction side) and Price_y (product side).
transactions_with_customers = pd.merge(transactions, customers, how="left", on="CustomerID")
merged_data = pd.merge(transactions_with_customers, products, how="left", on="ProductID")

print("Merged Data:")
print(merged_data.head())

Merged Data:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Blue

In [4]:
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or "Unknown" if there is none."""
    modes = values.mode()  # computed once per group; mode() ignores missing values
    return modes.iloc[0] if not modes.empty else "Unknown"


# Build one feature row per customer from the merged transaction table:
#   TotalValue -> total spend, Quantity -> total units,
#   Price_x    -> mean of the transaction-side Price column,
#   Category   -> most frequently purchased category,
#   Region     -> first region value seen for the customer.
customer_features = merged_data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Price_x": "mean",
    "Category": _most_frequent,
    "Region": "first",
}).reset_index()

# One-hot encode the categorical columns, keeping every level. Dropping the
# first level (drop_first=True) would encode the reference category/region as
# all zeros, so customers sharing that level would get no credit for the match
# in the cosine similarity computed downstream. dtype=float keeps the whole
# feature matrix in a single numeric dtype.
customer_features = pd.get_dummies(
    customer_features,
    columns=["Category", "Region"],
    dtype=float,
)

# Standardise the numeric features so no single column dominates by scale.
scaler = StandardScaler()
numerical_cols = ["TotalValue", "Quantity", "Price_x"]
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])
print("Customer Features (after processing):")
print(customer_features.head())

KeyError: "['Price'] not in index"

In [None]:
# Pairwise cosine similarity between customers, computed on every feature
# column except the CustomerID identifier.
customer_ids = customer_features["CustomerID"]
feature_matrix = customer_features.drop(columns="CustomerID")
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so a customer's scores can be looked up by ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print("Similarity Matrix:")
print(similarity_df.head())

In [None]:
# How many leading customer IDs (C0001, C0002, ...) to produce lookalikes for,
# and how many lookalikes to keep per customer.
N_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

target_ids = [f"C{i:04d}" for i in range(1, N_TARGET_CUSTOMERS + 1)]
target_customers = customers.loc[customers["CustomerID"].isin(target_ids), "CustomerID"]

lookalike_map = {}
skipped_customers = []
for customer in target_customers:
    # The similarity matrix only has customers that appear in the feature
    # table; a customer listed in Customers.csv without a feature row would
    # otherwise raise a KeyError here.
    if customer not in similarity_df.columns:
        skipped_customers.append(customer)
        continue

    # Remove the customer's own entry by label instead of assuming it is ranked
    # first: with tied scores (e.g. identical feature vectors) the self-match
    # is not guaranteed to be at position 0 after sorting.
    similar_customers = (
        similarity_df[customer]
        .drop(labels=customer, errors="ignore")
        .nlargest(TOP_N_LOOKALIKES)
    )
    lookalike_map[customer] = list(zip(similar_customers.index, similar_customers.values))

# Flatten the map into one record per (customer, lookalike) pair.
lookalike_data = [
    {"cust_id": cust_id, "similar_cust_id": similar_cust, "score": score}
    for cust_id, lookalikes in lookalike_map.items()
    for similar_cust, score in lookalikes
]

# Explicit columns keep the CSV header stable even if no records were produced.
lookalike_df = pd.DataFrame(lookalike_data, columns=["cust_id", "similar_cust_id", "score"])
lookalike_df.to_csv("Lookalike.csv", index=False)

if skipped_customers:
    print(f"Skipped customers with no feature row: {skipped_customers}")
print("Lookalike Data:")
print(lookalike_df.head())