# Data Preparation for Modeling

In [21]:
#### Importing library
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder
import math

#### Loading Data
product_profile = pd.read_csv("product_profile.csv")
user_profile = pd.read_csv("user_profile.csv")
order = pd.read_csv("orders_clean.csv")

In [22]:
product_profile = product_profile.fillna(0)
product_profile

Unnamed: 0,id,category,price,title,vendor,average_rating,review_length,review_count
0,1,Gizmo,29.4633,Rustic Paper Wallet,"Swaniawski, Casper and Hilll",4.625000,172.750000,8.0
1,2,Doohickey,70.0799,Small Marble Shoes,Balistreri-Ankunding,0.000000,0.000000,0.0
2,3,Doohickey,35.3887,Synergistic Granite Chair,"Murray, Watsica and Wunsch",4.000000,171.000000,7.0
3,4,Doohickey,73.9918,Enormous Aluminum Shirt,Regan Bradtke and Sons,3.000000,167.600000,5.0
4,5,Gadget,82.7451,Enormous Marble Wallet,"Price, Schultz and Daniel",4.000000,146.750000,4.0
...,...,...,...,...,...,...,...,...
195,196,Widget,46.7641,Heavy-Duty Linen Toucan,Balistreri-Muller,0.000000,0.000000,0.0
196,197,Gizmo,46.7641,Aerodynamic Concrete Lamp,Erika Volkman Group,4.666667,162.833333,6.0
197,198,Gizmo,46.7641,Enormous Copper Shirt,"Considine, Schamberger and Schiller",4.142857,197.000000,7.0
198,199,Widget,76.9533,Mediocre Leather Coat,"Gulgowski, Grimes and Mayer",3.666667,185.333333,6.0


In [23]:
user_profile = user_profile.drop("Unnamed: 0", axis= 1)
user_profile = user_profile.fillna(0)
user_profile.head()

Unnamed: 0,id,name,state,latitude,longitude,source,dayduration,age,total_spent_Doohickey,total_spent_Gadget,total_spent_Gizmo,total_spent_Widget,total_orders,discount_usage_proportion,email_provider
0,1,Hudson Borer,NE,40.7132,-98.526,Twitter,2684,38,189.5193,389.5355,221.8629,1719.2326,11.0,0.272727,yahoo
1,2,Domenica Williamson,IA,41.5813,-92.6991,Affiliate,2500,57,0.0,0.0,0.0,0.0,0.0,0.0,yahoo
2,3,Lina Heaney,MN,46.1197,-92.8416,Facebook,2786,63,896.4755,126.91,695.0698,510.8554,10.0,0.2,yahoo
3,4,Arnold Adams,CO,37.9203,-104.973,Google,2182,32,149.891,0.0,214.7897,150.5928,4.0,0.25,gmail
4,5,Dominique Leffler,NY,42.349,-77.0567,Twitter,2716,50,0.0,0.0,332.208,0.0,1.0,0.0,hotmail


In [24]:
interactions = order[["user_id", "product_id"]]
interactions = interactions.sort_index()
interactions = pd.read_csv("interaction.csv")
interactions.tail()

Unnamed: 0.1,Unnamed: 0,user_id,product_id
35756,,1861,136
35757,,1861,136
35758,,554,197
35759,,554,197
35760,,554,197


### Creating Modeling Preprocessing Pipeline

In [25]:
#Dropping Unnecessary columns and records
user_profile = user_profile.drop(["longitude","state","name"], axis= 1)
user_profile = user_profile.loc[user_profile['total_orders'] != 0]
product_profile = product_profile.drop(["title","vendor"],axis= 1)

In [26]:
user_profile = pd.get_dummies(user_profile, columns=['source', 'email_provider'])
product_profile = pd.get_dummies(product_profile, columns=['category'])


In [27]:
# Standardize numerical features
scaler = StandardScaler()
numeric_cols_user = ['latitude', 'dayduration', 'age', 'total_spent_Doohickey',
                     'total_spent_Gadget', 'total_spent_Gizmo', 'total_spent_Widget', 'total_orders', 'discount_usage_proportion']
user_profile[numeric_cols_user] = scaler.fit_transform(user_profile[numeric_cols_user])

numeric_cols_product = ['price', 'average_rating', 'review_length', 'review_count']
product_profile[numeric_cols_product] = scaler.fit_transform(product_profile[numeric_cols_product])

In [28]:
product_profile.columns

Index(['id', 'price', 'average_rating', 'review_length', 'review_count',
       'category_Doohickey', 'category_Gadget', 'category_Gizmo',
       'category_Widget'],
      dtype='object')

In [29]:
user_profile.columns

Index(['id', 'latitude', 'dayduration', 'age', 'total_spent_Doohickey',
       'total_spent_Gadget', 'total_spent_Gizmo', 'total_spent_Widget',
       'total_orders', 'discount_usage_proportion', 'source_Affiliate',
       'source_Facebook', 'source_Google', 'source_Organic', 'source_Twitter',
       'email_provider_gmail', 'email_provider_hotmail',
       'email_provider_yahoo'],
      dtype='object')

# Creating Neural Collaborative Filtering model

## Modeling Process

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ==========================
# 1. Data Preparation
# ==========================

# For reproducibility:
torch.manual_seed(42)
np.random.seed(42)

# ---- Split interactions into train and test (backtesting) ----
# For each user, the last interaction (by the ordering of interactions) is held out as test.
# Count the number of interactions for each user
user_interaction_count = interactions.groupby('user_id').size()

# Filter users who have more than 1 interaction
valid_users = user_interaction_count[user_interaction_count > 1].index

# Select only the interactions of valid users
valid_interactions = interactions[interactions['user_id'].isin(valid_users)]

# Get the last interaction for each user
test_interactions = valid_interactions.groupby('user_id').tail(2)

# Get the rest as the training set
train_interactions = valid_interactions.drop(test_interactions.index)

# ---- Create mapping dictionaries for users and products ----
user_ids = user_profile['id'].unique()
product_ids = product_profile['id'].unique()
user_map = {uid: i for i, uid in enumerate(user_ids)}
product_map = {pid: i for i, pid in enumerate(product_ids)}

# ---- Prepare features for users and products ----
# Choose which columns to use.
# (For the categorical features you might want to apply encoding; here we assume they are already numeric.)
user_feature_cols = ['latitude', 'dayduration', 'age', 'total_spent_Doohickey',
       'total_spent_Gadget', 'total_spent_Gizmo', 'total_spent_Widget',
       'total_orders', 'discount_usage_proportion', 'source_Affiliate',
       'source_Facebook', 'source_Google', 'source_Organic', 'source_Twitter',
       'email_provider_gmail', 'email_provider_hotmail',
       'email_provider_yahoo']

product_feature_cols = ['price', 'average_rating', 'review_length', 'review_count',
       'category_Doohickey', 'category_Gadget', 'category_Gizmo',
       'category_Widget']

# Set indices for fast lookup
user_profile_indexed = user_profile.set_index('id')
product_profile_indexed = product_profile.set_index('id')

def get_user_features(uid):
    """Return the feature vector for a given user id as a NumPy array."""
    return user_profile_indexed.loc[uid, user_feature_cols].values.astype(np.float32)

def get_product_features(pid):
    """Return the feature vector for a given product id as a NumPy array."""
    return product_profile_indexed.loc[pid, product_feature_cols].values.astype(np.float32)

# Build a product features matrix for all products (ordered by product_map)
num_products = len(product_map)
product_features_matrix = np.zeros((num_products, len(product_feature_cols)), dtype=np.float32)
for pid, idx in product_map.items():
    product_features_matrix[idx] = get_product_features(pid)
# Convert to a PyTorch tensor (this tensor is fixed and will be used to generate product embeddings)
product_features_tensor = torch.tensor(product_features_matrix)

# ==========================
# 2. Create PyTorch Dataset
# ==========================

class RecommenderDataset(Dataset):
    def __init__(self, interactions_df):
        # Reset index so that we can iterate row-by-row
        self.interactions_df = interactions_df.reset_index(drop=True)
        
    def __len__(self):
        return len(self.interactions_df)
    
    def __getitem__(self, idx):
        row = self.interactions_df.iloc[idx]
        uid = row['user_id']
        pid = row['product_id']
        user_feat = get_user_features(uid)
        # The label is the product index (for multiclass classification)
        label = product_map[pid]
        return torch.tensor(user_feat), torch.tensor(label).long()
        
# Create training and testing datasets and loaders
train_dataset = RecommenderDataset(train_interactions)
test_dataset = RecommenderDataset(test_interactions)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# ==========================
# 3. Define the Two-Tower Model
# ==========================

class RecommenderModel(nn.Module):
    def __init__(self, user_input_dim, product_input_dim, embedding_dim, product_features_tensor):
        """
        Args:
            user_input_dim: Dimension of the user feature vector.
            product_input_dim: Dimension of the product feature vector.
            embedding_dim: Size of the common embedding space.
            product_features_tensor: A tensor of shape (num_products, product_input_dim)
                                     containing the features for each product.
        """
        super(RecommenderModel, self).__init__()
        # --- User Tower ---
        self.user_net = nn.Sequential(
            nn.Linear(user_input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )
        # --- Product Tower ---
        self.product_net = nn.Sequential(
            nn.Linear(product_input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )
        # Register the product features tensor as a buffer so it is part of the graph
        self.register_buffer('product_features', product_features_tensor)
        
    def forward(self, user_features):
        """
        Given a batch of user features (shape: [batch_size, user_input_dim]),
        compute the user embedding and then score all products via dot-product.
        Returns:
            logits: Tensor of shape [batch_size, num_products] (raw scores).
        """
        # Compute user embedding for the batch.
        user_emb = self.user_net(user_features)  # shape: (batch, embedding_dim)
        # Compute product embeddings for all products (using the fixed product features).
        product_emb = self.product_net(self.product_features)  # shape: (num_products, embedding_dim)
        # Compute dot-product between each user and all products.
        logits = torch.matmul(user_emb, product_emb.t())  # shape: (batch, num_products)
        # (Optionally, you could apply softmax here to get probabilities, but for training with CrossEntropyLoss it’s not needed.)
        return logits

# Set hyperparameters
user_input_dim = len(user_feature_cols)
product_input_dim = len(product_feature_cols)
embedding_dim = 32  # You can tune this

model = RecommenderModel(user_input_dim, product_input_dim, embedding_dim, product_features_tensor)

# ==========================
# 4. Training Setup
# ==========================

criterion = nn.CrossEntropyLoss()  # This loss expects raw logits.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_user, batch_label in train_loader:
            optimizer.zero_grad()
            logits = model(batch_user)  # shape: (batch, num_products)
            loss = criterion(logits, batch_label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer, epochs=40)



Epoch 1, Loss: 4.0211
Epoch 2, Loss: 4.1048
Epoch 3, Loss: 4.0195
Epoch 4, Loss: 4.0750
Epoch 5, Loss: 4.0829
Epoch 6, Loss: 4.1209
Epoch 7, Loss: 4.0896
Epoch 8, Loss: 4.0796
Epoch 9, Loss: 4.0721
Epoch 10, Loss: 4.0801
Epoch 11, Loss: 4.1072
Epoch 12, Loss: 4.1324
Epoch 13, Loss: 4.0946
Epoch 14, Loss: 4.0708
Epoch 15, Loss: 4.0855
Epoch 16, Loss: 4.0767
Epoch 17, Loss: 4.0504
Epoch 18, Loss: 4.0175
Epoch 19, Loss: 4.0404
Epoch 20, Loss: 4.0291
Epoch 21, Loss: 4.0615
Epoch 22, Loss: 4.0615
Epoch 23, Loss: 4.0624
Epoch 24, Loss: 4.0452
Epoch 25, Loss: 4.0564
Epoch 26, Loss: 4.0822
Epoch 27, Loss: 4.0557
Epoch 28, Loss: 4.0451
Epoch 29, Loss: 4.0358
Epoch 30, Loss: 4.0368
Epoch 31, Loss: 4.0394
Epoch 32, Loss: 4.0412
Epoch 33, Loss: 4.0469
Epoch 34, Loss: 4.0337
Epoch 35, Loss: 4.0283
Epoch 36, Loss: 4.0432
Epoch 37, Loss: 4.0405
Epoch 38, Loss: 4.0325
Epoch 39, Loss: 4.0541
Epoch 40, Loss: 4.0372


## Model Evaluation

### Evaluation on Test Set


In [31]:
def hit_rate_at_k(model, test_loader, k=5):
    """
    Computes the Hit Rate@K metric over the test set.
    For each test example, if the true product label is among the top-K predicted products, 
    it counts as a hit.
    """
    model.eval()
    hits = 0
    total = 0
    with torch.no_grad():
        for batch_user, batch_label in test_loader:
            logits = model(batch_user)  # (batch, num_products)
            # (Optional) If you need probabilities, you can do:
            # probs = torch.softmax(logits, dim=1)
            # Get indices of the top K predictions.
            topk = torch.topk(logits, k, dim=1).indices  # shape: (batch, k)
            for i in range(len(batch_label)):
                total += 1
                if batch_label[i] in topk[i]:
                    hits += 1
    return hits / total if total > 0 else 0

hit_rate = hit_rate_at_k(model, test_loader, k=100)
print(f"Hit Rate @ 5: {hit_rate:.4f}")

Hit Rate @ 5: 0.5382


### Evaluate on Train Set

In [32]:
def hit_rate_at_k_train(model, train_loader, k=5):
    """
    Computes the Hit Rate@K metric over the training set.
    For each train example, if the true product label is among the top-K predicted products, 
    it counts as a hit.
    """
    model.eval()
    hits = 0
    total = 0
    with torch.no_grad():
        for batch_user, batch_label in train_loader:
            logits = model(batch_user)  # shape: (batch, num_products)
            # (Optional) If you need probabilities, you can do:
            # probs = torch.softmax(logits, dim=1)
            # Get indices of the top K predictions.
            topk = torch.topk(logits, k, dim=1).indices  # shape: (batch, k)
            for i in range(len(batch_label)):
                total += 1
                if batch_label[i] in topk[i]:
                    hits += 1
    return hits / total if total > 0 else 0

# Compute Hit Rate @ 5 for the training set
hit_rate_train = hit_rate_at_k_train(model, train_loader, k=100)
print(f"Hit Rate @ 5 (Training Set): {hit_rate_train:.4f}")

Hit Rate @ 5 (Training Set): 0.7589
