# Data Preparation

In [143]:
#### Importing library
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat
import plotly.graph_objects as go
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.nn import SAGEConv
from sklearn.model_selection import train_test_split


#### Loading Data
product_profile = pd.read_csv("product_profile.csv")
user_profile = pd.read_csv("user_profile.csv")
order = pd.read_csv("orders_clean.csv")

In [144]:
product_profile = product_profile.fillna(0)
product_profile

Unnamed: 0,id,category,price,title,vendor,average_rating,review_length,review_count
0,1,Gizmo,29.4633,Rustic Paper Wallet,"Swaniawski, Casper and Hilll",4.625000,172.750000,8.0
1,2,Doohickey,70.0799,Small Marble Shoes,Balistreri-Ankunding,0.000000,0.000000,0.0
2,3,Doohickey,35.3887,Synergistic Granite Chair,"Murray, Watsica and Wunsch",4.000000,171.000000,7.0
3,4,Doohickey,73.9918,Enormous Aluminum Shirt,Regan Bradtke and Sons,3.000000,167.600000,5.0
4,5,Gadget,82.7451,Enormous Marble Wallet,"Price, Schultz and Daniel",4.000000,146.750000,4.0
...,...,...,...,...,...,...,...,...
195,196,Widget,46.7641,Heavy-Duty Linen Toucan,Balistreri-Muller,0.000000,0.000000,0.0
196,197,Gizmo,46.7641,Aerodynamic Concrete Lamp,Erika Volkman Group,4.666667,162.833333,6.0
197,198,Gizmo,46.7641,Enormous Copper Shirt,"Considine, Schamberger and Schiller",4.142857,197.000000,7.0
198,199,Widget,76.9533,Mediocre Leather Coat,"Gulgowski, Grimes and Mayer",3.666667,185.333333,6.0


In [145]:
user_profile = user_profile.drop("Unnamed: 0", axis= 1)
user_profile = user_profile.fillna(0)
user_profile.head()

Unnamed: 0,id,name,state,latitude,longitude,source,dayduration,age,total_spent_Doohickey,total_spent_Gadget,total_spent_Gizmo,total_spent_Widget,total_orders,discount_usage_proportion,email_provider
0,1,Hudson Borer,NE,40.7132,-98.526,Twitter,2684,38,189.5193,389.5355,221.8629,1719.2326,11.0,0.272727,yahoo
1,2,Domenica Williamson,IA,41.5813,-92.6991,Affiliate,2500,57,0.0,0.0,0.0,0.0,0.0,0.0,yahoo
2,3,Lina Heaney,MN,46.1197,-92.8416,Facebook,2786,63,896.4755,126.91,695.0698,510.8554,10.0,0.2,yahoo
3,4,Arnold Adams,CO,37.9203,-104.973,Google,2182,32,149.891,0.0,214.7897,150.5928,4.0,0.25,gmail
4,5,Dominique Leffler,NY,42.349,-77.0567,Twitter,2716,50,0.0,0.0,332.208,0.0,1.0,0.0,hotmail


In [146]:
interactions = order[["user_id", "product_id"]]
interactions = interaction.sort_index()
interactions.head(20)

Unnamed: 0,user_id,product_id
0,1,14
1,1,123
2,1,105
3,1,94
4,1,132
5,1,60
6,1,55
7,1,65
8,1,184
9,1,6


### Creating Modeling Preprocessing Pipeline

In [147]:
#Dropping Unnecessary columns and records
user_profile = user_profile.drop(["longitude","state","name"], axis= 1)
user_profile = user_profile.loc[user_profile['total_orders'] != 0]
product_profile = product_profile.drop(["title","vendor"],axis= 1)

In [148]:
# Encode categorical features
label_encoders = {}
for col in ['source', 'email_provider']:
    le = LabelEncoder()
    user_profile[col] = le.fit_transform(user_profile[col].astype(str))
    label_encoders[col] = le

for col in ['category']:
    le = LabelEncoder()
    product_profile[col] = le.fit_transform(product_profile[col].astype(str))
    label_encoders[col] = le

In [149]:
# Standardize numerical features
scaler = StandardScaler()
numeric_cols_user = ['latitude', 'dayduration', 'age', 'total_spent_Doohickey',
                     'total_spent_Gadget', 'total_spent_Gizmo', 'total_spent_Widget', 'total_orders', 'discount_usage_proportion']
user_profile[numeric_cols_user] = scaler.fit_transform(user_profile[numeric_cols_user])

numeric_cols_product = ['price', 'average_rating', 'review_length', 'review_count']
product_profile[numeric_cols_product] = scaler.fit_transform(product_profile[numeric_cols_product])

In [150]:
product_profile.columns

Index(['id', 'category', 'price', 'average_rating', 'review_length',
       'review_count'],
      dtype='object')

In [151]:
user_profile.columns

Index(['id', 'latitude', 'source', 'dayduration', 'age',
       'total_spent_Doohickey', 'total_spent_Gadget', 'total_spent_Gizmo',
       'total_spent_Widget', 'total_orders', 'discount_usage_proportion',
       'email_provider'],
      dtype='object')

# Creating Neural Collaborative Filtering model

## Train Test Split

In [152]:
# Get last interaction for each user for test set
test_interactions = interactions.groupby('user_id').tail(1)
train_interactions = interactions.drop(test_interactions.index)

# Merge with user and product profiles
train_data = train_interactions.merge(user_profile, left_on='user_id', right_on='id')
train_data = train_data.merge(product_profile, left_on='product_id', right_on='id')
test_data = test_interactions.merge(user_profile, left_on='user_id', right_on='id')
test_data = test_data.merge(product_profile, left_on='product_id', right_on='id')

# Drop redundant ID columns
train_data.drop(columns=['id_x', 'id_y'], inplace=True)
test_data.drop(columns=['id_x', 'id_y'], inplace=True)

# Convert to numpy arrays
X_train = train_data.drop(columns=['user_id', 'product_id']).values
X_test = test_data.drop(columns=['user_id', 'product_id']).values

### Defining Model

In [153]:
def generate_target_matrix(interactions_df, user_map, product_map, num_users, num_products):
    """Generates a user-product interaction matrix with mapped indices."""
    interaction_matrix = np.zeros((num_users, num_products))

    for _, row in interactions_df.iterrows():
        user_idx = user_map.get(row['user_id'], -1)
        product_idx = product_map.get(row['product_id'], -1)

        if user_idx == -1 or product_idx == -1:
            continue  # Ignore out-of-bounds indices

        interaction_matrix[user_idx, product_idx] = 1  # Implicit feedback

    return interaction_matrix

# Create user and product index mappings
user_map = {uid: i for i, uid in enumerate(user_profile['id'].unique())}
product_map = {pid: i for i, pid in enumerate(product_profile['id'].unique())}

# Recalculate correct matrix size
num_users = len(user_map)
num_products = len(product_map)

print(num_users,num_products)

1746 200


##### Creating y_train and test

In [154]:
# Apply function with mappings
interaction_matrix = generate_target_matrix(train_interactions, user_map, product_map, num_users, num_products)
# Extract target labels (y_train) based on interactions
y_train = []
for _, row in train_interactions.iterrows():
    user_idx = user_map.get(row['user_id'], -1)
    product_idx = product_map.get(row['product_id'], -1)

    if user_idx == -1 or product_idx == -1:
        continue  # Skip invalid indices

    y_train.append(interaction_matrix[user_idx, product_idx])  # Extract correct label

y_train = np.array(y_train)  # Convert to NumPy array

y_test = []
for _, row in test_interactions.iterrows():
    user_idx = user_map.get(row['user_id'], -1)
    product_idx = product_map.get(row['product_id'], -1)

    if user_idx == -1 or product_idx == -1:
        continue  # Skip invalid indices

    y_test.append(interaction_matrix[user_idx, product_idx])  # Extract correct label

y_test = np.array(y_test)  # Convert to NumPy array

# Ensure that X_test and y_test have the same length
assert len(X_test) == len(y_test), f"Mismatch: X_test has {len(X_test)} rows, y_test has {len(y_test)} labels."

# Ensure that X_train and y_train have the same length
assert len(X_train) == len(y_train), f"Mismatch: X_train has {len(X_train)} rows, y_train has {len(y_train)} labels."

In [None]:
class InteractionDataset(Dataset):
    def __init__(self, X, y):
        assert len(X) == len(y), "Mismatch: X and y must have the same number of samples"
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_dataset = InteractionDataset(X_train, y_train)
test_dataset = InteractionDataset(X_test, y_test)
train_dataset = InteractionDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

test_dataset = InteractionDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the neural network model
class CollaborativeFilteringNN(nn.Module):
    def __init__(self, input_dim):
        super(CollaborativeFilteringNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)  # Output interaction score
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize model
input_dim = X_train.shape[1]
model = CollaborativeFilteringNN(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

TypeError: CollaborativeFilteringNN.__init__() missing 1 required positional argument: 'num_products'

### Training

In [None]:
for batch_X, batch_y in train_loader:
    print(f"Batch X shape: {batch_X.shape}, Batch y shape: {batch_y.shape}")
    break  # Just print one batch and stop

Batch X shape: torch.Size([32, 16]), Batch y shape: torch.Size([32])


In [None]:
print(f"X_train shape: {X_train.shape}")  # Should be (1746, feature_dim)
print(f"interaction_matrix shape: {interaction_matrix.shape}")  # Should be (1746,)

X_train shape: (17014, 16)
interaction_matrix shape: (1746, 200)


In [None]:
y_train = []
for _, row in train_interactions.iterrows():
    user_idx = user_map[row['user_id']]
    product_idx = product_map[row['product_id']]
    y_train.append(interaction_matrix[user_idx, product_idx])

y_train = np.array(y_train)  # Ensure it's a NumPy array
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (17014, 16), y_train shape: (17014,)


In [None]:
# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X).squeeze()
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader)}")

train_model(model, train_loader, criterion, optimizer, epochs=10)



Epoch 1, Loss: 0.01582523186707827
Epoch 2, Loss: 4.2829155000723065e-05
Epoch 3, Loss: 1.2416876679219668e-05
Epoch 4, Loss: 5.466177877765903e-06
Epoch 5, Loss: 2.8939200182166337e-06
Epoch 6, Loss: 1.697057590522746e-06
Epoch 7, Loss: 1.0601690837224744e-06
Epoch 8, Loss: 6.909218070704574e-07
Epoch 9, Loss: 4.639464407539258e-07
Epoch 10, Loss: 3.1851412340298533e-07


In [None]:
# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X).squeeze()
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
    print(f"Test Loss: {total_loss / len(test_loader)}")

evaluate_model(model, test_loader)

Test Loss: 79.31648212779652


In [None]:
model.eval()  # Set model to evaluation mode

top_5_preds = []  # Store top 5 product predictions per user
all_user_ids = []  # Store corresponding user IDs

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        preds = model(batch_X)  # Get model predictions (shape: [batch_size, num_products])
        
        # Get top 5 product indices for each user in batch
        top_5_values, top_5_indices = torch.topk(preds, k=1, dim=1)  

        # Convert to list for easy storage
        top_5_preds.extend(top_5_indices.cpu().numpy().tolist())
        all_user_ids.extend(batch_y.cpu().numpy().tolist())  # Store user IDs

# Convert lists to DataFrame
recommendations_df = pd.DataFrame({
    "user_id": all_user_ids,  
    "top_5_product_ids": top_5_preds  # Each entry is a list of 5 product IDs
})

# Print first 10 users and their recommendations
recommendations_df

Unnamed: 0,user_id,top_5_product_ids
0,0.0,[0]
1,0.0,[0]
2,0.0,[0]
3,0.0,[0]
4,0.0,[0]
...,...,...
1741,0.0,[0]
1742,0.0,[0]
1743,0.0,[0]
1744,0.0,[0]
