# Step 1. Setup and Path Definitions
- This section imports the necessary libraries, mounts Google Drive, and defines absolute paths to the project's data and model directories. Every path is derived from a single `BASE_DIR`, so only that one line needs to change when the project is moved to a different location or system.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle
import csv
from google.colab import drive

# --- Mount Google Drive ---
# The project files live in Google Drive; mounting exposes them under /content/drive.
drive.mount('/content/drive')

# --- Define Absolute Paths ---
# BASE_DIR is a hard-coded absolute location inside the mounted Drive.
# Every other path below is derived from it, so this is the only line to edit
# if the project folder is moved or renamed.
BASE_DIR = '/content/drive/MyDrive/Amazon_Recommender' # Project root inside the mounted Drive

# Key directories, all relative to BASE_DIR.
DATA_DIR = os.path.join(BASE_DIR, 'data')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, 'processed')
RAW_DATA_DIR = os.path.join(DATA_DIR, 'raw')

# Specific files: the raw input, the reduced copy written for quick tests,
# and the three artifacts produced after training (weights, ID mappings, metrics).
RAW_DATA_FILE = os.path.join(RAW_DATA_DIR, 'Electronics_5.csv')
SAMPLED_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, '11_sampled_10_percent_raw_data.csv')
MODEL_SAVE_PATH = os.path.join(MODELS_DIR, '11_ncf_model.pt')
MAPPINGS_SAVE_PATH = os.path.join(MODELS_DIR, '11_ncf_mappings.pkl')
EVALUATION_SAVE_PATH = os.path.join(PROCESSED_DATA_DIR, '11_ncf_evaluation_metrics.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 2. Data Loading and Preparation
- We load the raw dataset, convert user and item IDs into unique integer indices (a required format for embedding layers), and split the data into training and testing sets. The first 10% of the rows (not a random sample) is also saved as a smaller file for quicker testing if needed.

In [None]:
# --- Load and Prepare Data ---
# Only the read itself sits inside the try: a failure while writing the sample
# or splitting the data must not be reported as "raw data file not found".
try:
    # Load the full dataset of electronics reviews.
    # The two ID columns are pinned to str so that ASINs consisting only of
    # digits (e.g. '0972683275') can never be type-inferred as integers and
    # lose their leading zeros.
    ratings_df = pd.read_csv(RAW_DATA_FILE, dtype={'reviewerID': str, 'asin': str})
except FileNotFoundError:
    print(f"Error: Raw data file not found at {RAW_DATA_FILE}")
    # Downstream cells must treat None as "no data loaded".
    ratings_df = None
else:
    print(f"Full dataset loaded. Shape: {ratings_df.shape}")

    # For faster experimentation, save the first 10% of the rows.
    # NOTE: head() takes the leading rows in file order; it is not a random sample.
    # This step can be commented out if not needed.
    os.makedirs(os.path.dirname(SAMPLED_DATA_FILE), exist_ok=True)
    sampled_df = ratings_df.head(len(ratings_df) // 10)
    sampled_df.to_csv(SAMPLED_DATA_FILE, index=False)
    print(f"Saved a 10% sample for quick testing to: {SAMPLED_DATA_FILE}")

    # --- Feature Engineering: Create User and Item Indices ---
    # Embedding layers are indexed by integers, so each string ID is replaced
    # by its categorical code (its position in the column's category list).
    ratings_df['user_idx'] = ratings_df['reviewerID'].astype("category").cat.codes
    ratings_df['item_idx'] = ratings_df['asin'].astype("category").cat.codes
    print("Created user and item integer indices.")

    # --- Split Data into Training and Testing Sets ---
    # Fixed random_state keeps the 80/20 split reproducible between runs.
    train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)
    print(f"Data split into training set ({len(train_df)} rows) and test set ({len(test_df)} rows).")

  ratings_df = pd.read_csv(RAW_DATA_FILE)


Full dataset loaded. Shape: (6739590, 12)
Saved a 10% sample for quick testing to: /content/drive/MyDrive/Amazon_Recommender/data/processed/11_sampled_10_percent_raw_data.csv
Created user and item integer indices.
Data split into training set (5391672 rows) and test set (1347918 rows).


# Step 3. PyTorch Dataset and DataLoader
- A custom RatingsDataset class is defined to handle the data format required by PyTorch. DataLoader is then used to efficiently batch and shuffle the data during training.

In [None]:
# --- Create a PyTorch Dataset ---
class RatingsDataset(Dataset):
    """Tensor-backed view of a ratings table for use with a DataLoader.

    The frame passed in must provide the columns 'user_idx', 'item_idx'
    and 'overall'. Each sample is a (user index, item index, rating) triple.
    """

    def __init__(self, df):
        def column_tensor(column, dtype):
            # Copy one DataFrame column into a tensor of the requested dtype.
            return torch.tensor(df[column].values, dtype=dtype)

        # Indices feed embedding lookups (long); ratings are regression targets (float).
        self.users = column_tensor('user_idx', torch.long)
        self.items = column_tensor('item_idx', torch.long)
        self.labels = column_tensor('overall', torch.float)

    def __len__(self):
        # One sample per rating.
        return self.labels.shape[0]

    def __getitem__(self, idx):
        user, item, rating = self.users[idx], self.items[idx], self.labels[idx]
        return user, item, rating

# Build the loaders only when the data-preparation cell produced a split.
if 'train_df' in locals():
    # --- Wrap both splits as PyTorch Datasets ---
    train_dataset, test_dataset = RatingsDataset(train_df), RatingsDataset(test_df)

    # --- Batch iterators ---
    # Training batches are reshuffled every epoch; the test order stays fixed.
    train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=256)
    test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=256)
    print("\nPyTorch Datasets and DataLoaders created.")


PyTorch Datasets and DataLoaders created.


# Step 4. Neural Collaborative Filtering (NCF) Model Definition
- This class defines the architecture of the NCF model, which includes embedding layers for users and items, followed by several fully connected (linear) layers to predict ratings.

In [None]:
# --- Define the NCF Model ---
class NCF(nn.Module):
    """Neural Collaborative Filtering (NCF) model.

    A user embedding and an item embedding are concatenated and passed
    through a small MLP that regresses a single rating.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """
    def __init__(self, num_users, num_items, embedding_dim=64):
        super(NCF, self).__init__()
        # Embedding layer for users: maps each user_idx to a dense vector.
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        # Embedding layer for items: maps each item_idx to a dense vector.
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # A sequence of fully connected layers to learn the interaction between user and item embeddings.
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Final output layer for the predicted rating.
        )

    def forward(self, user_idx, item_idx):
        """Predict ratings for paired user/item index tensors.

        Returns a tensor with the same leading shape as the inputs, i.e.
        shape (B,) for a batch of B pairs, including B == 1.
        """
        # Get the embedding vectors for the user and item.
        user_embedding_vector = self.user_embedding(user_idx)
        item_embedding_vector = self.item_embedding(item_idx)
        # Concatenate the two vectors to form a single feature vector.
        concatenated_vector = torch.cat([user_embedding_vector, item_embedding_vector], dim=-1)
        # Drop only the trailing size-1 output dimension. A bare squeeze() would
        # also drop the batch dimension when the batch holds a single sample,
        # yielding a 0-d tensor that cannot be iterated or matched against
        # a (1,)-shaped label tensor.
        return self.fc_layers(concatenated_vector).squeeze(-1)

# Step 5. Model Training
- The NCF model is instantiated, and the training loop is executed for a defined number of epochs. The Mean Squared Error (MSE) loss is minimized using the Adam optimizer.

In [None]:
# Train only when data was actually loaded and the loaders exist.
# The data cell assigns ratings_df = None when the raw file is missing, so a
# bare "'ratings_df' in locals()" check would let a failed load through and
# crash on ratings_df['user_idx'] below.
if locals().get('ratings_df') is not None and 'train_loader' in locals():
    # --- Initialize Model, Loss, and Optimizer ---
    # Set the device to GPU ('cuda') if available, otherwise use CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nUsing device: {device}")

    # Instantiate the NCF model.
    # The embedding tables are sized by the number of distinct index values
    # in the full dataset, so train and test indices both fit.
    ncf_model = NCF(
        num_users=ratings_df['user_idx'].nunique(),
        num_items=ratings_df['item_idx'].nunique(),
        embedding_dim=64
    ).to(device)

    # Define the loss function (Mean Squared Error for regression).
    loss_function = nn.MSELoss()
    # Define the optimizer (Adam is a popular choice for deep learning).
    optimizer = torch.optim.Adam(ncf_model.parameters(), lr=0.001)

    # --- Training Loop ---
    EPOCHS = 5
    print(f"Starting training for {EPOCHS} epochs...")

    for epoch in range(EPOCHS):
        ncf_model.train()  # Set the model to training mode.
        total_loss = 0
        for user_indices, item_indices, labels in train_loader:
            # Move data batch to the selected device (GPU or CPU).
            user_indices, item_indices, labels = user_indices.to(device), item_indices.to(device), labels.to(device)

            # --- Forward Pass ---
            optimizer.zero_grad()  # Reset gradients.
            outputs = ncf_model(user_indices, item_indices)
            loss = loss_function(outputs, labels)

            # --- Backward Pass and Optimization ---
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{EPOCHS}, Average Training Loss: {avg_loss:.4f}")


Using device: cuda
Starting training for 5 epochs...
Epoch 1/5, Average Training Loss: 1.4183
Epoch 2/5, Average Training Loss: 1.2193
Epoch 3/5, Average Training Loss: 1.0812
Epoch 4/5, Average Training Loss: 0.9884
Epoch 5/5, Average Training Loss: 0.9162


# Step 6. Model Evaluation
- After training, the model's performance is evaluated on the unseen test data using the Root Mean Squared Error (RMSE) metric.

In [None]:
if 'ncf_model' in locals():
    # --- Evaluate the Model on the Test Set ---
    ncf_model.eval()  # Set the model to evaluation mode.
    label_batches = []
    prediction_batches = []

    with torch.no_grad():  # Disable gradient calculation for efficiency.
        for user_indices, item_indices, labels in test_loader:
            user_indices, item_indices = user_indices.to(device), item_indices.to(device)

            # Get model predictions and move them to the CPU for NumPy conversion.
            outputs = ncf_model(user_indices, item_indices).cpu().numpy()
            # atleast_1d guards the case where the final batch holds a single
            # row and the model returns a 0-d value, which cannot be iterated.
            prediction_batches.append(np.atleast_1d(outputs))
            label_batches.append(labels.numpy())

    # One concatenation per side instead of growing a Python list element by element.
    true_labels = np.concatenate(label_batches)
    predictions = np.concatenate(prediction_batches)

    # Calculate Root Mean Squared Error (RMSE).
    rmse = np.sqrt(mean_squared_error(true_labels, predictions))
    print(f"\nTest RMSE: {rmse:.4f}")


Test RMSE: 1.1303


# Step 7. Saving Artifacts
- The trained model state, evaluation results, and the crucial `reviewerID`/`asin` to integer index mappings are saved for future use in prediction tasks.

In [None]:
if 'ncf_model' in locals():
    # Make sure every output directory exists up front, so that a missing
    # folder does not abort the cell after the model has already been trained.
    for artifact_path in (MODEL_SAVE_PATH, EVALUATION_SAVE_PATH, MAPPINGS_SAVE_PATH):
        os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

    # --- Save the Trained Model State Dictionary ---
    torch.save(ncf_model.state_dict(), MODEL_SAVE_PATH)
    print(f"Model state dictionary saved to: {MODEL_SAVE_PATH}")

    # --- Save the Evaluation Metric ---
    # newline='' lets the csv module control line endings itself.
    with open(EVALUATION_SAVE_PATH, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Metric', 'Value'])
        writer.writerow(['RMSE', rmse])
    print(f"Evaluation metrics saved to: {EVALUATION_SAVE_PATH}")

    # --- Create and Save User/Item ID Mappings ---
    # These mappings are essential to convert raw IDs to model indices during inference.
    # They are rebuilt from the same categorical encoding that produced
    # user_idx/item_idx, so position i in the category list is exactly code i.
    user_id_to_idx = {uid: idx for idx, uid in enumerate(ratings_df['reviewerID'].astype("category").cat.categories)}
    item_id_to_idx = {asin: idx for idx, asin in enumerate(ratings_df['asin'].astype("category").cat.categories)}

    # Stored as a (user mapping, item mapping) tuple.
    with open(MAPPINGS_SAVE_PATH, 'wb') as f:
        pickle.dump((user_id_to_idx, item_id_to_idx), f)
    print(f"User and item ID mappings saved to: {MAPPINGS_SAVE_PATH}")

Model state dictionary saved to: /content/drive/MyDrive/Amazon_Recommender/models/11_ncf_model.pt
Evaluation metrics saved to: /content/drive/MyDrive/Amazon_Recommender/data/processed/11_ncf_evaluation_metrics.csv
User and item ID mappings saved to: /content/drive/MyDrive/Amazon_Recommender/models/11_ncf_mappings.pkl


# Previous Version w/o Annotation

 ### Step 1: Load and Prepare the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the raw Electronics_5 CSV from the mounted Google Drive into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/raw/Electronics_5.csv')

  df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/raw/Electronics_5.csv')


In [None]:
# Write the first 10% of the rows (file order, not a random sample) to a smaller CSV.
df.head(len(df) // 10).to_csv('/content/drive/MyDrive/Amazon_Recommender/data/processed/10_%_raw_data.csv', index=False)

In [None]:
# Why? Neural networks need numbers, not strings.
# Each ID is replaced by its categorical code, i.e. its position in the
# column's category list; these codes index the embedding tables later on.
df['user_idx'] = df['reviewerID'].astype("category").cat.codes
df['item_idx'] = df['asin'].astype("category").cat.codes

# 80/20 train/test split; the fixed random_state keeps it reproducible.
train, test = train_test_split(df, test_size = 0.2, random_state = 42)

In [None]:
# Notebook display only: inspect the encoded user indices as a NumPy array.
df['user_idx'].values

### Step 2: Create a PyTorch Dataset

In [None]:
class RatingsDataset(Dataset):
    """Expose (user index, item index, rating) samples from a ratings frame."""

    def __init__(self, df):
        # Pull the three columns out first, then convert each one to a tensor.
        user_codes = df['user_idx'].values
        item_codes = df['item_idx'].values
        ratings = df['overall'].values

        # Indices are long for embedding lookups; ratings are float targets.
        self.users = torch.tensor(user_codes, dtype=torch.long)
        self.items = torch.tensor(item_codes, dtype=torch.long)
        self.labels = torch.tensor(ratings, dtype=torch.float)

    def __len__(self):
        # Number of ratings held by the dataset.
        return self.labels.size(0)

    def __getitem__(self, idx):
        return (self.users[idx], self.items[idx], self.labels[idx])

# Wrap each split as a Dataset.
train_dataset, test_dataset = RatingsDataset(train), RatingsDataset(test)

# Training batches are reshuffled each epoch; test batches keep their order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=256)
test_loader = DataLoader(dataset=test_dataset, batch_size=256)

### Step 3: Define the Neural Network

In [None]:
import torch.nn as nn

class NCF(nn.Module):
    """Neural Collaborative Filtering model: two embeddings feeding an MLP.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """
    def __init__(self, num_users, num_items, embedding_dim=64):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP over the concatenated user/item embeddings.
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # output predicted rating
        )

    def forward(self, user_idx, item_idx):
        """Return one predicted rating per (user_idx, item_idx) pair."""
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        x = torch.cat([user_emb, item_emb], dim=-1)
        # squeeze(-1) removes only the trailing size-1 output dimension; a bare
        # squeeze() would also drop the batch dimension for a batch of one and
        # return a 0-d tensor.
        return self.fc_layers(x).squeeze(-1)


### Step 4: Train the Model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Embedding tables are sized by the number of distinct encoded users and items.
model = NCF(
    num_users=df['user_idx'].nunique(),
    num_items=df['item_idx'].nunique(),
).to(device)

# Rating prediction is treated as regression: MSE loss, Adam optimizer.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for user, item, label in train_loader:
        # Move the whole batch to the model's device.
        user, item, label = (t.to(device) for t in (user, item, label))

        optimizer.zero_grad()
        output = model(user, item)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report the mean of the per-batch losses for the epoch.
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/5, Loss: 1.4126
Epoch 2/5, Loss: 1.2185
Epoch 3/5, Loss: 1.0703
Epoch 4/5, Loss: 0.9714
Epoch 5/5, Loss: 0.9002


### Step 5: Evaluate NCF

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Evaluate on the held-out split with gradients disabled.
model.eval()
true_labels = []
predictions = []

with torch.no_grad():
    for user, item, label in test_loader:
        user, item = user.to(device), item.to(device)
        output = model(user, item).cpu().numpy()
        # atleast_1d guards a final batch of one row, where the model returns
        # a 0-d value that list.extend() could not iterate.
        predictions.append(np.atleast_1d(output))
        true_labels.append(label.numpy())

# Join the per-batch arrays once instead of growing element-wise Python lists.
true_labels = np.concatenate(true_labels)
predictions = np.concatenate(predictions)

rmse = np.sqrt(mean_squared_error(true_labels, predictions))
print(f"Test RMSE: {rmse:.4f}")


Test RMSE: 1.1306


### Step 6: Save the Model & Results

In [None]:
# Persist only the learned parameters (state_dict), not the module object.
torch.save(model.state_dict(), '/content/drive/MyDrive/Amazon_Recommender/models/ncf_model.pt')

import csv
# newline='' is the csv module's documented way to open a file for csv.writer;
# without it, platforms that translate newlines emit blank rows.
with open('/content/drive/MyDrive/Amazon_Recommender/data/processed/ncf_eval.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['RMSE'])
    writer.writerow([rmse])


In [None]:
import pandas as pd
import pickle

# Load raw data again
df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/raw/Electronics_5.csv')

# Recreate user/item mappings with the SAME encoding that was used for training.
# The model was trained on astype("category").cat.codes, where an ID's code is
# its position in the column's category list. Enumerating df[...].unique()
# instead would number IDs in order of first appearance, so lookups would
# point at the wrong embedding rows.
unique_users = df['reviewerID'].astype("category").cat.categories
unique_items = df['asin'].astype("category").cat.categories

user2idx = {uid: idx for idx, uid in enumerate(unique_users)}
item2idx = {asin: idx for idx, asin in enumerate(unique_items)}

# Save the mappings for Day 12 as a (user2idx, item2idx) tuple.
with open('/content/drive/MyDrive/Amazon_Recommender/models/user_item_mappings.pkl', 'wb') as f:
    pickle.dump((user2idx, item2idx), f)

  df = pd.read_csv('/content/drive/MyDrive/Amazon_Recommender/data/raw/Electronics_5.csv')


### Step 7: Wrap Inference in a Function

In [None]:
class RatingsDataset(Dataset):
    """Label-free dataset of (user index, item index) pairs for inference."""

    def __init__(self, df):
        def as_long(column):
            # Embedding lookups need integer (long) indices.
            return torch.tensor(df[column].values, dtype=torch.long)

        self.users = as_long('user_idx')
        self.items = as_long('item_idx')

    def __len__(self):
        # One sample per user/item pair.
        return self.items.shape[0]

    def __getitem__(self, idx):
        user, item = self.users[idx], self.items[idx]
        return user, item

import torch.nn as nn

class NCF(nn.Module):
    """Neural Collaborative Filtering model used for inference.

    The attribute names (user_embedding, item_embedding, fc_layers) define the
    state_dict keys, so they must match the checkpoint being loaded.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """
    def __init__(self, num_users, num_items, embedding_dim=64):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP over the concatenated user/item embeddings.
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # output predicted rating
        )

    def forward(self, user_idx, item_idx):
        """Return one predicted rating per (user_idx, item_idx) pair."""
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        x = torch.cat([user_emb, item_emb], dim=-1)
        # Remove only the trailing size-1 output dimension, so that scoring a
        # single pair still yields a 1-element vector rather than a 0-d tensor.
        return self.fc_layers(x).squeeze(-1)

def cal_ncf(model_path, mapping_path, df, embedding_dim=64, batch_size=256):
    """Score user/item pairs with a saved NCF model.

    Args:
        model_path: Path to a state_dict written with torch.save.
        mapping_path: Path to a pickle holding a (user2idx, item2idx) tuple.
        df: DataFrame with 'user_id' and 'item_id' columns.
        embedding_dim: Embedding width; must match the saved checkpoint.
        batch_size: Number of rows scored per forward pass.

    Returns:
        A copy of the rows of ``df`` whose user and item both appear in the
        mappings, with the predictions added as an 'NCF' column. Rows with
        unknown users or items are dropped.
    """

    def load_model_and_mappings(model_path, mapping_path, embedding_dim=64):
        """Rebuild the model from the checkpoint and load the ID mappings."""
        # NOTE: pickle.load can execute arbitrary code; only load mapping
        # files that this project wrote itself.
        with open(mapping_path, 'rb') as f:
            user2idx, item2idx = pickle.load(f)

        model = NCF(num_users=len(user2idx), num_items=len(item2idx), embedding_dim=embedding_dim)
        # map_location='cpu' lets a checkpoint saved on a GPU runtime load on a
        # CPU-only machine; inference below uses CPU tensors throughout.
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
        model.eval()

        return model, user2idx, item2idx


    def predict_rating(model, user2idx, item2idx, user_id, asin):
        """Predict a single rating; raises ValueError for unknown IDs."""
        if user_id not in user2idx:
            raise ValueError(f"User {user_id} is not in mappings")
        if asin not in item2idx:
            raise ValueError(f"Item {asin} is not in mappings")

        user_idx = torch.tensor([user2idx[user_id]], dtype=torch.long)
        item_idx = torch.tensor([item2idx[asin]], dtype=torch.long)

        with torch.no_grad():
            pred = model(user_idx, item_idx)

        return pred.item()


    def predict_ratings_batch(model, user2idx, item2idx, df):
        """Predict ratings for every row whose user and item are both known."""
        valid_rows = df[df['user_id'].isin(user2idx) & df['item_id'].isin(item2idx)].copy()

        users_tensor = torch.tensor([user2idx[u] for u in valid_rows['user_id']], dtype=torch.long)
        items_tensor = torch.tensor([item2idx[i] for i in valid_rows['item_id']], dtype=torch.long)

        # Honour batch_size so large frames are scored in bounded-size chunks.
        step = max(int(batch_size), 1)
        chunks = []
        with torch.no_grad():
            for start in range(0, len(valid_rows), step):
                scores = model(users_tensor[start:start + step], items_tensor[start:start + step])
                # reshape(-1) keeps a one-row chunk as a 1-element vector even
                # if the model squeezes it down to a 0-d tensor.
                chunks.append(scores.reshape(-1))

        preds = torch.cat(chunks) if chunks else torch.empty(0)
        valid_rows['NCF'] = preds.numpy()
        return valid_rows

    model, user2idx, item2idx = load_model_and_mappings(model_path, mapping_path, embedding_dim)
    return predict_ratings_batch(model, user2idx, item2idx, df)

In [None]:
import pickle
# cal_ncf expects 'user_id' / 'item_id' columns, so rename the raw ID columns.
df_test = df.head(1000).copy().rename(columns={'reviewerID': 'user_id', 'asin': 'item_id'})

# Load the mapping file that is written together with 11_ncf_model.pt
# (11_ncf_mappings.pkl). Its indices come from the same categorical encoding
# the model was trained on; a mapping built any other way would look up the
# wrong embedding rows.
cal_ncf('/content/drive/MyDrive/Amazon_Recommender/models/11_ncf_model.pt', '/content/drive/MyDrive/Amazon_Recommender/models/11_ncf_mappings.pkl', df_test[['user_id', 'item_id', 'overall']].sample(5))

Unnamed: 0,user_id,item_id,overall
561,A3TG05J8EU1EDF,0972683275,5.0
304,A3B2DFMVEIT8TS,0789743035,3.0
612,A22UL4BXTSXSCB,0972683275,4.0
188,A11W8EXJ8PGCEM,059449771X,5.0
854,A3AQM5GA7LVDXG,106171327X,2.0


Unnamed: 0,user_id,item_id,overall,NCF
561,A3TG05J8EU1EDF,0972683275,5.0,2.791043
304,A3B2DFMVEIT8TS,0789743035,3.0,4.599814
612,A22UL4BXTSXSCB,0972683275,4.0,4.562771
188,A11W8EXJ8PGCEM,059449771X,5.0,4.798499
854,A3AQM5GA7LVDXG,106171327X,2.0,4.665658
