In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# -----------------------
# Hyperparameters
# -----------------------
EMBED_DIM = 64  # width of the user/book embedding vectors
BATCH_SIZE = 64
LR = 3.591388314145999e-05  # learning rate handed to the optimizer
EPOCHS = 100  # upper bound on epochs; early stopping may end training sooner
REG_LAMBDA = 1e-1  # weight of the embedding-norm penalty in the training loss
PATIENCE = 14  # epochs without validation improvement before stopping

# Pick the best available backend: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

# set seeds for reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
# Deterministic cuDNN kernels only help if the autotuner cannot pick a
# different algorithm from run to run, so pin benchmark mode off as well.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


Using device: cuda


In [2]:
# -----------------------
# Data Loading
# -----------------------
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

unique_users = train_df.user_id.unique()
unique_books = train_df.book_id.unique()

# Map raw ids to contiguous 0-based indices (order of first appearance in
# train_df) so they can be used directly as embedding row numbers.
user2idx = {u: i for i, u in enumerate(unique_users)}
book2idx = {b: i for i, b in enumerate(unique_books)}

n_users = len(user2idx)
n_books = len(book2idx)

# Attach the index columns once; every id in train_df is a key of the
# corresponding dict because the dicts were built from these same columns.
train_df["user_idx"] = train_df["user_id"].map(user2idx)
train_df["book_idx"] = train_df["book_id"].map(book2idx)

In [3]:
# Split data
# Users with at most `rare_threshold` ratings are treated as "rare"; the
# rare and non-rare rows are split into train/validation independently.
user_counts = train_df["user_id"].value_counts()
rare_threshold = 4

is_rare_user = user_counts <= rare_threshold
rare_users = user_counts.index[is_rare_user]
non_rare_users = user_counts.index[~is_rare_user]

rare_df = train_df.loc[train_df["user_id"].isin(rare_users)]
non_rare_df = train_df.loc[train_df["user_id"].isin(non_rare_users)]

# Splitting the two groups separately is not required for a plain random
# split, but it is the shape needed if the split is later stratified.
train_non_rare, val_non_rare = train_test_split(
    non_rare_df, test_size=0.05, random_state=42
)
rare_train, rare_val = train_test_split(
    rare_df, test_size=0.05, random_state=42
)

# Recombine each side, shuffle the rows, and renumber them from 0.
train_parts = [train_non_rare, rare_train]
val_parts = [val_non_rare, rare_val]
train_data = (
    pd.concat(train_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
val_data = (
    pd.concat(val_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
class RatingsDataset(Dataset):
    """Map-style dataset over the rows of a ratings DataFrame.

    Reads the ``user_idx``, ``book_idx`` and ``rating`` columns of ``df``.
    The columns are converted to tensors once, at construction time, so
    ``__getitem__`` is a plain index operation instead of building three new
    tensors from numpy scalars for every sample.
    """

    def __init__(self, df):
        """Copy the three columns of ``df`` into tensors (int64, int64, float32)."""
        self.users = torch.tensor(df["user_idx"].values, dtype=torch.long)
        self.books = torch.tensor(df["book_idx"].values, dtype=torch.long)
        self.ratings = torch.tensor(
            df["rating"].values.astype(np.float32), dtype=torch.float32
        )

    def __len__(self):
        """Number of rating rows."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(user_idx, book_idx, rating)`` for row ``idx``.

        The returned tensors index into the dataset's own storage; callers
        should not modify them in place.
        """
        return self.users[idx], self.books[idx], self.ratings[idx]

In [5]:
# -----------------------
# Define Fixed Offset based on User Counts
# -----------------------
# Number of train_data rows per user index; indices that do not occur in
# train_data (e.g. users that ended up only in the validation split) get 0.
user_counts = (
    train_data.groupby("user_idx").size().reindex(range(n_users), fill_value=0).values
)
max_count = user_counts.max()
# normalize count: f_u in [0,1]
# Guard the denominator so an all-zero count vector cannot divide by zero
# (same guard as the fallback branch of the lookup-based offsets).
log_max_count = np.log1p(max_count) if max_count > 0 else 1.0
normalized_counts = np.log1p(user_counts) / log_max_count

# Define fixed offset: from ~3.66 at f_u=0 to ~2.10 at f_u=1
offset_high = 3.66
offset_low = 2.10
# offset(u) = offset_high - (offset_high - offset_low)*f_u
# = offset_high + (offset_low - offset_high)*f_u
offsets = offset_high + (offset_low - offset_high) * normalized_counts
offsets_tensor = torch.tensor(offsets, dtype=torch.float32, device=DEVICE)
# NOTE: `offsets` / `offsets_tensor` are recomputed further down from the mean
# rating of users with the same number of reviews; `user_counts` and
# `max_count` defined here are reused by that computation.

In [6]:
# For every rating row: how many rows its user has in train_data.
user_id_counts = (
    train_data
    .groupby("user_id")["user_id"]
    .transform("size")
)
# Mean rating over all rows whose user has exactly that many rows.
count2mean = train_data["rating"].groupby(user_id_counts).mean()
# Same mapping as a plain dict: {rating count -> mean rating}.
count2mean_dict = count2mean.to_dict()

In [7]:
# Define old offset high/low
offset_high = 3.79
offset_low = 2.10

# Compute the mean rating by exact user count
user_id_counts = train_data.groupby("user_id")["user_id"].transform("size")
count2mean = train_data.groupby(user_id_counts)["rating"].mean()
count2mean_dict = count2mean.to_dict()

# Compute offsets now with lookup and fallback (vectorised over all users).
# Lookup: the mean rating of users with exactly the same number of ratings.
# This is the raw mean itself; no global mean is subtracted.
lookup_offsets = pd.Series(user_counts).map(count2mean_dict)

# Fallback for counts with no entry in the lookup (e.g. users with zero rows
# in train_data): old linear approximation based on normalized log-counts.
log_max_count = np.log1p(max_count) if max_count > 0 else 1.0
fallback_offsets = offset_high + (offset_low - offset_high) * (
    np.log1p(user_counts) / log_max_count
)

offsets = np.where(
    lookup_offsets.isna(), fallback_offsets, lookup_offsets
).astype(np.float32)

offsets_tensor = torch.tensor(offsets, dtype=torch.float32, device=DEVICE)

In [8]:
class MatrixFactorization(nn.Module):
    """Biased matrix factorization with a residual linear layer per side.

    The prediction for a (user, item) pair is the dot product of the two
    transformed embeddings plus a per-user bias, a per-item bias and a single
    learned global bias.
    """

    def __init__(self, num_users, num_items, emb_dim=64):
        """Create the embedding tables, bias tables and the two linear layers.

        Args:
            num_users: number of rows in the user embedding/bias tables.
            num_items: number of rows in the item embedding/bias tables.
            emb_dim: embedding width (also the in/out width of both linears).
        """
        super().__init__()

        # Layers are created in a fixed order so that, for a given seed, the
        # random initial weights are drawn in the same sequence.
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Bias terms: one scalar per user, one per item, one global.
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        self.mlp1 = nn.Linear(emb_dim, emb_dim)
        self.mlp2 = nn.Linear(emb_dim, emb_dim)

        # Xavier-uniform for the latent factors, zeros for the bias tables.
        for factors in (self.user_emb, self.item_emb):
            nn.init.xavier_uniform_(factors.weight)
        for biases in (self.user_bias, self.item_bias):
            nn.init.zeros_(biases.weight)

    def forward(self, user_ids, item_ids):
        """Return one predicted rating per (user_ids[k], item_ids[k]) pair."""
        user_vec = self.user_emb(user_ids)
        item_vec = self.item_emb(item_ids)

        # Residual (skip) connection around each linear layer.
        user_vec = self.mlp1(user_vec) + user_vec
        item_vec = self.mlp2(item_vec) + item_vec

        user_offset = torch.squeeze(self.user_bias(user_ids))
        item_offset = torch.squeeze(self.item_bias(item_ids))

        interaction = torch.sum(user_vec * item_vec, dim=1)
        return interaction + user_offset + item_offset + self.global_bias
# loss function with regularization
def loss_fn(pred, target, model, reg_lambda):
    """Mean squared error plus a penalty on the embedding norms.

    Args:
        pred: predicted ratings.
        target: true ratings, same shape as ``pred``.
        model: object exposing ``user_emb.weight`` and ``item_emb.weight``.
        reg_lambda: weight of the penalty; 0 disables it.

    Returns:
        Scalar tensor ``MSE(pred, target) + reg_lambda * (||U||_2 + ||V||_2)``
        where the norms are taken over the whole user and item embedding
        matrices (not only the rows used in this batch).
    """
    base_loss = nn.MSELoss()(pred, target)
    if reg_lambda == 0:
        # No regularization requested (e.g. validation): skip the two
        # full-matrix norms and avoid 0 * inf/nan contaminating the loss.
        return base_loss
    user_reg = model.user_emb.weight.norm(2)
    item_reg = model.item_emb.weight.norm(2)
    return base_loss + reg_lambda * (user_reg + item_reg)

In [9]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, scheduler, epochs, patience, save_path, reg_lambda=None):
    """Train ``model`` with early stopping and return it with the best weights.

    Every epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader`` (validation loss is computed with a
    regularization weight of 0). The mean validation loss drives both
    ``scheduler.step`` and early stopping; whenever it improves, the model's
    ``state_dict`` is written to ``save_path``.

    Args:
        model: module called as ``model(users, items)``.
        train_loader: sized iterable of ``(users, items, ratings)`` batches.
        val_loader: sized iterable of ``(users, items, ratings)`` batches.
        loss_fn: callable ``loss_fn(preds, ratings, model, reg_lambda)``
            returning a scalar tensor.
        optimizer: optimizer over the model's parameters.
        scheduler: scheduler whose ``step`` takes the mean validation loss.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a new best
            validation loss.
        save_path: file path for the best checkpoint.
        reg_lambda: regularization weight passed to ``loss_fn`` during
            training. ``None`` (default) uses the module-level ``REG_LAMBDA``.

    Returns:
        The same ``model`` object. If at least one checkpoint was written
        during this call, its weights are loaded back before returning;
        otherwise the model is returned as-is.
    """
    if reg_lambda is None:
        reg_lambda = REG_LAMBDA

    # Move batches to wherever the model lives instead of relying on a global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    best_val_loss = float("inf")
    epochs_no_improve = 0
    checkpoint_saved = False

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        train_squared_error = 0
        num_train_samples = 0

        for users, items, ratings in train_loader:
            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            preds = model(users, items)
            loss = loss_fn(preds, ratings, model, reg_lambda)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            # RMSE bookkeeping does not need the autograd graph.
            train_squared_error += ((preds.detach() - ratings) ** 2).sum().item()
            num_train_samples += ratings.size(0)

        # Mean of per-batch losses (includes the regularization term).
        avg_train_loss = total_train_loss / len(train_loader)
        train_rmse = np.sqrt(train_squared_error / num_train_samples)

        # Validation
        model.eval()
        total_val_loss = 0
        val_squared_error = 0
        num_val_samples = 0

        with torch.no_grad():
            for users, items, ratings in val_loader:
                users = users.to(device)
                items = items.to(device)
                ratings = ratings.to(device)

                preds = model(users, items)
                val_loss = loss_fn(preds, ratings, model, 0)  # no regularization in val
                total_val_loss += val_loss.item()
                val_squared_error += ((preds - ratings) ** 2).sum().item()
                num_val_samples += ratings.size(0)

        avg_val_loss = total_val_loss / len(val_loader)
        val_rmse = np.sqrt(val_squared_error / num_val_samples)

        scheduler.step(avg_val_loss)

        print(
            f"Epoch {epoch+1}/{epochs} - "
            f"Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f} - "
            f"Train RMSE: {train_rmse:.4f} - Val RMSE: {val_rmse:.4f}"
        )

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), save_path)
            checkpoint_saved = True
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping triggered.")
                break

    # Only restore a checkpoint written by this call; otherwise a missing file
    # would raise and a stale file from an earlier run would be loaded silently.
    if checkpoint_saved:
        # weights_only=True restricts unpickling to tensors/containers, which
        # is all a state_dict holds; map_location keeps it on the model's device.
        best_state = torch.load(save_path, map_location=device, weights_only=True)
        model.load_state_dict(best_state)
    return model


In [10]:
# create dataset and dataloader
train_dataset = RatingsDataset(train_data)
val_dataset = RatingsDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# define model initialize the offsets
model = MatrixFactorization(n_users, n_books, EMBED_DIM).to(DEVICE)
with torch.no_grad():
    # Start every user's bias at its precomputed offset. copy_ writes into the
    # existing parameter in place and raises if the shapes do not match.
    model.user_bias.weight.copy_(offsets_tensor.unsqueeze(1))

optimizer = optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate after 2 epochs without validation improvement.
# (`verbose` is deprecated in recent PyTorch releases and is not passed.)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=2
)

# train the model
model = train_model(
    model,
    train_loader,
    val_loader,
    loss_fn,
    optimizer,
    scheduler,
    EPOCHS,
    PATIENCE,
    "best_model.pt",
)




Epoch 1/100 - Train Loss: 1.1538 - Val Loss: 0.8312 - Train RMSE: 0.8875 - Val RMSE: 0.9125
Epoch 2/100 - Train Loss: 0.7873 - Val Loss: 0.8290 - Train RMSE: 0.8855 - Val RMSE: 0.9114
Epoch 3/100 - Train Loss: 0.7837 - Val Loss: 0.8264 - Train RMSE: 0.8835 - Val RMSE: 0.9099
Epoch 4/100 - Train Loss: 0.7803 - Val Loss: 0.8240 - Train RMSE: 0.8816 - Val RMSE: 0.9086
Epoch 5/100 - Train Loss: 0.7771 - Val Loss: 0.8216 - Train RMSE: 0.8796 - Val RMSE: 0.9073
Epoch 6/100 - Train Loss: 0.7746 - Val Loss: 0.8195 - Train RMSE: 0.8777 - Val RMSE: 0.9061
Epoch 7/100 - Train Loss: 0.7735 - Val Loss: 0.8168 - Train RMSE: 0.8757 - Val RMSE: 0.9046
Epoch 8/100 - Train Loss: 0.7756 - Val Loss: 0.8124 - Train RMSE: 0.8734 - Val RMSE: 0.9022
Epoch 9/100 - Train Loss: 0.7808 - Val Loss: 0.8014 - Train RMSE: 0.8693 - Val RMSE: 0.8960
Epoch 10/100 - Train Loss: 0.7841 - Val Loss: 0.7750 - Train RMSE: 0.8595 - Val RMSE: 0.8812
Epoch 11/100 - Train Loss: 0.7790 - Val Loss: 0.7361 - Train RMSE: 0.8401 - Val

  model.load_state_dict(torch.load(save_path))


### Submission

In [11]:
# make predictions on the test set
user_idx = test_df["user_id"].map(user2idx)
book_idx = test_df["book_id"].map(book2idx)

# Rows whose user or book never appeared in train_df have no learned
# embedding or bias; remember them so their predictions can be replaced.
cold_start = (user_idx.isna() | book_idx.isna()).to_numpy()

# Placeholder indices only keep the batched forward pass valid. The model
# output for cold-start rows is discarded below instead of being submitted
# with another user's/book's parameters.
test_df["user_idx"] = user_idx.fillna(n_users - 1).astype(int)
test_df["book_idx"] = book_idx.fillna(n_books - 1).astype(int)

test_users = torch.tensor(test_df["user_idx"].values, dtype=torch.long).to(DEVICE)
test_items = torch.tensor(test_df["book_idx"].values, dtype=torch.long).to(DEVICE)

model.eval()
with torch.no_grad():
    test_preds = model(test_users, test_items).cpu().numpy()

if cold_start.any():
    # Fallback order for a cold-start row: the user's mean training rating if
    # the user is known, else the book's mean training rating if the book is
    # known, else the overall mean training rating.
    user_mean = test_df["user_id"].map(train_df.groupby("user_id")["rating"].mean())
    book_mean = test_df["book_id"].map(train_df.groupby("book_id")["rating"].mean())
    fallback = user_mean.fillna(book_mean).fillna(train_df["rating"].mean())
    test_preds = np.where(
        cold_start, fallback.to_numpy(dtype=test_preds.dtype), test_preds
    )

# clip predictions to range (1 to 5)
test_preds = np.clip(test_preds, 1.0, 5.0)

submission = pd.DataFrame({"id": test_df["id"], "rating": test_preds})
submission.to_csv("submission.csv", index=False)
print("Created submission.csv")

Created submission.csv
