In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read the train/test rating tables and the book table, in that order.
train_df, test_df, cleaned_books_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train.csv",
        "../data/test.csv",
        "../data/extended_books_google_embeddings.csv",
    )
)

In [None]:
# drop embedding column
# cleaned_books_df = cleaned_books_df.drop(columns=["full_text_embeddings"])

In [None]:
# Remove the free-text columns and maturityRating from the book table.
cleaned_books_df = cleaned_books_df.drop(
    labels=["description", "title", "full_text", "subtitle", "maturityRating"],
    axis="columns",
)

In [None]:
# Keep only the first four-digit run of each publishedDate value (the year).
# The result is still a string column; rows without a match become NaN.
published_raw = cleaned_books_df["publishedDate"]
cleaned_books_df["publishedDate"] = published_raw.str.extract(r"(\d{4})", expand=False)

In [None]:
# Book attributes whose missing values are replaced by the label "Unknown".
categorical_columns = [
    "authors",
    "publisher",
    "language",
    "categories",
    "publishedDate",
]

# Assign the filled columns back to the frame. The previous form,
# `cleaned_books_df[col].fillna(..., inplace=True)`, mutates a column selection:
# it emits a chained-assignment warning on pandas 2.x and is a silent no-op
# under Copy-on-Write, which would leave NaN for LabelEncoder below.
cleaned_books_df[categorical_columns] = cleaned_books_df[categorical_columns].fillna(
    "Unknown"
)

multi_valued_columns = ["categories", "authors"]

# Each distinct raw value of these columns is mapped to a single integer id
# (one id per cell, not a list of ids).
for col in multi_valued_columns:
    le = LabelEncoder()
    cleaned_books_df[col] = le.fit_transform(cleaned_books_df[col])

In [None]:
# # convert to numeric
# cleaned_books_df["publisher"] = pd.to_numeric(cleaned_books_df["publisher"], errors="coerce")
# cleaned_books_df["language"] = pd.to_numeric(cleaned_books_df["language"], errors="coerce")

# # fill missing for all columns
# cleaned_books_df = cleaned_books_df.fillna(0)

# # standardize
# scaler = StandardScaler()
# cleaned_books_df = pd.DataFrame(
#     scaler.fit_transform(cleaned_books_df), columns=cleaned_books_df.columns
# )

In [None]:
# Left-join the book table onto both rating tables by book_id, so every
# rating row is kept even when its book has no match.
train_df, test_df = (
    ratings_frame.merge(cleaned_books_df, how="left", on="book_id")
    for ratings_frame in (train_df, test_df)
)

In [None]:
# Preview the merged training frame; show a few rows rather than the whole table.
train_df.head()

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim


class BookRatingsDataset(Dataset):
    """Per-row access to a merged ratings/book frame for use with a DataLoader.

    Each item is the tuple ``(book_id, user_id, pageCount, ratingsCount,
    averageRating, publishedDate, categories, authors)``; when ``is_test`` is
    false the target ``rating`` is appended as a ninth element.

    Args:
        df: DataFrame providing the columns named above, plus ``rating`` when
            ``is_test`` is false.
        is_test: If true, ``rating`` is not read and items carry no target.

    Notes:
        ``pageCount``, ``ratingsCount``, ``averageRating`` and ``publishedDate``
        are coerced with ``pd.to_numeric``; entries that are missing or not
        numeric (for example the string ``"Unknown"``) become 0. Without this,
        a string-valued ``publishedDate`` column cannot be converted to a tensor
        and NaN features would propagate into the loss.
    """

    def __init__(self, df, is_test=False):
        self.is_test = is_test
        self.book_ids = torch.tensor(df["book_id"].values, dtype=torch.long)
        self.user_ids = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.pageCounts = self._numeric_tensor(df["pageCount"], torch.float)
        self.ratingsCount = self._numeric_tensor(df["ratingsCount"], torch.float)
        self.averageRating = self._numeric_tensor(df["averageRating"], torch.float)
        # Kept as an integer tensor, as before; 0 marks a missing/unparseable year.
        self.publishDate = self._numeric_tensor(df["publishedDate"], torch.long)

        # Stored as numpy arrays and returned element-wise.
        # NOTE(review): these are expected to hold integer ids — confirm upstream.
        self.categories = df["categories"].values
        self.authors = df["authors"].values

        if not is_test:
            self.ratings = torch.tensor(df["rating"].values, dtype=torch.float)

    @staticmethod
    def _numeric_tensor(series, dtype):
        """Convert ``series`` to a tensor of ``dtype``, mapping non-numeric/missing entries to 0."""
        numeric = pd.to_numeric(series, errors="coerce").fillna(0)
        return torch.tensor(numeric.to_numpy(dtype="float64"), dtype=dtype)

    def __len__(self):
        return len(self.book_ids)

    def __getitem__(self, idx):
        features = (
            self.book_ids[idx],
            self.user_ids[idx],
            self.pageCounts[idx],
            self.ratingsCount[idx],
            self.averageRating[idx],
            self.publishDate[idx],
            self.categories[idx],
            self.authors[idx],
        )
        if self.is_test:
            return features
        return features + (self.ratings[idx],)


class BookRatingPredictor(nn.Module):
    """Feed-forward rating regressor over id embeddings plus four scalar features.

    Book, user, genre and author ids are each looked up in their own embedding
    table; the four embeddings are concatenated with page count, ratings count,
    average rating and publish date and passed through three linear layers.

    Args:
        n_books: Number of rows in the book embedding table.
        n_users: Number of rows in the user embedding table.
        n_genres: Number of rows in the genre embedding table.
        n_authors: Number of rows in the author embedding table.
        embedding_size: Width of every embedding vector.
    """

    def __init__(self, n_books, n_users, n_genres, n_authors, embedding_size=50):
        super(BookRatingPredictor, self).__init__()
        self.book_embedding = nn.Embedding(n_books, embedding_size)
        self.user_embedding = nn.Embedding(n_users, embedding_size)
        self.genre_embedding = nn.Embedding(n_genres, embedding_size)
        self.author_embedding = nn.Embedding(n_authors, embedding_size)

        # Input width: four embeddings plus the four scalar features.
        self.fc1 = nn.Linear(embedding_size * 4 + 4, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

        self.dropout = nn.Dropout(0.3)

    def forward(
        self,
        book_ids,
        user_ids,
        pages,
        ratings_count,
        average_rating,
        publish_date,
        genres,
        authors,
    ):
        """Return one predicted rating per row, as a tensor of shape ``[batch_size]``.

        All arguments are 1-D tensors of equal length; the id arguments must be
        integer tensors usable as embedding indices.
        """
        embedded = [
            self.book_embedding(book_ids),
            self.user_embedding(user_ids),
            self.genre_embedding(genres),
            self.author_embedding(authors),
        ]

        # Stack the scalar features into [batch_size, 4]. The explicit float cast
        # matters because publish_date arrives as an integer tensor.
        scalar_features = torch.stack(
            [
                feature.float()
                for feature in (pages, ratings_count, average_rating, publish_date)
            ],
            dim=1,
        )

        x = torch.cat(embedded + [scalar_features], dim=1)

        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        # Drop only the trailing feature axis. A bare squeeze() would also remove
        # the batch axis for a single-row batch and return a 0-d tensor.
        return x.squeeze(-1)

In [None]:
# The embedding tables are indexed directly by these id columns, so each table
# needs (largest id + 1) rows. The maximum is taken over train AND test: an id
# that occurs only in test_df would otherwise be out of range at prediction time.
id_columns = ["book_id", "user_id", "categories", "authors"]
largest_ids = pd.concat([train_df[id_columns], test_df[id_columns]]).max()

# int(...) guards against float-typed columns (e.g. after a merge introduced NaN),
# which nn.Embedding would reject as a table size.
max_book_id = int(largest_ids["book_id"])
max_user_id = int(largest_ids["user_id"])
num_genres = int(largest_ids["categories"]) + 1
num_authors = int(largest_ids["authors"]) + 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = BookRatingPredictor(
    max_book_id + 1, max_user_id + 1, num_genres, num_authors, embedding_size=50
)
model.to(device)

In [None]:
# The objective is mean squared error; RMSE is only derived from it for reporting.
criterion = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


def train_model(model, train_loader, criterion, optimizer, num_epochs=5, device=None):
    """Train ``model`` on ``train_loader`` and print the training RMSE after each epoch.

    Args:
        model: Module called as ``model(*features)`` with the batch's feature tensors.
        train_loader: Iterable of batches; each batch is a sequence of tensors
            whose last element is the target ratings and whose preceding
            elements are the model inputs, in the model's argument order.
        criterion: Loss taking ``(outputs, ratings)``. The reported RMSE assumes
            it returns the mean squared error of the batch (``nn.MSELoss()``).
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device to move each batch to. ``None`` (default) uses the device
            of the model's first parameter, or CPU if it has no parameters.

    Returns:
        None. Progress is reported via ``print``.
    """
    if device is None:
        # Follow the model instead of a notebook-level global so the function
        # does not depend on which cells ran before it was defined.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()  # enable dropout etc.
    for epoch in range(num_epochs):
        squared_error_sum = 0.0
        sample_count = 0
        for batch in train_loader:
            # Last element is the target; everything before it is a model input.
            *features, ratings = (tensor.to(device) for tensor in batch)

            optimizer.zero_grad()
            outputs = model(*features)
            # A model that squeezes its output yields a 0-d tensor for a one-row
            # batch; align it with the target so the loss does not broadcast.
            outputs = outputs.reshape(ratings.shape)

            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()

            # Weight each batch mean by its size so a short final batch does not
            # count as much as a full one in the epoch metric.
            batch_rows = ratings.shape[0]
            squared_error_sum += loss.item() * batch_rows
            sample_count += batch_rows

        rmse = (squared_error_sum / sample_count) ** 0.5 if sample_count else float("nan")
        print(f"Epoch [{epoch+1}/{num_epochs}], RMSE: {rmse}")

In [None]:
# Wrap both frames as datasets; only the training loader shuffles its rows.
train_dataset = BookRatingsDataset(df=train_df)
test_dataset = BookRatingsDataset(df=test_df, is_test=True)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=128)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=128)

In [None]:
# Fit the network for 29 passes over the training loader.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=29,
)

In [None]:
def predict(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return the predictions as a flat list.

    Args:
        model: Module taking the eight feature tensors of a batch positionally.
        test_loader: Iterable of batches, each a sequence of eight tensors in the
            order ``(book_ids, user_ids, pages, ratings_count, average_rating,
            publish_date, genres, authors)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        list: One numpy scalar per input row, in loader order.
    """
    model.eval()  # disable dropout for inference
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            # Names follow the order in which the dataset emits its features.
            (
                book_ids,
                user_ids,
                pages,
                ratings_count,
                average_rating,
                publish_date,
                genres,
                authors,
            ) = (tensor.to(device) for tensor in batch)

            outputs = model(
                book_ids,
                user_ids,
                pages,
                ratings_count,
                average_rating,
                publish_date,
                genres,
                authors,
            )
            # reshape(-1) keeps a one-row batch iterable: a model that squeezes
            # its output returns a 0-d tensor there, which extend() cannot iterate.
            predictions.extend(outputs.reshape(-1).cpu().numpy())
    return predictions


# Recreate the test loader with batches of 64, keeping row order, then score it.
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=64)

predictions = predict(model=model, test_loader=test_loader, device=device)

In [None]:
# Store the predictions on test_df and write a CSV with the header: id,rating
test_df["rating"] = predictions
test_df.loc[:, ["id", "rating"]].to_csv(path_or_buf="submission.csv", index=False)

In [None]:
from surprise import (
    Dataset,
    Reader,
    SVD,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    KNNBaseline,
    NMF,
    SlopeOne,
    CoClustering,
    BaselineOnly,
    NormalPredictor,
)
from surprise.model_selection import cross_validate
import pandas as pd

# Candidate collaborative-filtering algorithms, each with its default settings.
models = [
    ("SVD", SVD()),
    ("KNNBasic", KNNBasic()),
    ("KNNWithZScore", KNNWithZScore()),
    ("KNNBaseline", KNNBaseline()),
    ("NMF", NMF()),
    ("SlopeOne", SlopeOne()),
    ("CoClustering", CoClustering()),
    ("BaselineOnly", BaselineOnly()),
    ("NormalPredictor", NormalPredictor()),
]

# Build the ratings dataset from the (user, book, rating) triples, with the
# rating scale taken from the observed minimum and maximum.
reader = Reader(rating_scale=(train_df["rating"].min(), train_df["rating"].max()))
data = Dataset.load_from_df(train_df[["user_id", "book_id", "rating"]], reader)

# 5-fold cross-validation per algorithm, keeping the mean test RMSE.
# The loop variable is named `algo`, not `model`, so it does not overwrite the
# trained PyTorch `model` defined earlier in the notebook.
results = []
for name, algo in models:
    print(f"Evaluating {name}...")
    cv_results = cross_validate(algo, data, measures=["RMSE"], cv=5, verbose=True)
    mean_rmse = cv_results["test_rmse"].mean()
    results.append((name, mean_rmse))

# Best (lowest RMSE) first.
results.sort(key=lambda x: x[1])

# Display the results
for name, rmse in results:
    print(f"{name}: RMSE = {rmse:.4f}")