In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
cleaned_books_df = pd.read_csv("../data/extended_books_google_embeddings.csv")

In [4]:
# drop embedding column
# cleaned_books_df = cleaned_books_df.drop(columns=["full_text_embeddings"])

In [5]:
cleaned_books_df["authors"][1]

'Gina Bari Kolata'

In [6]:
# fill nan with empty string
cleaned_books_df["categories"] = cleaned_books_df["authors"].fillna("")

In [7]:
cleaned_books_df["categories"].value_counts()

categories
                             3664
Stephen King                  159
Nora Roberts                  113
Danielle Steel                 94
John Grisham                   68
                             ... 
Kerrelyn Sparks                 1
Tom Holland                     1
Jenny Downham                   1
Esi Edugyan                     1
Gilles Néret,Gustav Klimt       1
Name: count, Length: 4323, dtype: int64

In [8]:
# fill with empty string
cleaned_books_df["categories"] = cleaned_books_df["categories"].fillna("")

In [9]:
cleaned_books_df["publisher"].value_counts()

publisher
Penguin                           775
Harper Collins                    539
Macmillan                         484
Bantam                            459
Simon and Schuster                441
                                 ... 
HarpPerenM                          1
Folio                               1
New York : Toronto : Doubleday      1
U of Nebraska Press                 1
Spectra Books                       1
Name: count, Length: 986, dtype: int64

In [15]:
train_df["user_id"].value_counts().keys().max()

94399

In [23]:
# fill with empty string
cleaned_books_df["publisher"] = cleaned_books_df["publisher"].fillna("")

In [25]:
# check the max length of a list in author column
max_len = max(cleaned_books_df["publisher"].apply(lambda x: len(x.split(","))))
max_len

3

In [19]:
max_len

3

In [4]:
# drop description fulltext title
cleaned_books_df = cleaned_books_df.drop(
    columns=["description", "title", "full_text", "subtitle", "maturityRating"]
)

In [5]:
# extract year from publisheDate
cleaned_books_df["publishedDate"] = cleaned_books_df["publishedDate"].str.extract(
    r"(\d{4})"
)

In [6]:
categorical_columns = [
    "authors",
    "publisher",
    "language",
    "categories",
]

# impute publishDate with most common
cleaned_books_df["publishedDate"].fillna(
    cleaned_books_df["publishedDate"].mode()[0], inplace=True
)

# convert it to int
cleaned_books_df["publishedDate"] = cleaned_books_df["publishedDate"].astype(int)


for col in categorical_columns:
    cleaned_books_df[col].fillna("Unknown", inplace=True)

multi_valued_columns = ["categories", "authors"]

for (
    col
) in (
    multi_valued_columns
):  # convert to a single number 1 number in the column not a list
    le = LabelEncoder()
    cleaned_books_df[col] = le.fit_transform(cleaned_books_df[col])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_books_df["publishedDate"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_books_df[col].fillna("Unknown", inplace=True)


In [None]:
# impute numeric columns with mean ratingsCount averageRating pageCount

cleaned_books_df["ratingsCount"].fillna(
    cleaned_books_df["ratingsCount"].mean(), inplace=True
)

cleaned_books_df["averageRating"].fillna(
    cleaned_books_df["averageRating"].mean(), inplace=True
)

cleaned_books_df["pageCount"].fillna(cleaned_books_df["pageCount"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_books_df["ratingsCount"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cleaned_books_df["averageRating"].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beh

In [10]:
# # convert to numeric
# cleaned_books_df["publisher"] = pd.to_numeric(cleaned_books_df["publisher"], errors="coerce")
# cleaned_books_df["language"] = pd.to_numeric(cleaned_books_df["language"], errors="coerce")

# # fill missing for all columns
# cleaned_books_df = cleaned_books_df.fillna(0)

# # standardize
# scaler = StandardScaler()
# cleaned_books_df = pd.DataFrame(
#     scaler.fit_transform(cleaned_books_df), columns=cleaned_books_df.columns
# )

In [11]:
# merge
train_df = train_df.merge(cleaned_books_df, on="book_id", how="left")
test_df = test_df.merge(cleaned_books_df, on="book_id", how="left")

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim


class BookRatingsDataset(Dataset):
    def __init__(self, df, is_test=False):
        self.is_test = is_test
        self.book_ids = torch.tensor(df["book_id"].values, dtype=torch.long)
        self.user_ids = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.pageCounts = torch.tensor(df["pageCount"].values, dtype=torch.float)
        self.ratingsCount = torch.tensor(df["ratingsCount"].values, dtype=torch.float)
        self.averageRating = torch.tensor(df["averageRating"].values, dtype=torch.float)
        # convert from str to int
        self.publishDate = torch.tensor(df["publishedDate"].values, dtype=torch.float)

        # Multi-valued features stored as numpy arrays
        self.categories = df["categories"].values
        self.authors = df["authors"].values

        if not is_test:
            self.ratings = torch.tensor(df["rating"].values, dtype=torch.float)

    def __len__(self):
        return len(self.book_ids)

    def __getitem__(self, idx):
        book_id = self.book_ids[idx]
        user_id = self.user_ids[idx]
        pages = self.pageCounts[idx]
        ratings = self.ratingsCount[idx]
        average_rating = self.averageRating[idx]
        publish_date = self.publishDate[idx]

        categories = self.categories[idx]
        authors = self.authors[idx]

        if self.is_test:
            return (
                book_id,
                user_id,
                pages,
                ratings,
                average_rating,
                publish_date,
                categories,
                authors,
            )
        else:
            rating = self.ratings[idx]
            return (
                book_id,
                user_id,
                pages,
                ratings,
                average_rating,
                publish_date,
                categories,
                authors,
                rating,
            )


class BookRatingPredictor(nn.Module):
    def __init__(self, n_books, n_users, n_genres, n_authors, embedding_size=50):
        super(BookRatingPredictor, self).__init__()
        self.book_embedding = nn.Embedding(n_books, embedding_size)
        self.user_embedding = nn.Embedding(n_users, embedding_size)
        self.genre_embedding = nn.Embedding(n_genres, embedding_size)
        self.author_embedding = nn.Embedding(n_authors, embedding_size)

        # Fully connected layers
        self.fc1 = nn.Linear(embedding_size * 4 + 4, 128)  # Adjust input size
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

        self.dropout = nn.Dropout(0.3)

    def forward(
        self,
        book_ids,
        user_ids,
        pages,
        ratings_count,
        average_rating,
        publish_date,
        genres,
        authors,
    ):
        # Embeddings
        book_embeds = self.book_embedding(book_ids)
        user_embeds = self.user_embedding(user_ids)
        genre_embeds = self.genre_embedding(genres)
        author_embeds = self.author_embedding(authors)

        pages = pages.unsqueeze(1)  # From shape [batch_size] to [batch_size, 1]
        ratings_count = ratings_count.unsqueeze(1)
        average_rating = average_rating.unsqueeze(1)
        publish_date = publish_date.unsqueeze(1)

        # Concatenate all features
        x = torch.cat(
            [
                book_embeds,
                user_embeds,
                genre_embeds,
                author_embeds,
                pages,
                ratings_count,
                average_rating,
                publish_date,
            ],
            dim=1,
        )

        # Fully connected layers
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x.squeeze()

In [13]:
max_book_id = train_df["book_id"].max()
max_user_id = train_df["user_id"].max()
num_genres = train_df["categories"].max() + 1
num_authors = train_df["authors"].max() + 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

model = BookRatingPredictor(
    max_book_id + 1, max_user_id + 1, num_genres, num_authors, embedding_size=50
)
model.to(device)

Using device: cuda


BookRatingPredictor(
  (book_embedding): Embedding(249243, 50)
  (user_embedding): Embedding(94400, 50)
  (genre_embedding): Embedding(972, 50)
  (author_embedding): Embedding(4323, 50)
  (fc1): Linear(in_features=204, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [14]:
# RMSE Loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def train_model(model, train_loader, criterion, optimizer, num_epochs=5, device=device):
    model.train()  # Set the model to training mode
    for epoch in range(num_epochs):
        total_loss = 0
        for data in train_loader:
            # Unpack the data from the DataLoader
            (
                book_ids,
                user_ids,
                pages,
                ratings_count,
                average_rating,
                publish_date,
                genres,
                authors,
                ratings,
            ) = data

            # Move tensors to the appropriate device
            book_ids = book_ids.to(device)
            user_ids = user_ids.to(device)
            pages = pages.to(device)
            ratings_count = ratings_count.to(device)
            average_rating = average_rating.to(device)
            publish_date = publish_date.to(device)
            genres = genres.to(device)
            authors = authors.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()  # Zero out the gradients

            # Forward pass through the model
            outputs = model(
                book_ids,
                user_ids,
                pages,
                ratings_count,
                average_rating,
                publish_date,
                genres,
                authors,
            )

            # Calculate loss
            loss = criterion(outputs, ratings)

            # Backpropagation and optimization step
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Compute and print RMSE for the epoch
        rmse = torch.sqrt(torch.tensor(total_loss / len(train_loader))).item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss}, RMSE: {rmse}")

In [15]:
train_dataset = BookRatingsDataset(train_df)
test_dataset = BookRatingsDataset(test_df, is_test=True)

In [16]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [17]:
train_model(model, train_loader, criterion, optimizer, num_epochs=29)

Epoch [1/29], Loss: 2055.391490340233, RMSE: 1.6170967817306519
Epoch [2/29], Loss: 1007.1771611571312, RMSE: 1.1319875717163086
Epoch [3/29], Loss: 830.8409435153008, RMSE: 1.0281291007995605
Epoch [4/29], Loss: 652.6757103204727, RMSE: 0.9112498164176941
Epoch [5/29], Loss: 555.1399901211262, RMSE: 0.8404076099395752
Epoch [6/29], Loss: 498.9832522571087, RMSE: 0.7967677116394043
Epoch [7/29], Loss: 454.11483454704285, RMSE: 0.7601014971733093
Epoch [8/29], Loss: 422.8504247367382, RMSE: 0.7334696054458618


KeyboardInterrupt: 

In [18]:
def predict(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    with torch.no_grad():  # Disable gradient calculation for efficiency
        for data in test_loader:
            (
                book_ids,
                user_ids,
                pages,
                genres,
                authors,
                publishers,
                languages,
                publish_dates,
            ) = data
            # Move data to the appropriate device
            book_ids, user_ids, pages = (
                book_ids.to(device),
                user_ids.to(device),
                pages.to(device),
            )
            genres, authors = genres.to(device), authors.to(device)
            publishers, languages = publishers.to(device), languages.to(device)
            publish_dates = publish_dates.to(device)

            outputs = model(
                book_ids,
                user_ids,
                pages,
                genres,
                authors,
                publishers,
                languages,
                publish_dates,
            )
            predictions.extend(
                outputs.cpu().numpy()
            )  # Move predictions to CPU and convert to numpy array
    return predictions


# Prepare the test DataLoader
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Generate predictions
predictions = predict(model, test_loader, device)

In [19]:
predictions

[1.3357809,
 1.2703103,
 1.0420502,
 1.7972617,
 1.699336,
 1.192356,
 1.252356,
 1.0769513,
 2.7856808,
 1.2738022,
 1.310165,
 1.5013684,
 1.1455977,
 1.2593044,
 1.7285088,
 1.2978414,
 1.0803208,
 1.1061174,
 1.1733896,
 1.8144957,
 1.1902926,
 4.23983,
 2.1197567,
 1.4649922,
 1.031712,
 1.1347464,
 2.016117,
 1.8081076,
 1.4784185,
 1.5264933,
 1.4216475,
 1.0130728,
 2.0254185,
 1.2532557,
 1.3089827,
 1.316214,
 1.5891939,
 2.5812736,
 1.6110508,
 0.91034603,
 2.3699665,
 0.93539375,
 1.1965493,
 1.8261071,
 0.7601313,
 1.6999717,
 1.5765573,
 1.3336707,
 1.1921328,
 1.188313,
 0.9232527,
 1.691069,
 2.1097097,
 0.94479734,
 1.6309656,
 2.3354034,
 1.3194557,
 1.4834102,
 1.9687127,
 1.5144086,
 1.8064224,
 1.1272494,
 1.0132694,
 3.1494503,
 1.3902146,
 1.7022454,
 1.7392749,
 1.1470248,
 1.0024407,
 1.5537982,
 1.3734666,
 1.2432529,
 0.9533769,
 1.309085,
 1.2835637,
 0.7563098,
 2.0063636,
 1.6071825,
 1.3256527,
 1.4435061,
 1.2884718,
 1.2306424,
 1.599623,
 1.5087966,
 2

In [None]:
# save predicitons to csv file in the format
# id,rating
test_df["rating"] = predictions
test_df[["id", "rating"]].to_csv("submission.csv", index=False)

In [None]:
from surprise import (
    Dataset,
    Reader,
    SVD,
    KNNBasic,
    KNNWithMeans,
    KNNWithZScore,
    KNNBaseline,
    NMF,
    SlopeOne,
    CoClustering,
    BaselineOnly,
    NormalPredictor,
)
from surprise.model_selection import cross_validate
import pandas as pd

# List of models to test
models = [
    ("SVD", SVD()),
    ("KNNBasic", KNNBasic()),
    ("KNNWithZScore", KNNWithZScore()),
    ("KNNBaseline", KNNBaseline()),
    ("NMF", NMF()),
    ("CoClustering", CoClustering()),
    ("BaselineOnly", BaselineOnly()),
    ("NormalPredictor", NormalPredictor()),
]

# Prepare the data
reader = Reader(rating_scale=(train_df["rating"].min(), train_df["rating"].max()))
data = Dataset.load_from_df(train_df[["user_id", "book_id", "rating"]], reader)

# Iterate over models and cross-validate
results = []
for name, model in models:
    print(f"Evaluating {name}...")
    cv_results = cross_validate(model, data, measures=["RMSE"], cv=5, verbose=True)
    mean_rmse = cv_results["test_rmse"].mean()
    results.append((name, mean_rmse))

# Sort models by RMSE
results.sort(key=lambda x: x[1])

# Display the results
for name, rmse in results:
    print(f"{name}: RMSE = {rmse:.4f}")

Evaluating SVD...
Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9273  0.9324  0.9262  0.9278  0.9306  0.9289  0.0023  
Fit time          1.01    1.06    1.07    1.04    1.07    1.05    0.02    
Test time         0.09    0.07    0.25    0.07    0.06    0.11    0.07    
Evaluating KNNBasic...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1397  1.1417  1.1437  1.1506  1.1429  1.1437  0.0037  
Fit time          2.24    2.65    2.39   

: 