In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

2024-11-26 02:45:42.465382: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-26 02:45:42.466171: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-26 02:45:42.468911: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-26 02:45:42.476226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732585542.488410  160208 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732585542.49

In [2]:
import numpy as np
# Load the precomputed book text vectors (stored under the "embeddings" key of an
# .npz archive) and the category vectors (a plain .npy array).
#
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code from a tampered file. Keep the flag only while these files
# are trusted and genuinely contain object arrays; otherwise drop it.
#
# The archive is opened in a `with` block so its file handle is closed as soon as the
# array has been read, instead of staying open after the name is rebound to the array.
with np.load("../data/book_text_embeddings_bge.npz", allow_pickle=True) as embedding_archive:
    book_text_embeddings = embedding_archive["embeddings"]
category_embeddings = np.load("../data/category_embeddings.npy", allow_pickle=True)

In [6]:
# Read the book metadata and attach the precomputed vectors as two extra columns.
# Row i receives entry i of each array's .tolist(), so the arrays must be in the
# same order (and of the same length) as the rows of the CSV.
books = pd.read_csv("../data/extended_books_google_embeddings.csv").assign(
    text_embeddings=book_text_embeddings.tolist(),
    category_embeddings=category_embeddings.tolist(),
)

In [12]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

# Users that occur in the training ratings but never in the test set
train_users = set(train_df["user_id"].unique())
test_users = set(test_df["user_id"].unique())
unique_train_users = train_users - test_users

# Among those train-only users, select the ones with fewer than two ratings
user_rating_counts = train_df["user_id"].value_counts()
train_only_counts = user_rating_counts[user_rating_counts.index.isin(unique_train_users)]
low_activity_users = train_only_counts[train_only_counts < 2].index

# Drop every rating row that belongs to one of the selected users
rows_to_keep = ~train_df["user_id"].isin(low_activity_users)
train_df = train_df[rows_to_keep]

print(f"Filtered train users: {train_df['user_id'].nunique(dropna=False)}")

Filtered train users: 8534


In [13]:
# Inner join (the pandas default): rating rows whose book_id has no match in
# `books` are dropped, and every kept row gains the book columns.
train_df = pd.merge(train_df, books, how="inner", on="book_id")
train_df.shape

(90152, 20)

In [14]:
# Ids are cast to strings; the model feeds them to StringLookup layers.
for id_column in ("user_id", "book_id"):
    train_df[id_column] = train_df[id_column].astype(str)

# Hold out 10% of the rows for validation (fixed seed keeps the split repeatable)
train_data, val_data = train_test_split(train_df, random_state=42, test_size=0.1)


def _ratings_frame_to_dataset(frame):
    """Return an unbatched tf.data.Dataset of feature dicts built from `frame`.

    `frame` must provide the columns user_id, book_id, rating, text_embeddings
    and category_embeddings; each dataset element is one row of those columns.
    """
    feature_columns = {
        "user_id": frame["user_id"].values,
        "book_id": frame["book_id"].values,
        "rating": frame["rating"].values.astype('float32'),
        # Plain Python lists, for tf.data compatibility
        "text_embeddings": frame["text_embeddings"].tolist(),
        "category_embeddings": frame["category_embeddings"].tolist(),
    }
    return tf.data.Dataset.from_tensor_slices(feature_columns)


train_dataset = _ratings_frame_to_dataset(train_data)
val_dataset = _ratings_frame_to_dataset(val_data)

W0000 00:00:1732585621.887512  160208 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [15]:
class RatingModelMTL(tfrs.models.Model):
    """Multi-task rating model: one shared trunk feeding two heads.

    Learned user/book id embeddings and Dense projections of the precomputed
    text and category vectors are concatenated and passed through a shared MLP.
    Two heads read the shared representation:

    * a regression head that outputs the rating as a single scalar, and
    * a softmax head over 9 rating classes.

    The training loss is
    ``regression_loss + classification_weight * classification_loss``.

    The id vocabularies are read from the module-level ``train_df`` at
    construction time, so ``train_df`` must already hold string ``user_id`` and
    ``book_id`` columns when the model is instantiated.

    Args:
        classification_weight: Multiplier applied to the classification loss
            before it is added to the regression loss. Defaults to 4.0.
    """

    # Number of outputs of the classification head (ratings 1.0 to 5.0, see compute_loss)
    NUM_RATING_CLASSES = 9

    def __init__(self, classification_weight=4.0):
        super().__init__()
        self.classification_weight = classification_weight

        # Vocabularies (computed once and reused for both the lookup and the sizes)
        user_vocabulary = train_df["user_id"].unique()
        book_vocabulary = train_df["book_id"].unique()
        user_vocab_size = len(user_vocabulary)
        book_vocab_size = len(book_vocabulary)

        # Embedding dimensions: square root of the vocabulary size, capped at 100
        embedding_dim_user = min(100, int(user_vocab_size ** 0.5))
        embedding_dim_book = min(100, int(book_vocab_size ** 0.5))

        # User and Book Embeddings.
        # The "+ 1" leaves room for the lookup's out-of-vocabulary index.
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=user_vocabulary, mask_token=None),
            tf.keras.layers.Embedding(user_vocab_size + 1, embedding_dim_user)
        ])
        self.book_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=book_vocabulary, mask_token=None),
            tf.keras.layers.Embedding(book_vocab_size + 1, embedding_dim_book)
        ])

        # Text Embedding Processing
        self.text_embedding_layer = tf.keras.layers.Dense(128, activation="relu")

        # Category Embedding Processing
        self.category_embedding_layer = tf.keras.layers.Dense(128, activation="relu")

        # Shared representation (Deep Component)
        self.shared_model = tf.keras.Sequential([
            tf.keras.layers.Dense(512, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.2)
        ])

        # Regression Head
        self.regression_head = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1)  # Predict continuous rating
        ])

        # Classification Head
        self.classification_head = tf.keras.Sequential([
            tf.keras.layers.Dense(32, activation="relu"),
            # Predict 9 rating classes (1.0 to 5.0)
            tf.keras.layers.Dense(self.NUM_RATING_CLASSES, activation="softmax")
        ])

        # Task metrics
        self.regression_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )
        self.classification_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )

    def _predict_heads(self, features, training=False):
        """Run the forward pass and return both head outputs.

        Args:
            features: Mapping with the keys "user_id", "book_id",
                "text_embeddings" and "category_embeddings".
            training: Forwarded to the shared trunk and the heads so that
                Dropout and BatchNormalization run in training mode only
                while fitting.

        Returns:
            Tuple ``(regression_output, classification_output)``.
        """
        # Compute embeddings
        user_embeddings = self.user_embedding(features["user_id"])
        book_embeddings = self.book_embedding(features["book_id"])
        text_embeddings = tf.cast(features["text_embeddings"], tf.float32)
        text_embeddings = self.text_embedding_layer(text_embeddings)
        category_embeddings = tf.cast(features["category_embeddings"], tf.float32)
        category_embeddings = self.category_embedding_layer(category_embeddings)

        # Combine embeddings
        interaction = tf.concat(
            [user_embeddings, book_embeddings, text_embeddings, category_embeddings], axis=1
        )

        # Shared representation. The training flag must be passed explicitly:
        # compute_loss is not invoked through the model's __call__, so the
        # layers would otherwise always run in inference mode.
        shared_output = self.shared_model(interaction, training=training)

        # Regression and Classification outputs
        regression_output = self.regression_head(shared_output, training=training)
        classification_output = self.classification_head(shared_output, training=training)
        return regression_output, classification_output

    def compute_loss(self, features, training=False):
        """Return the weighted sum of the regression and classification losses.

        Args:
            features: Mapping with the model inputs plus a "rating" entry.
            training: Whether the forward pass runs in training mode.

        Returns:
            ``regression_loss + classification_weight * classification_loss``.
        """
        regression_output, classification_output = self._predict_heads(features, training=training)

        ratings = tf.cast(features["rating"], tf.float32)
        # Map 1.0 → 0, 1.5 → 1, ..., 5.0 → 8. Rounding before the integer cast
        # keeps the label stable if a rating carries floating-point noise
        # (a bare cast truncates).
        class_labels = tf.cast(tf.round(ratings * 2.0 - 2.0), tf.int32)

        regression_loss = self.regression_task(
            labels=ratings, predictions=tf.squeeze(regression_output, axis=-1)
        )
        classification_loss = self.classification_task(
            labels=class_labels, predictions=classification_output
        )

        # Combine losses (weighted)
        total_loss = regression_loss + self.classification_weight * classification_loss
        return total_loss


In [16]:
# Build the model and compile it with Adam; run_eagerly=True executes the
# training step eagerly instead of as a compiled graph.
model = RatingModelMTL()
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=adam_optimizer, run_eagerly=True)

# One pass over the training data in batches of 32, validating on the held-out split
batched_train = train_dataset.batch(32)
batched_val = val_dataset.batch(32)
history = model.fit(batched_train, validation_data=batched_val, epochs=1)

[1m2536/2536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 84ms/step - loss: 7.7512 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 0.9316 - sparse_categorical_accuracy: 0.2890 - total_loss: 7.7512 - val_loss: 6.9785 - val_root_mean_squared_error: 0.8133 - val_sparse_categorical_accuracy: 0.3485 - val_regularization_loss: 0.0000e+00 - val_total_loss: 6.9785


## Predict

In [17]:
# Reload the raw test pairs from disk so scoring starts from an unmodified frame
test_csv_path = "../data/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df

Unnamed: 0,id,book_id,user_id
0,0,3786,40484
1,1,1985,47039
2,2,2290,60111
3,3,118657,64447
4,4,1560,2953
...,...,...,...
29362,29362,2802,12312
29363,29363,53552,25725
29364,29364,4065,77178
29365,29365,1290,23201


In [18]:

# Attach the book columns to each test row by book_id. A left join keeps every
# test row; rows whose book_id has no match in `books` get NaN book columns.
test_df = pd.merge(test_df, books, how="left", on="book_id")

In [19]:

# Score every test row with the regression head and write the submission file.

# Ids must be strings, matching what the model's lookup layers were built with
test_df["user_id"] = test_df["user_id"].astype(str)
test_df["book_id"] = test_df["book_id"].astype(str)

# The left merge with `books` leaves NaN in the embedding columns for any book
# it could not match. Such rows cannot be turned into tensors, so fail here
# with an explicit message instead of an opaque conversion error below.
rows_missing_embeddings = test_df[["text_embeddings", "category_embeddings"]].isna().any(axis=1)
if rows_missing_embeddings.any():
    raise ValueError(
        f"{int(rows_missing_embeddings.sum())} test rows have no book embeddings; "
        "cannot build the prediction dataset."
    )

# Create TensorFlow dataset
test_dataset = tf.data.Dataset.from_tensor_slices({
    "user_id": test_df["user_id"].values,
    "book_id": test_df["book_id"].values,
    "text_embeddings": test_df["text_embeddings"].tolist(),
    "category_embeddings": test_df["category_embeddings"].tolist(),
}).batch(32)  # Batch for efficiency

# Predictions list
predictions = []

# Loop through the test dataset
for batch in test_dataset:
    user_ids = batch["user_id"]
    book_ids = batch["book_id"]

    # Get embeddings
    user_embeddings = model.user_embedding(user_ids)
    book_embeddings = model.book_embedding(book_ids)
    text_embeddings = tf.cast(batch["text_embeddings"], tf.float32)
    text_embeddings = model.text_embedding_layer(text_embeddings)
    category_embeddings = tf.cast(batch["category_embeddings"], tf.float32)
    category_embeddings = model.category_embedding_layer(category_embeddings)

    # Shared representation, explicitly in inference mode (no dropout,
    # batch-norm uses its moving statistics)
    interaction = tf.concat([user_embeddings, book_embeddings, text_embeddings, category_embeddings], axis=1)
    shared_output = model.shared_model(interaction, training=False)

    # Use the regression head for predictions
    batch_predictions = model.regression_head(shared_output, training=False)

    # Convert to numpy and flatten
    predictions.extend(batch_predictions.numpy().flatten())

# Save predictions. The path is held in one variable so the confirmation
# message always names the file that was actually written.
output_path = "mtl.csv"
test_df["rating"] = predictions
output_df = test_df[["id", "rating"]]
output_df.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


Predictions saved to predictions.csv


2024-11-26 02:52:09.089882: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
