In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim

In [1]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [1]:
# Tokenize and convert text to embeddings
def get_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = bert_model(**inputs)
    return outputs.pooler_output

In [1]:
# Load the data
movies = pd.read_csv('Dataset-100k/u.item', sep='|', encoding='latin-1', header=None, names=['movieId', 'title', 'release_date', 'video_release_date', 'imdb_url'] + list(range(19)))

ratings = pd.read_csv('Dataset-100k/u.data', sep='\t', header=None, names=['userId', 'movieId', 'rating', 'timestamp'])

# Merge datasets
data = pd.merge(ratings, movies[['movieId', 'title']], on='movieId')

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [1]:
# Model architecture
class RatingPredictor(nn.Module):
    def __init__(self, input_size):
        super(RatingPredictor, self).__init__()
        self.fc = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.fc(x)

# Hyperparameters
input_size = 768  # Size of BERT embeddings
lr = 0.001
epochs = 5

# Rating Predictor Model, Loss, Optimizer
rating_predictor_model = RatingPredictor(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(rating_predictor_model.parameters(), lr=lr)

In [1]:
# Training loop for 200 iterations- can be extended to the whole dataset
max_iterations = 200
for epoch in range(epochs):
    total_loss = 0.0
    for i, (_, row) in enumerate(train_data.iterrows()):
        text = row['title']
        rating = row['rating']

        # Tokenization and embedding
        embedding = get_embeddings(text)

        # Forward pass
        output = rating_predictor_model(embedding)
        loss = criterion(output, torch.tensor([[rating]], dtype=torch.float))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Break the loop if we reach the specified number of iterations
        if i == max_iterations - 1:
            break

    # Print the average loss for this epoch
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {total_loss / len(train_data)}")

    # Break the outer loop if we reach the specified number of iterations
    if i == max_iterations - 1:
        break

In [1]:
# Evaluation
rating_predictor_model.eval()
predictions = []
labels = []

# Limit the evaluation loop to 200 iterations for testing
max_eval_iterations = 200
for i, (_, row) in enumerate(test_data.iterrows()):
    text = row['title']
    rating = row['rating']

    # Tokenization and embedding
    embedding = get_embeddings(text)

    # Prediction
    output = rating_predictor_model(embedding)
    predictions.append(output.item())
    labels.append(rating)

    # Break the loop if we reach the specified number of iterations
    if i == max_eval_iterations - 1:
        break

# Print predictions and labels during evaluation
for pred, label in zip(predictions, labels):
    print(f"Prediction: {pred}, Label: {label}")

In [1]:
# Calculate RMSE
rmse = ((torch.tensor(predictions) - torch.tensor(labels)) ** 2).mean().sqrt()
print(f"RMSE: {rmse}")

Epoch 1/5, Average Loss: 0.003368861171678262
Prediction: 3.3546266555786133, Label: 3
Prediction: 3.451507568359375, Label: 5
Prediction: 3.2526540756225586, Label: 4
Prediction: 3.448939323425293, Label: 3
Prediction: 3.1001851558685303, Label: 3
Prediction: 3.4318535327911377, Label: 4
Prediction: 3.4206299781799316, Label: 4
Prediction: 3.268927812576294, Label: 1
Prediction: 3.4649946689605713, Label: 4
Prediction: 3.4651589393615723, Label: 4
Prediction: 3.3941872119903564, Label: 5
Prediction: 3.4022653102874756, Label: 4
Prediction: 3.310976505279541, Label: 3
Prediction: 3.0886828899383545, Label: 4
Prediction: 3.2893385887145996, Label: 2
Prediction: 3.4471991062164307, Label: 3
Prediction: 3.4144036769866943, Label: 5
Prediction: 3.477458953857422, Label: 5
Prediction: 3.215217113494873, Label: 3
Prediction: 3.4388082027435303, Label: 2
Prediction: 3.477344512939453, Label: 4
Prediction: 3.425541877746582, Label: 3
Prediction: 3.2217655181884766, Label: 4
Prediction: 3.44524