This file Contains code to download the goodread chlidrens book data.

# Download datasets

### Download the goodread data set. Most codes are directly from sample code provided by the data collector, see [their page](https://https://github.com/MengtingWan/goodreads) and the paper body for details.

In [1]:
import pandas as pd
import requests
import os
import json
from google.colab import files
import gzip
from datetime import datetime

In [2]:
def download(local_filename):
      url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/goodreads_books_children.json.gz"
      with requests.get(url, stream=True) as r:
          r.raise_for_status()
          with open(local_filename, 'wb') as f:
              for chunk in r.iter_content(chunk_size=8192):
                  f.write(chunk)
      print('Dataset', "goodreads_books_children.json.gz", 'has been downloaded!')

In [3]:
OUT_DIR = './genre'
if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, 'goodreads_books_children.json.gz')
download(output_path)

Dataset goodreads_books_children.json.gz has been downloaded!


In [4]:
# load data
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)

            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return data

In [5]:
# Function to flatten nested data
def flatten_book_data(book):
    flat_data = {
        "isbn": book.get("isbn", ""),
        "title": book.get("title", ""),
        "average_rating": book.get("average_rating", ""),
        "format": book.get("format", ""),
        "publisher": book.get("publisher", ""),
        "publication_year": book.get("publication_year", ""),
        "num_pages": book.get("num_pages", ""),
        "ratings_count": book.get("ratings_count", ""),
        "text_reviews_count": book.get("text_reviews_count", ""),
        "link": book.get("link", ""),
        "authors": "; ".join([f'{a["author_id"]} ({a["role"]})' for a in book.get("authors", [])]),
        "popular_shelves": "; ".join([shelf["name"] for shelf in book.get("popular_shelves", [])]),
    }
    return flat_data

In [6]:
book_data = load_data(output_path,head = None)

# Flatten the data
if isinstance(book_data, list):
    flattened_data = [flatten_book_data(book) for book in book_data]
else:
    flattened_data = flatten_book_data(book_data)

### Save this data to local device
Code below download the data to this device.

In [None]:
# Save raw data
def process_in_chunks(book_data, chunk_size=1000, output_file="./genre/raw_books_chunk.parquet"):
    """Process the data in chunks to reduce memory consumption and append to Parquet."""
    chunk = []

    # Start by writing the first chunk to the Parquet file
    for i, book in enumerate(book_data):
        # Flatten the book data
        flattened_data = flatten_book_data(book)
        chunk.append(flattened_data)

        # When chunk size is reached or it's the last chunk, process and clear memory
        if (i + 1) % chunk_size == 0 or i + 1 == len(book_data):
            # Convert to DataFrame
            df_chunk = pd.DataFrame(chunk)

            # Check if the file already exists
            try:
                # Try reading the existing Parquet file to append
                existing_df = pd.read_parquet(output_file)
                # Concatenate the new chunk to the existing data
                df_chunk = pd.concat([existing_df, df_chunk], ignore_index=True)
            except FileNotFoundError:
                # If the file doesn't exist, this is the first chunk to write
                pass

            # Write the concatenated DataFrame to Parquet (overwrites the file if it exists)
            df_chunk.to_parquet(output_file, index=False)

            # Clear the chunk list to free memory
            chunk = []
            print(f"Processed {i + 1} books.")

raw_parquet_file = "./genre/raw_books_chunk.parquet"
process_in_chunks(book_data, output_file=raw_parquet_file)
files.download(raw_parquet_file)

Processed 1000 books.
Processed 2000 books.
Processed 3000 books.
Processed 4000 books.
Processed 5000 books.
Processed 6000 books.
Processed 7000 books.
Processed 8000 books.
Processed 9000 books.
Processed 10000 books.
Processed 11000 books.
Processed 12000 books.
Processed 13000 books.
Processed 14000 books.
Processed 15000 books.
Processed 16000 books.
Processed 17000 books.
Processed 18000 books.
Processed 19000 books.
Processed 20000 books.
Processed 21000 books.
Processed 22000 books.
Processed 23000 books.
Processed 24000 books.
Processed 25000 books.
Processed 26000 books.
Processed 27000 books.
Processed 28000 books.
Processed 29000 books.
Processed 30000 books.
Processed 31000 books.
Processed 32000 books.
Processed 33000 books.
Processed 34000 books.
Processed 35000 books.
Processed 36000 books.
Processed 37000 books.
Processed 38000 books.
Processed 39000 books.
Processed 40000 books.
Processed 41000 books.
Processed 42000 books.
Processed 43000 books.
Processed 44000 book

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(len(book_data))
print(len(book_data[0]))

124082
29


There should be 124082 books and 29 attributes.

# Clean data

In [108]:
# remove unused columns
columns_to_keep = [
    'text_reviews_count', 'country_code', 'average_rating', 'num_pages',
    'ratings_count', 'title', 'publication_year', 'publication_month', 'publication_day'
]

def is_valid_date(date_string, date_format="%Y-%m-%d"):
    try:
        datetime.strptime(date_string, date_format)
        return True
    except ValueError:
        return False


def clean_book_data(books_data, columns_to_keep):
    cleaned_books = []
    required_columns = set(columns_to_keep)
    seen_titles_ratings = set()
    TODAY_YEAR = 2024
    TODAY_MONTH = 12
    TODAY_DAY = 1
    clean = True

    for book in books_data:
        clean = True

        # Check if all required columns exist
        if not required_columns.issubset(book.keys()):
            continue  # Skip the book if required columns are missing

        # Check if any required column has an empty string value
        if any(book[column] == '' for column in required_columns):
            continue  # Skip the book if any column has an empty string

        # Filter only the relevant columns and remove entries with empty string values
        cleaned_book = {key: book[key] for key in required_columns if key in book and book[key] != ''}

        # Clean other fields
        cleaned_book["text_reviews_count"] = int(cleaned_book.get("text_reviews_count", "0"))
        cleaned_book["average_rating"] = float(cleaned_book.get("average_rating", "0.00"))
        cleaned_book["num_pages"] = int(cleaned_book.get("num_pages", "0"))
        cleaned_book["ratings_count"] = int(cleaned_book.get("ratings_count", "0"))
        cleaned_book["publication_year"] = int(cleaned_book.get("publication_year", "0"))
        cleaned_book["publication_month"] = int(cleaned_book.get("publication_month", "0"))
        cleaned_book["publication_day"] = int(cleaned_book.get("publication_day", "0"))

        # Check if the (title, average_rating) pair has been seen already (i.e., check for duplicates)
        title_rating_pair = (cleaned_book["title"], cleaned_book["average_rating"])
        if title_rating_pair not in seen_titles_ratings and float(title_rating_pair[1]) >= 1:
            seen_titles_ratings.add(title_rating_pair)
        else:
          clean = False

        if cleaned_book["publication_year"] > TODAY_YEAR:
          clean = False
        elif cleaned_book["publication_year"] == TODAY_YEAR and cleaned_book["publication_month"] > TODAY_MONTH:
          clean = False
        elif cleaned_book["publication_year"] == TODAY_YEAR and cleaned_book["publication_month"] == TODAY_MONTH and cleaned_book["publication_day"] > TODAY_DAY:
          clean = False

        if cleaned_book["ratings_count"] == 0:
          clean = False

        if cleaned_book["ratings_count"] < cleaned_book["text_reviews_count"]:
          clean = False

        date_str = str(cleaned_book["publication_year"]) + "-" + str(cleaned_book["publication_month"]) + "-" + str(cleaned_book["publication_day"])
        if not is_valid_date(date_str):
          clean = False

        if clean:
          # Add the cleaned book if it passes condictions
          cleaned_books.append(cleaned_book)

    return cleaned_books

cleaned_book_data = clean_book_data(book_data, columns_to_keep)

In [109]:
print(len(cleaned_book_data))
print(cleaned_book_data[0])

65118
{'country_code': 'US', 'title': 'The Aeneid for Boys and Girls', 'num_pages': 162, 'publication_year': 2006, 'average_rating': 4.13, 'text_reviews_count': 7, 'publication_month': 9, 'ratings_count': 46, 'publication_day': 13}


### Save Cleaned data to local device
Code below saves data to this device.

In [9]:
# Save cleaned data
def process_in_chunks(book_data, chunk_size=1000, output_file="./genre/cleaned_books_chunk.parquet"):
    """Process the data in chunks to reduce memory consumption and append to Parquet."""
    chunk = []

    #remove file if exists
    if os.path.exists(output_file):
      os.remove(output_file)

    # Start by writing the first chunk to the Parquet file
    for i, book in enumerate(book_data):
        chunk.append(book_data[i])

        # When chunk size is reached or it's the last chunk, process and clear memory
        if (i + 1) % chunk_size == 0 or i + 1 == len(book_data):
            # Convert to DataFrame
            df_chunk = pd.DataFrame(chunk)

            # Check if the file already exists
            try:
                # Try reading the existing Parquet file to append
                existing_df = pd.read_parquet(output_file)
                # Concatenate the new chunk to the existing data
                df_chunk = pd.concat([existing_df, df_chunk], ignore_index=True)
            except FileNotFoundError:
                # If the file doesn't exist, this is the first chunk to write
                pass

            # Write the concatenated DataFrame to Parquet (overwrites the file if it exists)
            df_chunk.to_parquet(output_file, index=False)

            # Clear the chunk list to free memory
            chunk = []
            print(f"Processed {i + 1} books.")

cleaned_parquet_file = "./genre/cleaned_books.parquet"
process_in_chunks(cleaned_book_data, output_file=cleaned_parquet_file)
files.download(cleaned_parquet_file)

Processed 1000 books.
Processed 2000 books.
Processed 3000 books.
Processed 4000 books.
Processed 5000 books.
Processed 6000 books.
Processed 7000 books.
Processed 8000 books.
Processed 9000 books.
Processed 10000 books.
Processed 11000 books.
Processed 12000 books.
Processed 13000 books.
Processed 14000 books.
Processed 15000 books.
Processed 16000 books.
Processed 17000 books.
Processed 18000 books.
Processed 19000 books.
Processed 20000 books.
Processed 21000 books.
Processed 22000 books.
Processed 23000 books.
Processed 24000 books.
Processed 25000 books.
Processed 26000 books.
Processed 27000 books.
Processed 28000 books.
Processed 29000 books.
Processed 30000 books.
Processed 31000 books.
Processed 32000 books.
Processed 33000 books.
Processed 34000 books.
Processed 35000 books.
Processed 36000 books.
Processed 37000 books.
Processed 38000 books.
Processed 39000 books.
Processed 40000 books.
Processed 41000 books.
Processed 42000 books.
Processed 43000 books.
Processed 44000 book

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Test Data
This section tests if the data are cleaned and ready for use

In [10]:
def test_cleaned_books_data(books_data, columns_to_keep):
    # Check that each book contains all required columns
    required_columns = set(columns_to_keep)

    for idx, book in enumerate(books_data):
        # Check if all required columns exist
        if not required_columns.issubset(book.keys()):
            print(f"Book {idx} is missing required columns: {required_columns - set(book.keys())}")
            return False

        # Check if any column has an empty string value
        for column in required_columns:
            if book[column] == '':
                print(f"Book {idx} has an empty string in column '{column}'")
                return False

    print("Data is clean and ready for use!")
    return True


# Columns to check against
columns_to_keep = [
    'text_reviews_count', 'country_code', 'average_rating', 'num_pages',
    'ratings_count', 'title', 'publication_year', 'publication_month', 'publication_day'
]

# Run the test
is_data_clean = test_cleaned_books_data(cleaned_book_data, columns_to_keep)

# Print the result
print("Is the data clean:", is_data_clean)

Data is clean and ready for use!
Is the data clean: True


In [11]:
print(len(cleaned_book_data))
print(len(cleaned_book_data[0]))

65118
9


There are 76600 books after processing.

# Model
This section bulids a model that predicts average rating.

# Model(Bert)

In [62]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
import torch.optim as optim
from tqdm import tqdm
import random
from sklearn.model_selection import train_test_split
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Hyperparameters
criterion = nn.MSELoss()
MAX_LEN = 30
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 0.000013
FROZEN = False
SEED = 304

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(SEED)
random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Dataset Class
class BookDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        title = item['title']
        rating = float(item['average_rating'])
        encoding = self.tokenizer(
            title,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'rating': torch.tensor(rating, dtype=torch.float)
        }

class BertForRegression(nn.Module):
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super(BertForRegression, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)  # Single regression output

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output  # Using the pooler output (embedding of the [CLS] token)
        regression_output = self.regressor(cls_output)
        return regression_output


# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForRegression(pretrained_model_name='bert-base-uncased').to(device)

# Prepare Dataset and DataLoader
dataset = BookDataset(cleaned_book_data, tokenizer, MAX_LEN)

# Split into train, validation, and test datasets
train_data, val_test_data = train_test_split(dataset.data, test_size=0.3, random_state=SEED)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=SEED)

# Create datasets and dataloaders for train, validation, and test sets
train_dataset = BookDataset(train_data, tokenizer, MAX_LEN)
val_dataset = BookDataset(val_data, tokenizer, MAX_LEN)
test_dataset = BookDataset(test_data, tokenizer, MAX_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

if FROZEN:
  for name, param in model.named_parameters():
    # Only compute gradients for parameters of our
    # newly added regressor. BERT will not be trained.
    if 'regressor' not in name:
      param.requires_grad = False

# Training Loop
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0

    with tqdm(train_dataloader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch+1}/{EPOCHS}")

        for batch in tepoch:
            # Move input tensors to the correct device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ratings = batch['rating'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.squeeze(-1)  # Flatten the output to match ratings shape

            # Calculate loss
            loss = F.mse_loss(predictions, ratings)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

            # Update progress bar description with the current loss
            tepoch.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Validation Loop
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        with tqdm(val_dataloader, unit="batch") as vepoch:
            vepoch.set_description(f"Validation {epoch+1}/{EPOCHS}")

            for batch in vepoch:
                # Move input tensors to the correct device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                ratings = batch['rating'].to(device)

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predictions = outputs.squeeze(-1)  # Flatten the output to match ratings shape

                # Calculate loss
                loss = F.mse_loss(predictions, ratings)
                total_val_loss += loss.item()

                # Update progress bar description with the current loss
                vepoch.set_postfix(val_loss=loss.item())

    avg_val_loss = total_val_loss / len(val_dataloader)

    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

print("Training complete.")

Using device: cuda


Epoch 1/2: 100%|██████████| 2849/2849 [05:18<00:00,  8.93batch/s, loss=0.216]
Validation 1/2: 100%|██████████| 611/611 [00:20<00:00, 29.87batch/s, val_loss=0.203]


Epoch 1/2, Train Loss: 0.1970, Validation Loss: 0.1280


Epoch 2/2: 100%|██████████| 2849/2849 [05:24<00:00,  8.79batch/s, loss=0.0833]
Validation 2/2: 100%|██████████| 611/611 [00:20<00:00, 29.59batch/s, val_loss=0.185]

Epoch 2/2, Train Loss: 0.1236, Validation Loss: 0.1219
Training complete.





In [63]:
from sklearn.metrics import r2_score

# Evaluation on Test Data with Accuracy and R² Calculation
model.eval()
total_test_loss = 0
correct_predictions = 0
total_predictions = 0
accuracy_margin = 0.25
true_ratings = []
predicted_ratings = []

with torch.no_grad():
    with tqdm(test_dataloader, unit="batch") as tepoch:
        tepoch.set_description("Evaluating Test Set")
        for batch in tepoch:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ratings = batch['rating'].to(device)

            # Forward pass
            predictions = model(input_ids=input_ids, attention_mask=attention_mask).squeeze(-1)

            # Compute the loss
            loss = criterion(predictions, ratings)
            total_test_loss += loss.item()

            # Calculate accuracy: check if prediction is within 0.25 of the actual rating
            correct_predictions_batch = torch.abs(predictions - ratings) <= accuracy_margin
            correct_predictions += correct_predictions_batch.sum().item()
            total_predictions += len(ratings)

            # Store true and predicted ratings for R² computation
            true_ratings.extend(ratings.cpu().numpy())
            predicted_ratings.extend(predictions.cpu().numpy())

            tepoch.set_postfix(test_loss=loss.item(), accuracy=(correct_predictions / total_predictions) * 100)

# Compute final test accuracy
accuracy = (correct_predictions / total_predictions) * 100
avg_test_loss = total_test_loss / len(test_dataloader)

# Compute R² score
r2 = r2_score(true_ratings, predicted_ratings)

print(f"Test Loss: {avg_test_loss:.4f}")
print(f"Test Accuracy (within 0.25 of target): {accuracy:.2f}%")
print(f"R² Score: {r2:.4f}")


Evaluating Test Set: 100%|██████████| 611/611 [00:20<00:00, 29.99batch/s, accuracy=58, test_loss=0.0602]

Test Loss: 0.1271
Test Accuracy (within 0.25 of target): 57.95%
R² Score: 0.0794





In [64]:
# Save the model's state_dict to a file in Colab's local environment
torch.save(model.state_dict(), '/content/book_rating.pth')

# Optionally, save the optimizer's state as well
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, '/content/checkpoint.pth')

print("Model saved to /content/")

from google.colab import files

# Download the model
files.download('/content/book_rating.pth')

Model saved to /content/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Example For a Tokenized Input

In [31]:
# Example For token
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.DataFrame(cleaned_book_data)

inputs = df.title.values

tokenized_inputs = bert_tokenizer(
    inputs.tolist(),          # Input text
    add_special_tokens=True,  # add '[CLS]' and '[SEP]'
    padding='max_length',
    truncation=True, # pad to a length specified by the max_length
    max_length=MAX_LEN,       # truncate all sentences longer than max_length
    return_tensors='pt',      # return everything we need as PyTorch tensors
)

input_ids = tokenized_inputs['input_ids']
attention_masks = tokenized_inputs['attention_mask']

In [52]:
req = 23

# Print sentence 0, now as a list of IDs.
print('Original: ', tokenized_inputs['input_ids'][req])
print('* Token IDs:', tokenized_inputs['attention_mask'][req])
print('* Tokenized:', bert_tokenizer.decode(tokenized_inputs['input_ids'][req]))
print('* Attention_mask', tokenized_inputs['attention_mask'][req])

# Get the token IDs and decode them back to tokens
input_ids = tokenized_inputs['input_ids'][req].squeeze(0)  # Get the tensor and remove batch dimension
tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)  # Convert IDs back to tokens

print(tokens)

Original:  tensor([  101,  9999,  2395, 24115,  1024,  3500,  7292, 28766,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
* Token IDs: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])
* Tokenized: [CLS] bradford street buddies : springtime blossoms [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
* Attention_mask tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])
['[CLS]', 'bradford', 'street', 'buddies', ':', 'spring', '##time', 'blossoms', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


### Some Testing and Prediction On the Model

In [67]:
# Function to evaluate a dataset
def evaluate_model(dataloader, model, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    ground_truth = []

    with torch.no_grad():
        with tqdm(dataloader, unit="batch") as tepoch:
            for batch in tepoch:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                ratings = batch['rating'].to(device)

                # Directly access the output tensor
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                predicted_ratings = outputs.squeeze(-1)  # Remove extra dimension

                loss = criterion(predicted_ratings, ratings)
                total_loss += loss.item()

                predictions.extend(predicted_ratings.cpu().numpy())
                ground_truth.extend(ratings.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    return avg_loss, predictions, ground_truth


In [110]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForRegression(pretrained_model_name='bert-base-uncased').to(device)

# Prepare Dataset and DataLoader
dataset = BookDataset(cleaned_book_data, tokenizer, MAX_LEN)

# Split into train, validation, and test datasets
train_data, val_test_data = train_test_split(dataset.data, test_size=0.3, random_state=SEED)
val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=SEED)

# Create datasets and dataloaders for train, validation, and test sets
train_dataset = BookDataset(train_data, tokenizer, MAX_LEN)
val_dataset = BookDataset(val_data, tokenizer, MAX_LEN)
test_dataset = BookDataset(test_data, tokenizer, MAX_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [112]:
import copy

# Modify the test dataset by adding "!" to each title
modified_test_data = copy.deepcopy(test_data)

for item in modified_test_data:
    item['title'] = item['title'] + "!"

# Create a new dataset for the modified test data
modified_test_dataset = BookDataset(modified_test_data, tokenizer, MAX_LEN)
modified_test_dataloader = DataLoader(modified_test_dataset, batch_size=BATCH_SIZE)

# Function to evaluate a dataset

# Evaluate the original test dataset
original_test_loss, original_predictions, original_ground_truth = evaluate_model(test_dataloader, model, criterion, device)

# Evaluate the modified test dataset (with "!" added to the titles)
modified_test_loss, modified_predictions, modified_ground_truth = evaluate_model(modified_test_dataloader, model, criterion, device)


# Calculate Mean Difference between original and modified predictions
original_predictions = np.array(original_predictions)
modified_predictions = np.array(modified_predictions)

print(np.mean((original_predictions - modified_predictions)))


100%|██████████| 611/611 [00:21<00:00, 28.78batch/s]
100%|██████████| 611/611 [00:20<00:00, 29.82batch/s]

-0.033934467





In [130]:
# Now we remove "the"
copy_test_data = copy.deepcopy(test_data)
unmodified = []
modified_test_data = []

for item in copy_test_data:
      if "the " in item['title']:
        unmodified.append(copy.deepcopy(item))
        item['title'] = item['title'].replace("the ", '')
        modified_test_data.append(item)

for item in copy_test_data:
      if "The " in item['title']:
        unmodified.append(copy.deepcopy(item))
        item['title'] = item['title'].replace("The ", '')
        modified_test_data.append(item)

# Create a new dataset for the modified test data
modified_test_dataset = BookDataset(modified_test_data, tokenizer, MAX_LEN)
modified_test_dataloader = DataLoader(modified_test_dataset, batch_size=BATCH_SIZE)

unmodified_test_dataset = BookDataset(unmodified, tokenizer, MAX_LEN)
unmodified_test_dataloader = DataLoader(unmodified_test_dataset, batch_size=BATCH_SIZE)

# Function to evaluate a dataset

# Evaluate the original test dataset
original_test_loss, original_predictions, original_ground_truth = evaluate_model(unmodified_test_dataloader, model, criterion, device)


modified_test_loss, modified_predictions, modified_ground_truth = evaluate_model(modified_test_dataloader, model, criterion, device)



# Calculate Mean Squared Error between original and modified predictions
original_predictions = np.array(original_predictions)
modified_predictions = np.array(modified_predictions)

print(np.mean((original_predictions - modified_predictions)))
print(len(unmodified))

100%|██████████| 252/252 [00:09<00:00, 27.24batch/s]
100%|██████████| 252/252 [00:07<00:00, 32.55batch/s]

-0.011697753
4028





In [131]:
print(modified_test_data)



In [132]:
# Now we replace period with !
copy_test_data = copy.deepcopy(test_data)
unmodified = []
modified_test_data = []

for item in copy_test_data:
      if "." in item['title']:
        unmodified.append(copy.deepcopy(item))
        item['title'] = item['title'].replace(".", '!')
        modified_test_data.append(item)


# Create a new dataset for the modified test data
modified_test_dataset = BookDataset(modified_test_data, tokenizer, MAX_LEN)
modified_test_dataloader = DataLoader(modified_test_dataset, batch_size=BATCH_SIZE)

unmodified_test_dataset = BookDataset(unmodified, tokenizer, MAX_LEN)
unmodified_test_dataloader = DataLoader(unmodified_test_dataset, batch_size=BATCH_SIZE)

# Function to evaluate a dataset

# Evaluate the original test dataset
original_test_loss, original_predictions, original_ground_truth = evaluate_model(unmodified_test_dataloader, model, criterion, device)


modified_test_loss, modified_predictions, modified_ground_truth = evaluate_model(modified_test_dataloader, model, criterion, device)



# Calculate Mean Squared Error between original and modified predictions
original_predictions = np.array(original_predictions)
modified_predictions = np.array(modified_predictions)

print(np.mean((original_predictions - modified_predictions)))
print(len(unmodified))

100%|██████████| 18/18 [00:01<00:00, 17.59batch/s]
100%|██████████| 18/18 [00:00<00:00, 25.26batch/s]

0.02613376
286





In [125]:
print(modified_test_data)

[{'country_code': 'US', 'title': 'Ullucyate Bug Book', 'num_pages': 10, 'publication_year': 1993, 'average_rating': 4.0, 'text_reviews_count': 1, 'publication_month': 10, 'ratings_count': 2, 'publication_day': 15}, {'country_code': 'US', 'title': "The Story of Princess Olivia: Wherein an Oplucyistic Slip of a Girl Brings Sunshine Into the Lives of Her Royal Parents, the Whiny King and the Scolding Queen, and Outsmarts the Despicable Count Carlos Maximillion Von Dusseldorf (with Two S's) and His M...", 'num_pages': 143, 'publication_year': 2013, 'average_rating': 3.3, 'text_reviews_count': 5, 'publication_month': 10, 'ratings_count': 15, 'publication_day': 1}, {'country_code': 'US', 'title': "Humphrey's Bedlucye", 'num_pages': 32, 'publication_year': 2001, 'average_rating': 3.92, 'text_reviews_count': 12, 'publication_month': 10, 'ratings_count': 61, 'publication_day': 1}, {'country_code': 'US', 'title': 'Slucyp!', 'num_pages': 32, 'publication_year': 2011, 'average_rating': 3.85, 'text

In [None]:
# Now we replace period with !
copy_test_data = copy.deepcopy(test_data)
unmodified = []
modified_test_data = []

for item in copy_test_data:
      if "." in item['title']:
        unmodified.append(copy.deepcopy(item))
        item['title'] = item['title'].replace(".", '!')
        modified_test_data.append(item)


# Create a new dataset for the modified test data
modified_test_dataset = BookDataset(modified_test_data, tokenizer, MAX_LEN)
modified_test_dataloader = DataLoader(modified_test_dataset, batch_size=BATCH_SIZE)

unmodified_test_dataset = BookDataset(unmodified, tokenizer, MAX_LEN)
unmodified_test_dataloader = DataLoader(unmodified_test_dataset, batch_size=BATCH_SIZE)

# Function to evaluate a dataset

# Evaluate the original test dataset
original_test_loss, original_predictions, original_ground_truth = evaluate_model(unmodified_test_dataloader, model, criterion, device)


modified_test_loss, modified_predictions, modified_ground_truth = evaluate_model(modified_test_dataloader, model, criterion, device)



# Calculate Mean Squared Error between original and modified predictions
original_predictions = np.array(original_predictions)
modified_predictions = np.array(modified_predictions)

print(np.mean((original_predictions - modified_predictions)))
print(len(unmodified))