In [15]:
# Part 1: Store the dataset and subsample it
import pandas as pd
# Only the first 40,000-row chunk is used by the cells visible in this notebook,
# so stop reading after it instead of parsing the whole review file and keeping
# every chunk in memory.
chunk_array = []
with pd.read_json("../yelp_academic_dataset_review.json", orient="records", lines=True, chunksize=40000) as reader:
    for chunk in reader:
        chunk_array.append(chunk)
        break  # one chunk is the whole subsample; leaving the `with` closes the file
# chunk_array is kept (holding just that one chunk) so chunk_array[0] remains valid.
raw_dataset = chunk_array[0]

In [16]:
# Print the plain-text repr of the subsampled DataFrame; pandas elides the middle
# rows with "..." in this view, as the saved output below shows.
print(raw_dataset)

                    review_id                 user_id             business_id  \
0      KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1      BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2      saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3      AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4      Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   
...                       ...                     ...                     ...   
39995  EYkouQA9oWuiWDNZsYl7aA  WfiilB5OXV7vSmHP-80n-A  HQ-C47_Xi5it1KzwEc0u0A   
39996  xZi6gbagKAzqCKjtDLrhGQ  BDEi5eV-uhP4A4atMNzW5w  ena3aLdMz2ym_OPVuTIJ2g   
39997  bxEjtoD74xPBJnMtV2759A  GLCcS7HGPa7MD997xq5W9w  34Eqv8jXgxg_EEwcsNgeeg   
39998  xT8DOnqIu_7N-9AnkFftaQ  bNnBwW5kNO77KTgMeVhxKg  F2C5ENuY8CXfgoW-gAMdDA   
39999  Wy7Njv1S0SaLEk9Bj-ZHPw  ZVREpaL2TPWMtUDJaUZulg  ORL4JE6tz3rJxVqkdKfegA   

       stars  useful  funny

In [17]:
# Part 2: Split dataset into train, validation, and test sets
# Split boundaries, expressed as row positions in raw_dataset.
# Note that val_size is the END position of the validation slice, not its length:
#   train = rows [0, train_size), validation = rows [train_size, val_size),
#   test  = rows [val_size, end).
train_size = 20000
val_size = 30000

# Cut the three contiguous, non-overlapping partitions by position
train_df, val_df, test_df = (
    raw_dataset.iloc[rows]
    for rows in (
        slice(None, train_size),
        slice(train_size, val_size),
        slice(val_size, None),
    )
)

In [18]:
# Part 3: Tokenize the dataset
from transformers import BertTokenizer

# Load the tokenizer that matches the pre-trained checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Pull the raw review strings out of each split as plain Python lists
train_texts, val_texts, test_texts = (
    split['text'].tolist() for split in (train_df, val_df, test_df)
)

# Encode every split with the same settings: truncation and padding enabled,
# with a maximum length of 512 tokens
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (train_texts, val_texts, test_texts)
)

In [19]:
# Part 4: Extract labels
import torch

# Target columns, in the order they appear along the last axis of each label tensor
columns = ['stars', 'useful', 'cool', 'funny']

# One label tensor per split, built from the selected columns of that split
train_labels, val_labels, test_labels = (
    torch.tensor(split[columns].values) for split in (train_df, val_df, test_df)
)

In [20]:
# Part 5: Prepare dataset for PyTorch model
from torch.utils.data import Dataset, DataLoader, Subset

# Create custom Dataset class
class YelpReviewDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their label rows.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values (anything exposing ``.items()``).
        labels: Indexable collection of per-example labels; its length defines
            the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            # Each encoding field is converted to a fresh tensor on access
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a YelpReviewDataset
train_dataset = YelpReviewDataset(encodings=train_encodings, labels=train_labels)
val_dataset = YelpReviewDataset(encodings=val_encodings, labels=val_labels)
test_dataset = YelpReviewDataset(encodings=test_encodings, labels=test_labels)

# Batch the datasets in groups of 16; only the training loader shuffles,
# so validation and test batches keep the row order of their splits
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [21]:
# Part 4: Load pre-trained BERT model
from transformers import BertForSequenceClassification

# Size the output head from the label columns (currently 4: stars, useful, cool,
# funny) so it cannot drift out of sync with the targets if `columns` changes.
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=len(columns))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Part 5: Train the model
from torch.optim import AdamW
from sklearn.metrics import mean_squared_error
import numpy as np

# Use the GPU when one is available, otherwise the CPU, and move the model there
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Mean squared error: the label columns are treated as regression targets
loss_fn = torch.nn.MSELoss()

# AdamW over all model parameters with a learning rate of 1e-5
optimizer = AdamW(params=model.parameters(), lr=1e-5)

def train_epoch(model, data_loader, loss_fn, optimizer):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask)``; the result
            must expose ``.logits``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        loss_fn: Callable applied to ``(logits, labels.float())``.
        optimizer: Optimizer whose gradients are reset before, and stepped
            after, each batch.

    Returns:
        The sum of the per-batch loss values divided by ``len(data_loader)``.

    Note:
        Batches are moved to the module-level ``device``.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        predictions = model(ids, mask).logits

        # Labels are cast to float before being compared with the logits
        batch_loss = loss_fn(predictions, targets.float())
        batch_losses.append(batch_loss.item())

        # Backpropagate and update the weights
        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / len(data_loader)

def evaluate(mode, data_loader):
    """Compute the mean squared error of a model's logits over ``data_loader``.

    Args:
        mode: The model to evaluate, called as ``mode(input_ids, attention_mask)``
            with a result exposing ``.logits``. The parameter name is kept
            unchanged so existing callers are unaffected.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.

    Returns:
        The value of ``mean_squared_error(true_labels, preds)`` computed over
        the concatenation of all batches.

    Note:
        Batches are moved to the module-level ``device``.
    """
    # Use the model that was passed in. Previously the body referred to the
    # global `model`, so the argument was silently ignored.
    net = mode
    net.eval()
    preds, true_labels = [], []
    with torch.no_grad():  # inference only: no gradients needed
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = net(input_ids, attention_mask)
            logits = outputs.logits

            # Collect per-batch results on the CPU as numpy arrays
            preds.append(logits.cpu().numpy())
            true_labels.append(labels.cpu().numpy())
    preds = np.concatenate(preds, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    # Evaluate using mean squared error
    mse = mean_squared_error(true_labels, preds)
    return mse

# Alternate one training pass and one validation pass per epoch.
# NOTE(review): the output saved under this cell logs 10 epochs, so it was
# presumably produced with a different epoch count than the 3 configured here;
# re-run the cell to refresh it.
for epoch in range(3):  # Adjust the number of epochs as necessary
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    print(f"Epoch {epoch + 1} - Training loss: {train_loss}")

    mse = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1} - Validation MSE: {mse}")

Epoch 1 - Training loss: 1.6062116950511933
Epoch 1 - Validation MSE: 1.252678357814985
Epoch 2 - Training loss: 1.3803184407711029
Epoch 2 - Validation MSE: 1.232054038737448
Epoch 3 - Training loss: 1.287151593375206
Epoch 3 - Validation MSE: 1.2381401882235157
Epoch 4 - Training loss: 1.1240907793283463
Epoch 4 - Validation MSE: 1.3301969455375364
Epoch 5 - Training loss: 0.9521248504936695
Epoch 5 - Validation MSE: 1.2597644419579592
Epoch 6 - Training loss: 0.8160863175392151
Epoch 6 - Validation MSE: 1.2916457599198472
Epoch 7 - Training loss: 0.7322426264882088
Epoch 7 - Validation MSE: 1.2670519048466398
Epoch 8 - Training loss: 0.6507497804820538
Epoch 8 - Validation MSE: 1.3040661101335322
Epoch 9 - Training loss: 0.5950958204627037
Epoch 9 - Validation MSE: 1.2808329162835115
Epoch 10 - Training loss: 0.552346251487732
Epoch 10 - Validation MSE: 1.3178448575156443


In [23]:
# Part 6: Predict the test set
def predict_on_test(model, data_loader):
    """Collect the model's logits and the matching labels for every batch.

    Args:
        model: Model called as ``model(input_ids, attention_mask)``; the result
            must expose ``.logits``.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.

    Returns:
        A ``(predictions, labels)`` tuple of numpy arrays, each the
        concatenation along axis 0 of the per-batch values.

    Note:
        Batches are moved to the module-level ``device``.
    """
    model.eval()
    pred_chunks, label_chunks = [], []

    with torch.no_grad():  # inference only: no gradients needed
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_logits = model(ids, mask).logits

            # Bring results back to the CPU before converting to numpy
            pred_chunks.append(batch_logits.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    # Stitch the per-batch arrays together along the example axis
    return np.concatenate(pred_chunks, axis=0), np.concatenate(label_chunks, axis=0)

# Run the model over the test loader.
# Note: this rebinds test_labels to the numpy array returned by predict_on_test.
test_preds, test_labels = predict_on_test(model=model, data_loader=test_loader)

# Score the predictions against the true labels
mse = mean_squared_error(y_true=test_labels, y_pred=test_preds)
print(f'Mean Squared Error on the test set: {mse}')

Mean Squared Error on the test set: 1.5290840172460103


In [24]:
# Show the first five test reviews next to their true and predicted label vectors.
# The text is read from test_df rather than raw_dataset with a hard-coded 30000
# offset, so it stays aligned with test_labels/test_preds (which follow the
# unshuffled test loader's row order) even if the split boundaries change.
for i in range(5):
    print(f"Review: {test_df['text'].iloc[i]}")
    print(f"True labels: {test_labels[i]}")
    print(f"Predicted labels: {test_preds[i]}")
    print("-" * 50)

Review: This place is fantastic. I was apprehensive (as I often am walking into bike shops) when I walked in, but quickly realized that this place was different. There were some hard core road bikers in the back talking in their own language, a few weekend warriors getting their bikes worked on and getting gear, and me: an aspiring cyclist intimidated by the prices, lingo, and other details of taking up this pastime.  I normally cringe when I walk into bike stores and feel that if I don't drop 4 grand quickly on a carbon fiber bike or show them the yellow jersey I wore when leading three stages of the Tour de France, that I'll quickly be shown the exit (or a brusque cold shoulder).

At this place, a guy came up and asked what I was looking for and how he could help. I explained my situation: I have a road bike, I want to start using it as my primary means of transportation and for recreation, and I want to get the rest of the equipment that I need. He quickly explained what equipment i