In [1]:
# Part 1: Store the dataset and subsample it
import itertools

import pandas as pd

# Only the first three chunks are consumed by this notebook (chunk_array[0],
# chunk_array[1] and chunk_array[2]), so stop reading once they are loaded
# instead of parsing every chunk of the review file into memory.
NUM_CHUNKS_NEEDED = 3
with pd.read_json("../yelp_academic_dataset_review.json", orient="records", lines=True, chunksize=40000) as reader:
    # The reader yields one 40,000-row DataFrame per iteration; islice pulls
    # at most NUM_CHUNKS_NEEDED of them and then stops.
    chunk_array = list(itertools.islice(reader, NUM_CHUNKS_NEEDED))

# The first chunk is the data the train/validation/test splits are cut from.
raw_dataset = chunk_array[0]

In [2]:
# Text dump of the first chunk (the frame the train/val/test splits are cut from).
print(raw_dataset)

                    review_id                 user_id             business_id  \
0      KU_O5udG6zpxOg-VcAEodg  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw   
1      BiTunyQ73aT9WBnpR9DZGw  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ   
2      saUsX_uimxRlCVr67Z4Jig  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A   
3      AqPFMleE6RsU23_auESxiA  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA   
4      Sx8TMOWLNuJBWer-0pcmoA  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ   
...                       ...                     ...                     ...   
39995  EYkouQA9oWuiWDNZsYl7aA  WfiilB5OXV7vSmHP-80n-A  HQ-C47_Xi5it1KzwEc0u0A   
39996  xZi6gbagKAzqCKjtDLrhGQ  BDEi5eV-uhP4A4atMNzW5w  ena3aLdMz2ym_OPVuTIJ2g   
39997  bxEjtoD74xPBJnMtV2759A  GLCcS7HGPa7MD997xq5W9w  34Eqv8jXgxg_EEwcsNgeeg   
39998  xT8DOnqIu_7N-9AnkFftaQ  bNnBwW5kNO77KTgMeVhxKg  F2C5ENuY8CXfgoW-gAMdDA   
39999  Wy7Njv1S0SaLEk9Bj-ZHPw  ZVREpaL2TPWMtUDJaUZulg  ORL4JE6tz3rJxVqkdKfegA   

       stars  useful  funny

In [3]:
# Part 2: Split dataset into train, validation, and test sets
# Row boundaries are chosen by hand: rows [0, train_size) are used for
# training, rows [train_size, val_size) for validation, and everything from
# val_size onwards for testing.  Note that val_size is the END position of the
# validation slice, not the number of validation rows.
train_size = 20000
val_size = 30000

# Cut the three consecutive positional slices out of the raw frame
train_df, val_df, test_df = (
    raw_dataset.iloc[rows]
    for rows in (
        slice(None, train_size),
        slice(train_size, val_size),
        slice(val_size, None),
    )
)

In [5]:
# Part 3: Tokenize the dataset
from transformers import BertTokenizer

# Tokenizer loaded from the same checkpoint name the model is later loaded from
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Pull the raw review strings out of each split, in train/val/test order
train_texts, val_texts, test_texts = (
    split['text'].tolist() for split in (train_df, val_df, test_df)
)

# Encode each split on its own, with truncation and padding enabled and a
# 512-token cap on sequence length.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (train_texts, val_texts, test_texts)
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Part 4: Extract labels
import torch

# The four per-review target columns, in the order they appear in a label row
columns = ['stars', 'useful', 'cool', 'funny']

# One label tensor per split: one row per review, one column per target
train_labels, val_labels, test_labels = (
    torch.tensor(split[columns].values) for split in (train_df, val_df, test_df)
)

In [7]:
# Part 5: Prepare dataset for PyTorch model
from torch.utils.data import Dataset, DataLoader, Subset

# Create custom Dataset class
class YelpReviewDataset(Dataset):
    """Map-style dataset pairing per-example encodings with their label rows.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (for instance the tokenizer output).
        labels: Indexable, sized collection with one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field as a tensor, plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            # Each field is converted to a fresh tensor on every access.
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a YelpReviewDataset
train_dataset, val_dataset, test_dataset = (
    YelpReviewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Batch the datasets six examples at a time; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=6, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset=split_dataset, batch_size=6)
    for split_dataset in (val_dataset, test_dataset)
)

In [8]:
# Part 4: Load pre-trained BERT model
from transformers import BertForSequenceClassification

# Size the classification head from `columns` (one output per target column)
# instead of repeating the literal 4, so the head cannot drift out of sync
# with the label tensors, which are built from the same list.
model = BertForSequenceClassification.from_pretrained(
    "google-bert/bert-base-uncased",
    num_labels=len(columns),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Part 5: Train the model
from torch.optim import AdamW
from sklearn.metrics import mean_squared_error
import numpy as np

# Pick the device and move the model onto it BEFORE constructing the optimizer,
# as the PyTorch optimizer documentation recommends: the optimizer should be
# built from the parameters the model will actually train with.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Define loss function (Mean Squared Error for regression)
loss_fn = torch.nn.MSELoss()

def train_epoch(model, data_loader, loss_fn, optimizer):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            result exposes a ``logits`` attribute.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, float_labels)`` returning a scalar
            tensor that supports ``backward()``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        The sum of the per-batch loss values divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` contains no batches.
    """
    model.train()

    # Send batches to whatever device the model already lives on instead of
    # relying on a notebook-global `device` variable, so the function works
    # regardless of which cells have been executed.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(model_device)
        attention_mask = batch['attention_mask'].to(model_device)
        labels = batch['labels'].to(model_device)

        outputs = model(input_ids, attention_mask)
        logits = outputs.logits

        # Labels are cast to float so they can be compared with the logits
        loss = loss_fn(logits, labels.float())
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

def evaluate(model, data_loader):
    """Score ``model`` on ``data_loader`` and return the mean squared error.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            result exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        The value of ``mean_squared_error(true_labels, preds)`` computed over
        all batches concatenated along the first axis.
    """
    model.eval()

    # Send batches to whatever device the model already lives on instead of
    # relying on a notebook-global `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    preds, true_labels = [], []
    with torch.no_grad():  # inference only: no gradients are tracked
        for batch in data_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['labels'].to(model_device)

            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            # Collect per-batch results on the CPU as NumPy arrays
            preds.append(logits.cpu().numpy())
            true_labels.append(labels.cpu().numpy())
    preds = np.concatenate(preds, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)

    # Evaluate using mean squared error
    mse = mean_squared_error(true_labels, preds)
    return mse

# Run training and evaluation for multiple epochs
# Each iteration does one full pass over train_loader, then scores the model
# on val_loader; both figures are printed with a 1-based epoch number.
for epoch in range(2): # Adjust the number of epochs as necessary
    # Mean per-batch training loss for this epoch
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer)
    print(f"Epoch {epoch + 1} - Training loss: {train_loss}")

    # Validation error after this epoch's updates (rebinds the global `mse`)
    mse = evaluate(model, val_loader)
    print(f"Epoch {epoch + 1} - Validation mse: {mse}")

Epoch 1 - Training loss: 1.1920781727962924
Epoch 1 - Validation mse: 1.2692443608091875
Epoch 2 - Training loss: 0.9796683279664469


In [10]:
# Part 6: Predict the test set
def predict_on_test(model, data_loader):
    """Run inference over ``data_loader`` and return predictions with their labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            result exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        A ``(all_preds, all_labels)`` tuple of NumPy arrays: the logits and the
        labels of every batch, concatenated along the first axis in loader order.
    """
    model.eval()

    # Send batches to whatever device the model already lives on instead of
    # relying on a notebook-global `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds = []
    all_labels = []
    with torch.no_grad():  # inference only: no gradients are tracked
        for batch in data_loader:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['labels'].to(model_device)

            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            all_preds.append(logits.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    # Convert list of predictions and labels to numpy arrays
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_preds, all_labels

# Get predictions on the test set
# NOTE: predict_on_test returns NumPy arrays, so this rebinds `test_labels`
# from the torch tensor built during label extraction to a NumPy array.
test_preds, test_labels = predict_on_test(model, test_loader)

# Error over the whole test split (rebinds the global `mse`)
mse = mean_squared_error(test_labels, test_preds)
print(f'Mean Squared Error on the test set: {mse}')

Mean Squared Error on the test set: 1.4481527309236755


In [11]:
# Show the first few test reviews next to their true and predicted labels.
# The text is read from test_df (the same rows, in the same order, that feed
# the unshuffled test_loader) rather than from raw_dataset at a hard-coded
# offset of 30000, so the printed review stays aligned with
# test_labels/test_preds even if the split boundaries change.
for i in range(min(5, len(test_preds))):
    print(f"Review: {test_df['text'].iloc[i]}")
    print(f"True labels: {test_labels[i]}")
    print(f"Predicted labels: {test_preds[i]}")
    print("-" * 50)

Review: This place is fantastic. I was apprehensive (as I often am walking into bike shops) when I walked in, but quickly realized that this place was different. There were some hard core road bikers in the back talking in their own language, a few weekend warriors getting their bikes worked on and getting gear, and me: an aspiring cyclist intimidated by the prices, lingo, and other details of taking up this pastime.  I normally cringe when I walk into bike stores and feel that if I don't drop 4 grand quickly on a carbon fiber bike or show them the yellow jersey I wore when leading three stages of the Tour de France, that I'll quickly be shown the exit (or a brusque cold shoulder).

At this place, a guy came up and asked what I was looking for and how he could help. I explained my situation: I have a road bike, I want to start using it as my primary means of transportation and for recreation, and I want to get the rest of the equipment that I need. He quickly explained what equipment i

In [12]:
# Part 7: Setup dataset for experiment 2
# Start from the second chunk, drop every review whose star rating is 1, and
# keep the first 10,000 of the remaining rows.
exp_2_df = (
    chunk_array[1]
    .loc[lambda frame: frame['stars'] != 1]
    .iloc[:10000]
)
print(exp_2_df)

                    review_id                 user_id             business_id  \
40000  zwu3HkuLQW0udgNb43e-dw  CigyryBCd5GFc01FnXvgcQ  u7_3L1NBWgxhBM_B-cmmnA   
40001  2TkFR7TG8TUzfCMP1oiwJQ  PWVL8fYmjBHqoyjt6LIcpA  vN6v8m4DO45Z4pp8yxxF_w   
40003  clSTQLuLu11UhwSPKEtToA  pTw5mqWvjJzbe23GTFnwSg  ltBBYdNzkeKdCNPDAsxwAA   
40004  kBuiwIzGu_4G3VXNfH3Bow  w4he3nb2wQI5h5rTbCaqXA  OINbC0rpDVJ5bfxt3LO9fw   
40007  xo5Qxcq-8Xi9m9NmehXBlw  2iAk0uFpg0aAyA2C1M0f7g  B6Lvq3sOYwhKxPzadDka9g   
...                       ...                     ...                     ...   
51216  xuZdKY4G2mjZL6E8Oej30w  A7io7WGZA1p0Q3_Mp9vGdw  2pXrwBssKTp30uxbQgl6kw   
51217  459NXCRcGRprImr9gYQO7A  9-FkHuGRbzfUjqPGbNv3Jg  kd0Sv_ZjnBhbMxs0U4KgQw   
51220  kv9FV-etjnysLXLUazOoGw  QEMfgXbtAU-gHIDyfocB-w  P8X2NUCEQm-YPTS3em1Kgg   
51221  XDrklSZWRW6qqu8ydTlXgA  JzQ4fIrplO-o6sjIgy1EsA  t1qF12NdW8KvCqxqbvy-Hg   
51222  OD6FA4WdW845bVrQTAwIyA  bz8HMDpRYdDmzqAKepbW5A  6bFx8j2KqPfmT0XEUS1RMg   

       stars  useful  funny

In [13]:
# Part 8: Prepare dataset
# Same pipeline as the train/val/test splits: tokenize the review text, build
# the label tensor from `columns`, then wrap both for batched inference.
exp_2_texts = exp_2_df['text'].tolist()
exp_2_encodings = tokenizer(
    exp_2_texts,
    max_length=512,
    padding=True,
    truncation=True,
)
exp_2_labels = torch.tensor(exp_2_df[columns].values)
exp_2_dataset = YelpReviewDataset(encodings=exp_2_encodings, labels=exp_2_labels)
exp_2_loader = DataLoader(dataset=exp_2_dataset, batch_size=6)

In [14]:
# Part 9: Get predictions on the experiment 2 set
# NOTE: predict_on_test returns NumPy arrays, so this rebinds `exp_2_labels`
# from the torch tensor built during preparation to a NumPy array.
exp_2_preds, exp_2_labels = predict_on_test(model, exp_2_loader)

# Error over the experiment 2 rows (rebinds the global `mse`)
mse = mean_squared_error(exp_2_labels, exp_2_preds)
print(f'Mean Squared Error on the exp_2 set: {mse}')

Mean Squared Error on the exp_2 set: 1.458718695466033


In [15]:
# Part 10: Setup dataset for experiment 3
# `count_text` and `Counter` are only referenced by the commented-out block
# below; nothing in the active code of this cell uses them.
count_text = []
from collections import Counter
# Experiment 3 starts from the third chunk.
exp_3_df = chunk_array[2]
# Review texts of the unfiltered chunk; `exp_3_texts` is reassigned later,
# after the frame has been filtered.
exp_3_texts = exp_3_df['text'].to_list()
# Disabled exploration: split every review on whitespace, count the tokens and
# print the single most frequent one.
# for text in exp_3_texts:
#     count_text = count_text + text.split()
# counter = Counter(count_text)
# most_common = counter.most_common(1)
# print(most_common)

In [16]:
# Keep only the reviews whose text does not contain the case-sensitive
# substring "the"; rows with missing text count as a match and are dropped too.
exp_3_df = exp_3_df.loc[~exp_3_df['text'].str.contains("the", case=True, na=True)]
print(exp_3_df)

                     review_id                 user_id  \
80006   2TtzJxP31jOa1ihySAWwwQ  skXKKYvpaBAnJ2vH0OytVw   
80008   74Rx2JEsXELR5QmwvDzThQ  Rvfa3lFgK0W91AvfLrGhsg   
80015   r0HqL29dDeNSP6KCJ5LJJg  8J2CZaOR1mt2UPZzoOlIGg   
80022   RLB0K2WviFdK9dbNk4_ewg  EiiAascIEQmYcWI6BQe0tg   
80026   wD4iajcnFGh0F--xsf7eeA  TRzbT9POo8TswwyTigIQDQ   
...                        ...                     ...   
119956  gGcu1WJ3B7iK2kH2q7C_Fg  mJatJshTyF_hpWgYd2dY3A   
119959  _F-VXZc8L_TPccBs0S_kaQ  TZwKmfmbzevOtOoZhlZVxA   
119971  zgC0VCvpBfJzXCVZCRuRbw  BCpRqfCzNr4rJy00t3OOCw   
119982  fmKgMjWGpSFKKDl5ZqoKsw  3MiBisjFH9mNKUgzA0x_RA   
119999  IJzJrH-WrBsyVJzveIKOVw  28H58l9RTUCxRntOTwe0aQ   

                   business_id  stars  useful  funny  cool  \
80006   ugSj0rWlWQ57-FuKuieGXA      2       0      0     0   
80008   JvawJ9bSr22xn4R9oLvl_w      5       0      0     0   
80015   -wB5H63ERJ9S0oCp_ULR0Q      5       1      0     0   
80022   -mq1DwgcLU96PQbmcv3jRQ      4       1      0   

In [17]:
# Prepare the filtered experiment 3 frame for inference: tokenize the review
# text, build the label tensor from `columns`, then wrap both in a loader.
exp_3_texts = exp_3_df['text'].tolist()
exp_3_encodings = tokenizer(
    exp_3_texts,
    max_length=512,
    padding=True,
    truncation=True,
)
exp_3_labels = torch.tensor(exp_3_df[columns].values)
exp_3_dataset = YelpReviewDataset(encodings=exp_3_encodings, labels=exp_3_labels)
exp_3_loader = DataLoader(dataset=exp_3_dataset, batch_size=6)

In [18]:
# Get predictions on the experiment 3 set.
# NOTE: predict_on_test returns NumPy arrays, so this rebinds `exp_3_labels`
# from the torch tensor built during preparation to a NumPy array.
exp_3_preds, exp_3_labels = predict_on_test(model, exp_3_loader)

# Error over the experiment 3 rows (rebinds the global `mse`)
mse = mean_squared_error(exp_3_labels, exp_3_preds)
print(f'Mean Squared Error on the exp_3 set: {mse}')

Mean Squared Error on the exp_3 set: 0.668613642903321


In [19]:
import pickle
# Serialize the whole in-memory model object to BERT.pkl in binary mode.
# NOTE(review): loading a pickle file can execute arbitrary code, so BERT.pkl
# must only ever be unpickled from a trusted location. Consider whether saving
# just the weights would be a safer, more portable format — confirm with
# whatever consumes this file before changing it.
with open('BERT.pkl', 'wb') as file:
    pickle.dump(model, file)

In [4]:
# Write the held-out test split to bert_test_set.json using the "index" orient.
# NOTE(review): this cell's execution count (4) is lower than those of the
# cells placed before it, i.e. it was run out of document order. It only needs
# test_df, which the split cell defines.
test_df.to_json("bert_test_set.json", orient="index")