# Developing a sentiment classifier by fine-tuning the pretrained BERT-base model

In [1]:
import pandas as pd
import numpy as np

# Mount Google Drive at /content/gdrive (Colab-specific); the dataset is read from this mount later
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Use the GPU as the torch device if one is available, otherwise fall back to the CPU

In [2]:
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if gpu_available:
  print('Using GPU:', torch.cuda.get_device_name(0))
else:
  print('Using CPU')

Using GPU: Tesla T4


# Load the imdb-reviews.csv file into a pandas DataFrame

In [3]:
# The file is tab-separated despite its .csv extension
reviews_path = "/content/gdrive/MyDrive/Artificial_Intelligence2/imdb-reviews.csv"
df = pd.read_csv(reviews_path, sep='\t')
print(df)

                                                    url  rating  \
0      http://www.imdb.com/title/tt0120623/usercomments    10.0   
1      http://www.imdb.com/title/tt0043117/usercomments     9.0   
2      http://www.imdb.com/title/tt0043117/usercomments    10.0   
3      http://www.imdb.com/title/tt0835204/usercomments     4.0   
4      http://www.imdb.com/title/tt0499603/usercomments    10.0   
...                                                 ...     ...   
45003  http://www.imdb.com/title/tt0449000/usercomments     1.0   
45004  http://www.imdb.com/title/tt0109382/usercomments     1.0   
45005  http://www.imdb.com/title/tt0375560/usercomments     1.0   
45006  http://www.imdb.com/title/tt0165107/usercomments     1.0   
45007  http://www.imdb.com/title/tt0041513/usercomments    10.0   

                                                  review  
0      I thought this was a quiet good movie. It was ...  
1      Wagon Master is a very unique film amongst Joh...  
2      This film h

# Preprocess/cleanse data


In [4]:
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus and keep the English list;
# preprocess() filters review tokens against `stop`
nltk.download('stopwords')
stop = stopwords.words('english')

def preprocess(df):
  """Clean the raw reviews frame and return a new frame without the `url` column.

  Steps applied to `review`, in order: lowercase, replace "<br />" with a space,
  strip URLs (http... / www...), strip @mentions, drop non-ASCII characters,
  strip punctuation, drop tokens found in the module-level `stop` list.

  Note: every remaining column is cast to `str` (so `rating` comes back as
  strings such as '10.0'). The caller's frame is not modified.
  """
  df = df.drop('url', axis = 1)

  review = df['review'].str.lower()
  review = review.str.replace("<br />", " ", regex=False)
  # regex=True must be explicit: recent pandas treats the pattern as a literal by default,
  # which silently turns these substitutions into no-ops
  review = review.str.replace(r'http\S+', '', regex=True)          # Remove urls
  review = review.str.replace(r'www\S+', '', regex=True)
  review = review.str.replace(r'@[A-Za-z0-9_]+', '', regex=True)   # Remove mentions
  df['review'] = review

  # Encoding to ASCII with errors ignored drops emojis and any other non-ASCII character
  df = df.astype(str).apply(lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii'))

  review = df['review'].str.replace(r'[^\w\s]', '', regex=True)    # Remove punctuation

  # Set membership keeps the stop-word filter O(1) per token
  stop_words = set(stop)
  df['review'] = review.apply(lambda text: ' '.join(word for word in text.split() if word not in stop_words))
  return df

# Run the cleaning pipeline; the cleaned frame (rating + review) is kept as `df1`
df1 = preprocess(df)
print(df1)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df['review'] = df['review'].str.replace('@[A-Za-z0-9_]+','')                                              # Remove mentions
  df['review'] = df['review'].str.replace('[^\w\s]','')                                                     # Remove punctuation


      rating                                             review
0       10.0  thought quiet good movie fun watch liked best ...
1        9.0  wagon master unique film amongst john fords wo...
2       10.0  film near perfect film john ford made film mag...
3        4.0  gave 4 stars lot interesting themes many alrea...
4       10.0  movie really genuine random really hard find m...
...      ...                                                ...
45003    1.0  dont even know begin worth typing review quote...
45004    1.0  one worst movies saw 90s id often use benchmar...
45005    1.0  baldwin really stooped low make movies script ...
45006    1.0  liked watching mel gibson million dollar hotel...
45007   10.0  easily best cinematic version william faulkner...

[45008 rows x 2 columns]


In [5]:
# Count every token across all cleaned reviews and keep the five most frequent
all_tokens = ' '.join(df1['review']).split()
token_counts = pd.Series(all_tokens).value_counts()
most_used_words = token_counts[:5]
print(most_used_words)

movie    79033
film     69432
one      46646
like     35623
good     26234
dtype: int64


# Hold the reviews in Series X and the sentiment value (0 or 1) in DataFrame Y

The review rating is replaced with a sentiment label: 1 for positive (rating >= 7), 0 for negative (rating <= 4)

In [6]:
# Reviews as a Series of cleaned text
X = df1['review']

# Parse the rating to numbers and threshold it explicitly instead of matching string
# literals with replace(): this always yields an integer column and does not depend on
# how the rating happens to be formatted
ratings = df1['rating'].astype(float)
is_positive = ratings >= 7
is_negative = ratings <= 4

# Ratings between the two ranges (or missing) have no defined sentiment; fail here with a
# clear message rather than later when the labels are converted to a tensor
assert (is_positive | is_negative).all(), "Found ratings outside the <=4 / >=7 ranges"

# Single-column DataFrame named 'rating': 1 = positive, 0 = negative
Y = is_positive.astype('int64').to_frame()

print(X)
print(Y)

0        thought quiet good movie fun watch liked best ...
1        wagon master unique film amongst john fords wo...
2        film near perfect film john ford made film mag...
3        gave 4 stars lot interesting themes many alrea...
4        movie really genuine random really hard find m...
                               ...                        
45003    dont even know begin worth typing review quote...
45004    one worst movies saw 90s id often use benchmar...
45005    baldwin really stooped low make movies script ...
45006    liked watching mel gibson million dollar hotel...
45007    easily best cinematic version william faulkner...
Name: review, Length: 45008, dtype: object
       rating
0           1
1           1
2           1
3           0
4           1
...       ...
45003       0
45004       0
45005       0
45006       0
45007       1

[45008 rows x 1 columns]


In [7]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import and load tokenizer

In [8]:
from transformers import BertTokenizer
import torch

# Load the tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint name the model is loaded from)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize all reviews and map the tokens to their word IDs

In [9]:
# Tokenize every review in one batched call instead of a per-review encode_plus loop.
# With padding/truncation to max_length the result is already a pair of
# (n_reviews, 128) tensors, so no torch.cat is needed afterwards.
encoded = tokenizer(
    list(X),
    add_special_tokens = True,          # Add [CLS] and [SEP] tokens
    padding = 'max_length',             # Pad and truncate all reviews
    truncation = True,
    max_length = 128,                   # to max_length (Tried 64 and 128)
    return_attention_mask = True,       # Construct attention masks
    return_tensors = 'pt',              # Return pytorch tensors
)

X_mapped = encoded['input_ids']
att_masks = encoded['attention_mask']

# Convert the labels to a tensor; the guard keeps this cell re-runnable,
# because Y is rebound from a DataFrame to a tensor here
if not torch.is_tensor(Y):
  Y = torch.tensor(Y.values)

# Check that we now have mapped ids for each token of every review stored in X_mapped (by printing the second review before and after tokenization)
print('Review after preprocessing:', X[1])
print('Mapped token IDs:', X_mapped[1])
print('Sentiment (Pos = 1, Neg = 0):', Y[1])

Review after preprocessing: wagon master unique film amongst john fords work mainly one based story written john ford story elaborated frank nugent directors son patrick ford turned screenplay directors personal opinion regarding wagon master film john ford called one came closest wanted achieve say say little ford confessed lindsay anderson favourite nonetheless darling clementine wagon master ingredients one might expect find john fords film wonderful cast delivering best thou featuring major stars except fordian actors ben johnson peculiar small characters provide obligatory comic relief wagon master quite horn blowing sister ledyard jane darwell shot inspired gigs last least legendary monument valley john fords fifth passage stagecoach darling clementine fort apache wore yellow ribbon film starts two friends cowboys travis blue ben johnson sandy owens harry carey jr hired wagon masters guides caravan mormon settlers headed silver valley place thats like promised land way joined pec

# Divide the dataset into an 85-15 split, so that we now have about 38.3k mapped reviews for training and 6.8k mapped reviews for validation

In [10]:
from torch.utils.data import TensorDataset, random_split

# Combine token ids, attention masks and labels into a single TensorDataset
dataset = TensorDataset(X_mapped, att_masks, Y)

# 85% of the samples go to training, the remainder to validation
train_size = int(0.85 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
# A seeded generator makes the split (and therefore the reported scores) reproducible across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

print("Samples for training:", train_size)
print("Samples for validation:", val_size)

Samples for training: 38256
Samples for validation: 6752


# Creating DataLoaders for our training and validation sets

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# BERT fine-tuning guidance recommends 16 or 32 samples per batch
batch_size = 32

# Training batches are drawn in random order
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, sampler = train_sampler)

# Batch order is irrelevant for validation, so iterate sequentially
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, batch_size = batch_size, sampler = val_sampler)

# Preparation for training: choosing bert model, number of epochs and optimizer

In [12]:
from transformers import BertForSequenceClassification, BertConfig
# AdamW is bound from torch: the optimizer actually used below is torch's, and importing
# the name from transformers fails on recent releases where that class no longer exists
from torch.optim import AdamW

# Using BertForSequenceClassification, which is the pretrained BERT model with one linear classification layer on top
# Setting the number of output labels: 2 for binary classification.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

# Move the model to the device selected earlier (GPU when available, otherwise CPU),
# the same device the training loop copies its batches to
model.to(device)

# Tried recommended 2, 3 and 4 epochs
# However my model seems to overfit for 3 or more epochs, so:
epochs = 2

# Tried recommended learning rates: 5e-5, 3e-5, 2e-5
optimizer = AdamW(model.parameters(), lr = 3e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Train the model and then evaluate it after each epoch using f1 score, precision and recall

Also printing the average per-batch loss for each epoch on the training set and on the validation set, to make cases of overfitting easy to spot

In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import time
import datetime

# Turn a duration given in seconds into a string formatted as h:mm:ss
def format_time(elapsed):
  """Return `elapsed` (seconds) rounded to the nearest second, rendered via datetime.timedelta."""
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

# Measure the total training time for the whole run.
total_time = time.time()

for ep in range(0, epochs):
  print("\nEpoch", ep + 1, ". . .")

  t = time.time()
  total_train_loss = 0

  # ------- TRAINING ---------
  # Put model in training mode
  model.train()
  for step, batch in enumerate(train_dataloader):

    # Print progress every 200 batches
    if step % 200 == 0 and step != 0:
      elapsed_t = format_time(time.time() - t)
      print('  Batch {:>4,}  of  {:>4,} -- Time passed: {:}.'.format(step, len(train_dataloader), elapsed_t))

    # Unpack this training batch (token ids, attention mask, labels) and copy each tensor to the device
    batch_X_mapped, batch_input_mask, batch_labels = (tensor.to(device) for tensor in batch)

    # Clear gradients left over from the previous step
    model.zero_grad()

    # Passing labels makes the model return the loss as the first output
    output = model(batch_X_mapped, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
    loss = output[0]

    total_train_loss += loss.item()

    # backpropagation - compute gradients
    loss.backward()

    # Clip gradient norm to 1.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # apply gradients
    optimizer.step()

  # Calculate the average loss of all batches for this epoch
  avg_train_loss = total_train_loss / len(train_dataloader)

  training_time = format_time(time.time() - t)
  print("\n  Training time for this epoch: {:}".format(training_time))
  print("  Average Training Loss: {0:.2f}".format(avg_train_loss))

  # ------- EVALUATION ---------
  # Put model in evaluation mode
  model.eval()
  total_eval_loss = 0

  predictions, actual_labels = [], []

  for batch in validation_dataloader:
    # Unpack the inputs from our dataloader and copy each tensor to the device
    batch_X_mapped, batch_input_mask, batch_labels = (tensor.to(device) for tensor in batch)

    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
      output = model(batch_X_mapped, token_type_ids=None, attention_mask=batch_input_mask, labels = batch_labels)

    loss = output[0]
    logits = output[1]
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    batch_labels = batch_labels.to('cpu').numpy()

    # Store predictions and actual labels
    predictions.append(logits)
    actual_labels.append(batch_labels)

  # Stack the per-batch arrays into one array each; labels are flattened to 1-D
  flat_pred = np.concatenate(predictions, axis=0)
  flat_labels = np.concatenate(actual_labels, axis=0).ravel()

  # The predictions are a 2-column ndarray. Take the index of the highest logit per row to get a 0/1 label
  pred_labels = np.argmax(flat_pred, axis=1)

  # Calculate and print validation scores
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  print("  Validation Loss: {0:.4f}".format(avg_val_loss))
  print("\n  VALIDATION SCORES:\n")
  f1_val = f1_score(flat_labels, pred_labels)
  print("  F1 SCORE: {0:.4f}".format(f1_val))
  pscore = precision_score(flat_labels, pred_labels)
  print("  PRECISION: {0:.4f}".format(pscore))
  rscore = recall_score(flat_labels, pred_labels)
  print("  RECALL: {0:.4f}".format(rscore))

print("\nTotal time {:}".format(format_time(time.time()-total_time)))



Epoch 1 . . .
  Batch  200  of  1,196 -- Time passed: 0:02:11.
  Batch  400  of  1,196 -- Time passed: 0:04:22.
  Batch  600  of  1,196 -- Time passed: 0:06:32.
  Batch  800  of  1,196 -- Time passed: 0:08:43.
  Batch 1,000  of  1,196 -- Time passed: 0:10:54.

  Training time for this epoch: 0:13:01
  Average Training Loss: 0.32
  Validation Loss: 0.2435

  VALIDATION SCORES:

  F1 SCORE: 0.9034
  PRECISION: 0.8921
  RECALL: 0.9151

Epoch 2 . . .
  Batch  200  of  1,196 -- Time passed: 0:02:11.
  Batch  400  of  1,196 -- Time passed: 0:04:21.
  Batch  600  of  1,196 -- Time passed: 0:06:32.
  Batch  800  of  1,196 -- Time passed: 0:08:43.
  Batch 1,000  of  1,196 -- Time passed: 0:10:53.

  Training time for this epoch: 0:13:01
  Average Training Loss: 0.19
  Validation Loss: 0.2600

  VALIDATION SCORES:

  F1 SCORE: 0.9068
  PRECISION: 0.8871
  RECALL: 0.9274

Total time 0:27:30
