# Model Trainer
### Finetunes an NLP Classifier on the COVID-19 Fake News dataset, saving the best-performing model

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from transformers import AdamW

from tqdm.notebook import tqdm, trange
import pandas as pd
import numpy as np

In [3]:
from data_utils import *
from model_utils import *

In [4]:
# File Constants
DATA_FOLDER = "compiled_data"
MODEL_FOLDER = "models"

TRAIN_DATASET_FILENAME = "combined_labeled_data_train.csv"
MODEL_OUTPUT_FILENAME = "BERT_limited.ckpt"


TRAIN_DATASET_PATH = f"{DATA_FOLDER}/{TRAIN_DATASET_FILENAME}"
MODEL_OUTPUT_PATH = f"{MODEL_FOLDER}/{MODEL_OUTPUT_FILENAME}"

In [10]:
# Training Constants
VAL_DATA_PROPORTION = 0.4
BATCH_SIZE = 32
EPOCHS = 15
LR = 2e-5
WEIGHT_DECAY = 1e-2
NO_DECAY_PARAMS = ['bias', 'gamma', 'beta']

In [11]:
# Collect training data
data = pd.read_csv(TRAIN_DATASET_PATH)

# Shuffle and split into train and val
data = data.sample(frac=1).reset_index(drop=True)
val_end_index = int(len(data) * VAL_DATA_PROPORTION)
val_data = data[:val_end_index]
train_data = data[val_end_index:]

In [14]:
# Load pretrained model
model = NLPHandler(model_savename=None)

# Optimizer Setup
param_optimizer = list(model.classifier.named_parameters())
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in NO_DECAY_PARAMS)],
     'weight_decay_rate': WEIGHT_DECAY},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in NO_DECAY_PARAMS)],
     'weight_decay_rate': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
# Training Loop - save model with best val accuracy

best_val_acc = None

for i in trange(EPOCHS, desc="Epoch"):
    
    model.classifier.train()
    
    total_train_loss = 0
    
    batch_start_index = 0
    pbar = tqdm(initial=0, total=len(train_data), desc="Training", leave=False)
    while batch_start_index < len(train_data):
        batch = train_data[batch_start_index : batch_start_index + BATCH_SIZE]
        
        loss = model.loss(batch["tweet_text"].tolist(), batch["tweet_label"].values)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item() * len(batch)
        
        batch_start_index += len(batch)
        pbar.update(len(batch))
        pbar.set_postfix({"Epoch Train Loss": total_train_loss / batch_start_index})
        
    model.classifier.eval()
    
    total_val_acc = 0
    
    batch_start_index = 0
    pbar = tqdm(initial=0, total=len(val_data), desc="Validating", leave=False)
    while batch_start_index < len(val_data):
        batch = val_data[batch_start_index : batch_start_index + BATCH_SIZE]
        
        with torch.no_grad():
            logits = model.classify(batch["tweet_text"].tolist())
        labels = np.array(batch["tweet_label"].values)
        
        total_val_acc += (np.argmax(logits, axis=1) == labels).sum()
        
        batch_start_index += len(batch)
        pbar.update(len(batch))
        
    epoch_train_loss = total_train_loss / len(train_data)
    epoch_val_acc = total_val_acc / len(val_data)
    tqdm.write(f"Epoch {i}: Train Loss = {epoch_train_loss:.5f}, Val Accuracy = {epoch_val_acc:.5f}")
    
    if best_val_acc is None or epoch_val_acc > best_val_acc:
        tqdm.write("Saving Model")
        torch.save(model.classifier.state_dict(), MODEL_OUTPUT_PATH)
        best_val_acc = epoch_val_acc

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 0: Train Loss = 0.26384, Val Accuracy = 0.95608
Saving Model


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 1: Train Loss = 0.09860, Val Accuracy = 0.94682


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 2: Train Loss = 0.04793, Val Accuracy = 0.95679
Saving Model


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 3: Train Loss = 0.02085, Val Accuracy = 0.95418


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 4: Train Loss = 0.01227, Val Accuracy = 0.96534
Saving Model


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 5: Train Loss = 0.00566, Val Accuracy = 0.94611


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 6: Train Loss = 0.01297, Val Accuracy = 0.96225


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 7: Train Loss = 0.00823, Val Accuracy = 0.96439


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 8: Train Loss = 0.01158, Val Accuracy = 0.95655


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 9: Train Loss = 0.00910, Val Accuracy = 0.95964


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 10: Train Loss = 0.00171, Val Accuracy = 0.96391


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 11: Train Loss = 0.00031, Val Accuracy = 0.96368


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 12: Train Loss = 0.01083, Val Accuracy = 0.96486


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 13: Train Loss = 0.00935, Val Accuracy = 0.95845


Training:   0%|          | 0/6319 [00:00<?, ?it/s]

Validating:   0%|          | 0/4212 [00:00<?, ?it/s]

Epoch 14: Train Loss = 0.00214, Val Accuracy = 0.96439


In [9]:
# OLD TRAINING SCRIPT

'''
# Tokenizer Setup
MAX_TEXT_LEN = 128
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def Encode_TextWithAttention(sentence, tokenizer, maxlen, padding_type='max_length', attention_mask_flag=True):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

def Encode_TextWithoutAttention(sentence, tokenizer, maxlen, padding_type='max_length', attention_mask_flag=False):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids']

def get_TokenizedTextWithAttentionMask(sentenceList, tokenizer):
    token_ids_list, attention_mask_list = [], []
    for sentence in sentenceList:
        token_ids, attention_mask = Encode_TextWithAttention(sentence, tokenizer, MAX_TEXT_LEN)
        token_ids_list.append(token_ids)
        attention_mask_list.append(attention_mask)
    return token_ids_list, attention_mask_list

def get_TokenizedText(sentenceList, tokenizer):
    token_ids_list = []
    for sentence in sentenceList:
        token_ids = Encode_TextWithoutAttention(sentence, tokenizer, MAX_TEXT_LEN)
        token_ids_list.append(token_ids)
    return token_ids_list



# Data preprocess and splitting
VAL_FRACTION = 0.2
labeled_data = pd.read_csv("compiled_data/combined_labeled_data.csv")
sentences, labels = labeled_data["tweet_text"].values, labeled_data["tweet_label"].values
N = len(labels)
random_order = np.random.permutation(N)
train_sentences = sentences[random_order[int(N * VAL_FRACTION):]]
train_labels = labels[random_order[int(N * VAL_FRACTION):]]
val_sentences = sentences[random_order[:int(N * VAL_FRACTION)]]
val_labels = labels[random_order[:int(N * VAL_FRACTION)]]



# Create Tensor Datasets
train_token_ids, train_attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(train_sentences,tokenizer))
val_token_ids, val_attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(val_sentences,tokenizer))

train_labels = torch.tensor(train_labels).type(torch.LongTensor)
val_labels = torch.tensor(val_labels).type(torch.LongTensor)

train_data = TensorDataset(train_token_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(val_token_ids, val_attention_masks, val_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)



# Load Model and Setup Optimizer

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)



# Training Loop

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

train_loss_set = []
best_val_accuracy = None

for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    tr_loss_sum = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        print(b_input_ids.shape, b_labels.shape)
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item())    
        loss.backward()
        optimizer.step()

        tr_loss_sum += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss_sum / nb_tr_steps))

    model.eval()

    val_accuracy_sum = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
          logits = output[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        val_accuracy_sum += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    val_accuracy = val_accuracy_sum / nb_eval_steps
    print(f"Validation Accuracy: {val_accuracy}")
    if(best_val_accuracy is None or val_accuracy >= best_val_accuracy):
        torch.save(model.state_dict(), f"{MODEL_FOLDER}/BERT_extradata.ckpt")
        best_val_accuracy = val_accuracy
        print('Model Saved')


'''

'\n# Tokenizer Setup\nMAX_TEXT_LEN = 128\ntokenizer = BertTokenizer.from_pretrained(\'bert-base-uncased\', do_lower_case=True)\n\ndef Encode_TextWithAttention(sentence, tokenizer, maxlen, padding_type=\'max_length\', attention_mask_flag=True):\n    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)\n    return encoded_dict[\'input_ids\'], encoded_dict[\'attention_mask\']\n\ndef Encode_TextWithoutAttention(sentence, tokenizer, maxlen, padding_type=\'max_length\', attention_mask_flag=False):\n    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)\n    return encoded_dict[\'input_ids\']\n\ndef get_TokenizedTextWithAttentionMask(sentenceList, tokenizer):\n    token_ids_list, attention_mask_list = [], []\n    for sentence in sentenceList:\n        tok