# Model training and evaluation of a BERT-based model

## Setting up environment

In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from apex import amp

In [2]:
### Getting GPU type
# The device name is queried only after CUDA has been confirmed to be available:
# torch.cuda.get_device_name raises on a machine without a usable GPU, which
# previously made the 'GPU is not available' branch unreachable.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print('Is available')
else:
    print('GPU is not available')

Tesla T4
Is available


In [3]:
# The repository root (media-bias-prediction) is taken to be the parent of the current
# working directory; if that does not hold, assign the absolute repository path to
# repo_path manually.
notebook_dir = os.getcwd()
repo_path = os.path.dirname(notebook_dir)
# Work from the deep learning models directory from here on
deep_learning_models_dir = os.path.join(repo_path, 'deep_learning_models')
os.chdir(deep_learning_models_dir)

## Choose experiment parameters

In [5]:
### Remove news aggregators from the dataset
remove_aggregators = True

### Remove tabloid articles from the dataset
remove_tabloids = True

### Use the training data from which frequent sentences have been removed
remove_duplicates = True

### Choose whether one of two groups of sources is removed from the training data
### (small and large, each containing one source per bias category).
### None keeps all sources; 'small' or 'large' removes the respective group.
remove_source_group = None # 'small' # 'large' 

### Apply the cost-sensitive loss (otherwise standard cross entropy is used)
cost_sensitive = False 

**The experiment is run at the end of the notebook. There, the run number and the name of the experiment need to be set.**

## Loading and preparing data

In [6]:
# Load every tensor through an absolute path instead of temporarily changing the
# working directory: if one file is missing, the notebook is no longer left stranded
# inside the data directory.
allsides_data_dir = os.path.join(repo_path, 'data_preparation', 'allsides_data')

def load_allsides_tensor(file_name, directory=allsides_data_dir):
    '''Loads one tensor file saved with torch.save from the allsides data directory.'''
    return torch.load(os.path.join(directory, file_name))

bias_train = load_allsides_tensor('allsides_bias_train.pt')
bias_val = load_allsides_tensor('allsides_bias_val.pt')
bias_test = load_allsides_tensor('allsides_bias_test.pt')

# Only the training split has a variant with duplicates removed
if remove_duplicates:
    train_prefix = 'allsides_duplicates_removed_contents'
else:
    train_prefix = 'allsides_contents'
text_train = load_allsides_tensor(f'{train_prefix}_text_train.pt')
mask_train = load_allsides_tensor(f'{train_prefix}_mask_train.pt')

text_val = load_allsides_tensor('allsides_contents_text_val.pt')
mask_val = load_allsides_tensor('allsides_contents_mask_val.pt')

text_test = load_allsides_tensor('allsides_contents_text_test.pt')
mask_test = load_allsides_tensor('allsides_contents_mask_test.pt')

# make sure the working directory is the deep learning models directory (to decrease risk of path breaks)
os.chdir(os.path.join(repo_path, 'deep_learning_models'))

In [7]:
### removing news aggregators, tabloids, and wrongly labeled source from tensors
# Source names are read through absolute paths so the working directory never has to
# point at the data directory.
allsides_source_dir = os.path.join(repo_path, 'data_preparation', 'allsides_data')

allsides_source_train = np.load(os.path.join(allsides_source_dir, 'allsides_source_train.npy'), allow_pickle=True).flatten()
allsides_source_val = np.load(os.path.join(allsides_source_dir, 'allsides_source_val.npy'), allow_pickle=True).flatten()
allsides_source_test = np.load(os.path.join(allsides_source_dir, 'allsides_source_test.npy'), allow_pickle=True).flatten()

# The boolean masks built below are applied along the first dimension of the tensors,
# so every tensor must still have one row per source entry. A mismatch means the
# tensors were already filtered (e.g. this cell was run twice).
for split_name, split_sources, split_tensors in (
        ('train', allsides_source_train, (bias_train, text_train, mask_train)),
        ('val', allsides_source_val, (bias_val, text_val, mask_val)),
        ('test', allsides_source_test, (bias_test, text_test, mask_test))):
    for split_tensor in split_tensors:
        if len(split_tensor) != len(split_sources):
            raise RuntimeError(f'{split_name} tensors and source array differ in length '
                               '- re-run the data loading cell before this cell')

# sources to be removed:
wrongly_labeled = ['RightWingWatch']
news_aggregators = ['Drudge Report', 'Real Clear Politics', 'Yahoo News']
tabloids = ['New York Daily News', 'Daily Mail', 'New York Post']

### choosing sources to remove
# Copy the list: binding unwanted_sources directly to wrongly_labeled and extending it
# with += would silently extend wrongly_labeled as well.
unwanted_sources = list(wrongly_labeled)
if remove_aggregators:
    unwanted_sources += news_aggregators
if remove_tabloids:
    unwanted_sources += tabloids

# boolean arrays marking the articles that come from unwanted sources
boolean_array_train = np.isin(allsides_source_train, unwanted_sources)
boolean_array_val = np.isin(allsides_source_val, unwanted_sources)
boolean_array_test = np.isin(allsides_source_test, unwanted_sources)

# inverted arrays mark the articles to keep
inverted_boolean_array_train = np.invert(boolean_array_train)
inverted_boolean_array_val = np.invert(boolean_array_val)
inverted_boolean_array_test = np.invert(boolean_array_test)

# bias
bias_train = bias_train[inverted_boolean_array_train]
bias_val = bias_val[inverted_boolean_array_val]
bias_test = bias_test[inverted_boolean_array_test]

# text and masks
text_train = text_train[inverted_boolean_array_train]
text_val = text_val[inverted_boolean_array_val]
text_test = text_test[inverted_boolean_array_test]
mask_train = mask_train[inverted_boolean_array_train]
mask_val = mask_val[inverted_boolean_array_val]
mask_test = mask_test[inverted_boolean_array_test]

# sources
allsides_source_train = allsides_source_train[inverted_boolean_array_train]
allsides_source_val = allsides_source_val[inverted_boolean_array_val]
allsides_source_test = allsides_source_test[inverted_boolean_array_test]

os.chdir(os.path.join(repo_path, 'deep_learning_models'))

In [8]:
### removing one source per category for robustness testing

# the two candidate groups of sources:
large_group = ['Reuters', 'ABC News', 'Fox News', 'Politicus USA', 'CNS News']
small_group = ['FiveThirtyEight', 'Washington Monthly', 'The Washington Examiner',
               'Daily Kos', 'FrontPage Magazine']

# select the group named by remove_source_group; any other value removes nothing
group_removed = (
    large_group if remove_source_group == 'large'
    else small_group if remove_source_group == 'small'
    else None
)

if group_removed:
    # mark the training articles that belong to a removed source ...
    boolean_array_train = np.isin(allsides_source_train, group_removed)
    # ... and keep everything else
    inverted_boolean_array_train = ~boolean_array_train

    # bias, text and masks of the training split
    bias_train, text_train, mask_train = (
        tensor[inverted_boolean_array_train]
        for tensor in (bias_train, text_train, mask_train)
    )

    # sources of the training split
    allsides_source_train = allsides_source_train[inverted_boolean_array_train]


In [9]:
# Bundle text, mask and bias tensors of each split into one dataset
train_set, val_set, test_set = (
    TensorDataset(split_text, split_mask, split_bias)
    for split_text, split_mask, split_bias in (
        (text_train, mask_train, bias_train),
        (text_val, mask_val, bias_val),
        (text_test, mask_test, bias_test),
    )
)

## Model class

In [10]:
class Model(nn.Module):
    '''
    Classifier that puts a dropout + linear classification layer on top of a BERT module.

    Parameters:
        hidden_size: size of the BERT output vectors (input size of the classifier layer)
        num_labels: number of classes to predict
        droput_prob: dropout probability applied before the classifier layer
                     (the parameter keeps its original spelling so existing callers still work)
        bert_model_module: module called as bert(input_ids=..., attention_mask=...) whose output
                           is indexed with [0] (per-token outputs) and [1] (pooled output)
        output_attentions: if True, forward additionally returns bert_out[2]
        pooled_output: if True, the pooled BERT output is classified; otherwise the average of
                       the last layer over all unmasked tokens is passed through linear + tanh
    '''
    def __init__(self, hidden_size, num_labels, droput_prob, bert_model_module, output_attentions=False, pooled_output = True):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.output_attentions = output_attentions
        self.pooled_output = pooled_output

        self.bert = bert_model_module
        # Use the constructor argument; previously a global variable `dropout_prob` was
        # read here and the argument was ignored.
        self.dropout = nn.Dropout(droput_prob)
        self.linear = nn.Linear(hidden_size,hidden_size)
        self.tanh = nn.Tanh()

        self.classifier_layer = nn.Linear(hidden_size, num_labels) # The values are initialized from U(−sqrt(k),sqrt(k)), where k=1/in_features

    def forward(self, text, mask):
        '''
        Returns the logits with shape (batch_size, num_labels); if output_attentions is set,
        returns the tuple (logits, bert_out[2]).
        '''
        # token_type_ids and position_ids are created automatically
        bert_out = self.bert(input_ids = text, attention_mask = mask)

        if self.pooled_output:
            ### Pooled Output
            pooled_out = self.dropout(bert_out[1])
            out = self.classifier_layer(pooled_out)
        else:
            ### Last Layer average
            hidden_states = bert_out[0]
            # Zero out the outputs of masked (padding) tokens so that they do not leak into
            # the sum; the sum is then divided by the number of unmasked tokens.
            token_mask = mask.unsqueeze(-1).to(hidden_states.dtype)
            # clamp avoids a division by zero for a sequence that is masked completely
            token_count = token_mask.sum(dim=1).clamp(min=1)
            # resulting in tensor with shape (batch_size,hidden_size)
            last_layer = (hidden_states * token_mask).sum(dim=1) / token_count
            last_layer = self.tanh(self.linear(last_layer))
            last_layer = self.dropout(last_layer)
            out = self.classifier_layer(last_layer)

        # Returning attention layer outputs as well if set True
        if self.output_attentions:
            out = out, bert_out[2]

        return out

## Train Function

In [11]:
##### Function for training of 1 epoch

def train_fct(train_set, batch_size, return_mse=False, batch_feedback = 1000, first_check = 100, mixed_precision = False, save_memory_usage = False):
    start_time = time.time()
    # Setting model to train mode (so dropout is applied)
    model.train()
    # creating iterable dataset devided into batches and shuffled
    data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # tracking batches, loss, accuracy
    total_batch_count = int(len(train_set)/batch_size)
    batch_counter = 0
    train_loss = 0
    train_correctly_specified = 0
    train_predicted_values = []
    train_true_values = []
    
    # Tracking memory usage
    if save_memory_usage:
        ! nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -f memory_usage.csv 

    # looping over batches
    for text, mask, label in data:
        # sending tensors to GPU
        text, mask, label = text.to(device), mask.to(device), label.to(device)
        # clearing gradients
        optimizer.zero_grad()

        logits = model(text, mask)
        # calculating loss
        loss = loss_fct(logits, label)

        predictions = logits.argmax(1)
        # backpropagation
        if mixed_precision:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # updating weights
        optimizer.step()
        # loss and metrices messures
        train_loss += loss.item()
        train_correctly_specified += (predictions == label).sum().item()
        
        train_predicted_values.append(predictions)
        train_true_values.append(label)
        
        batch_counter += 1

        if (batch_counter % batch_feedback == 0) or (batch_counter == first_check):
            time_so_far = time.time() - start_time
            minutes = int(time_so_far // 60)
            seconds = int(time_so_far % 60)
            average_progress_loss = train_loss/batch_counter
            progress_acc = train_correctly_specified/(batch_counter*batch_size)
            print('-------------------------------------------')
            print(f'{batch_counter:5} of {total_batch_count:5} batches done after {minutes:3} min {seconds:2} sec')
            print('-------------------------------------------')
            print(f'loss: {average_progress_loss:6.4}   |   acc: {progress_acc:6.4}')
            print('-------------------------------------------')
            if save_memory_usage:
                ! nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits >> memory_usage.csv
            
    # loss
    average_total_loss = train_loss/(len(train_set)/batch_size)
    # accuracy
    total_accuracy = train_correctly_specified/len(train_set) 
    # Predicted and true values
    train_predicted_values = torch.cat(train_predicted_values).cpu().numpy()
    train_true_values = torch.cat(train_true_values).cpu().numpy()
    # Precision
    train_precision = precision_score(train_true_values, train_predicted_values, average='macro')
    # Recall
    train_recall = recall_score(train_true_values, train_predicted_values, average='macro')
    # F1 score
    train_f1_score = f1_score(train_true_values, train_predicted_values, average='macro')
    if return_mse:
        train_mse = mean_squared_error(train_true_values,train_predicted_values)
    else: 
        train_mse = None
    
    # Loading memory usage to get maxium
    if save_memory_usage:
        memory_usage = np.loadtxt('memory_usage.csv', dtype='int', delimiter = ',') # csv-file name
        max_memory_usage = int(np.max(memory_usage))
    else:
        max_memory_usage = None
    
    return average_total_loss, total_accuracy, train_precision, train_recall, train_f1_score, train_mse, max_memory_usage


## Validation Function

In [12]:
##### Function for validation after 1 epoch of training

def val_fct(val_set, batch_size, return_mse=False, return_predicted_values=False):
    '''
    Evaluates the module-level `model` on val_set without updating any weights.

    Relies on the module-level names `model`, `loss_fct` and `device`.

    Parameters:
        val_set: dataset yielding (text, mask, label) triples
        batch_size: number of samples per batch
        return_mse: if True, the mean squared error between true and predicted labels is computed
        return_predicted_values: if True, the predicted labels are returned as numpy array

    Returns:
        tuple (average loss per sample, accuracy, macro precision, macro recall, macro F1 score,
        MSE or None, predicted values or None)
    '''
    print('----------- Start Validation/Testing ----------')
    # Setting model to evaluation mode (dropout is not applied)
    model.eval()
    # creating iterable dataset divided into batches, not shuffled
    data = DataLoader(val_set, batch_size = batch_size)
    # sum of the losses of all samples
    val_loss = 0
    val_predicted_values = []
    val_true_values = []
    # looping over batches
    for text, mask, label in data:
        text, mask, label = text.to(device), mask.to(device), label.to(device)
        # no gradient calculation during validation
        with torch.no_grad():
            logits = model(text, mask)
            # calculating loss
            loss = loss_fct(logits, label)
            predictions = logits.argmax(1)

            # loss_fct is expected to return the mean over the batch (the default reduction of
            # both loss functions set up in this notebook); weighting by the actual batch size
            # keeps the average exact when the last batch is smaller.
            val_loss += loss.item() * label.size(0)
            val_predicted_values.append(predictions)
            val_true_values.append(label)
    
    # loss (average per sample)
    average_val_loss = val_loss/len(val_set)
    # true and predicted values
    val_predicted_values = torch.cat(val_predicted_values).cpu().numpy()
    val_true_values = torch.cat(val_true_values).cpu().numpy()
    # Accuracy
    val_accuracy = (val_predicted_values==val_true_values).sum().item()/len(val_set) 
    # Precision
    val_precision = precision_score(val_true_values, val_predicted_values, average='macro')
    # Recall
    val_recall = recall_score(val_true_values, val_predicted_values, average='macro')
    # F1 score
    val_f1_score = f1_score(val_true_values, val_predicted_values, average='macro')
    # Mean squared error
    if return_mse:
        val_mse = mean_squared_error(val_true_values,val_predicted_values)
    else:
        val_mse = None
    
    if not return_predicted_values:
        val_predicted_values = None

    return average_val_loss, val_accuracy, val_precision, val_recall, val_f1_score, val_mse, val_predicted_values
    

In [13]:
def num_parameters(model):
    '''Returns the total number of elements over all tensors in model.parameters().'''
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total

## Cost sensitive tools

In [14]:
def create_cost_matrix(num_labels):
    '''
    Builds the (num_labels, num_labels) cost matrix whose entry [i, j] grows with the
    distance between label i and label j: 0.2 + 0.2 * |j - i|.
    The diagonal therefore holds 0.2.
    '''
    labels = torch.arange(num_labels)
    # broadcasting a row vector against a column vector yields all pairwise differences
    label_distance = (labels.unsqueeze(0) - labels.unsqueeze(1)).abs()
    return label_distance * 0.2 + 0.2

def cost_sensitive_log_softmax(x, target, cost_matrix):
    '''
    Cost-weighted log softmax, following Khan (2017), III.C.(c) Cost-Sensitive CE loss,
    equation (11).

    For every sample the row of cost_matrix belonging to the predicted label (argmax of x)
    is looked up. The weights are detached, so no gradient flows through them.
    NOTE(review): the weights scale the logits before exponentiation - confirm that this is
    the intended form of equation (11).
    '''
    predicted_label = x.argmax(-1)

    # numerator: every logit of a sample scaled by cost_matrix[predicted_label, target_label]
    target_weight = cost_matrix[predicted_label, target].reshape(-1, 1).detach()
    weighted_logits = x * target_weight

    # denominator: each logit scaled by its own entry of cost_matrix[predicted_label, :]
    row_weights = cost_matrix[predicted_label, :].detach()
    log_normalizer = (x * row_weights).exp().sum(-1).log().unsqueeze(-1)

    # log(numerator / denominator)
    return weighted_logits - log_normalizer

class CostSensitiveCELoss(nn.Module):
    '''
    Loss module that applies the negative log likelihood loss to the output of
    cost_sensitive_log_softmax, using the cost matrix from create_cost_matrix.
    '''
    def __init__(self, num_labels):
        super().__init__()
        self.num_labels = num_labels
        # the cost matrix is built once and placed on the module-level `device`
        cost_matrix = create_cost_matrix(num_labels)
        self.cost_matrix = cost_matrix.to(device)

    def forward(self, input, target):
        '''Returns F.nll_loss of the cost-weighted log softmax of input w.r.t. target.'''
        log_probabilities = cost_sensitive_log_softmax(input, target, self.cost_matrix)
        loss = F.nll_loss(log_probabilities, target)
        return loss

## Preparing Model

In [15]:
##### BERT model class from the transformers library
BertModel = transformers.BertModel

### Device to run the model on: the GPU if CUDA is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model inputs
# hidden_size is the input size of the classifier layer, num_labels its output size
hidden_size = 768
num_labels = 5 
dropout_prob = 0.1

### Hyperparameters
batch_size = 16 
learning_rate = 2e-5
### Use nvidia apex for mixed precision calculations
mixed_precision = True 

In [16]:
##### Initialize BERT from the pretrained 'bert-base-uncased' weights
bert_model = BertModel.from_pretrained('bert-base-uncased') 

##### Initialize the model and move it to the device
model = Model(hidden_size, num_labels, dropout_prob, bert_model, pooled_output=True).to(device)

### Optimizer, choosing learning rate 
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
### Applying mixed precision to speed up model training 
# amp.initialize receives the model and optimizer created above and returns the versions
# that are used from here on, so the order of these statements matters
if mixed_precision:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1") 
### Loss function: cost-sensitive cross entropy if cost_sensitive is set, otherwise standard cross entropy
if cost_sensitive:
    loss_fct = CostSensitiveCELoss(num_labels).to(device)
else: 
    loss_fct = nn.CrossEntropyLoss().to(device)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [17]:
print(f'Number of parameters: {num_parameters(model):,}' )  

Number of parameters: 110,076,677


## Model Training

In [18]:
def model_training_fct(deviation_case, num_epochs, seed):
    '''
    Trains the module-level `model` for num_epochs epochs and evaluates it on the validation
    and test set after every epoch. Metrics are written to scores/metric_scores_{deviation_case}.csv
    and a checkpoint is written to weights/ after every epoch.

    Relies on the module-level names `train_set`, `val_set`, `test_set`, `batch_size`,
    `mixed_precision`, `model` and `optimizer` (and `amp` if mixed_precision is set).

    Parameters:
        deviation_case: name of the experiment, used in the names of all saved files
        num_epochs: number of epochs to train
        seed: seed passed to torch.manual_seed before training starts
    '''
    # Seeding here makes the `seed` argument effective even if the caller did not seed itself;
    # seeding again with the same value right after the caller did changes nothing.
    torch.manual_seed(seed)

    # Neither torch.save nor DataFrame.to_csv create missing directories; make sure they exist
    # before spending hours on the first epoch.
    os.makedirs('weights', exist_ok=True)
    os.makedirs('scores', exist_ok=True)

    ### Dictionary to save metrics (one list entry per epoch)
    metric_names = ['epoch', 'time',
                    'train_loss', 'train_acc', 'train_precision', 'train_recall', 'train_f1_score', 'train_mse',
                    'val_loss', 'val_acc', 'val_precision', 'val_recall', 'val_f1_score', 'val_mse',
                    'test_loss', 'test_acc', 'test_precision', 'test_recall', 'test_f1_score', 'test_mse', 'memory']
    metric_scores = {metric: [] for metric in metric_names}

    for epoch in range(1,num_epochs+1):
        epoch_start_time = time.time()
        # Training for 1 epoch
        train_loss, train_acc, train_precision, train_recall, \
        train_f1_score, train_mse, max_memory_usage = train_fct(train_set,
                                                                batch_size,
                                                                return_mse=True,
                                                                batch_feedback=5000,
                                                                first_check=100,
                                                                mixed_precision = mixed_precision,
                                                                save_memory_usage = True)
        # Validation (the predicted values are not needed here)
        val_loss, val_acc, val_precision, val_recall, \
        val_f1_score, val_mse, _ = val_fct(val_set,
                                           batch_size,
                                           return_mse=True,
                                           return_predicted_values=True)

        # Testing
        test_loss, test_acc, test_precision, test_recall, \
        test_f1_score, test_mse, _ = val_fct(test_set,
                                             batch_size,
                                             return_mse=True,
                                             return_predicted_values=True)

        # Display results
        end_time = time.time() - epoch_start_time
        minutes = int(end_time // 60)
        seconds = int(end_time % 60)
        # Compare against None explicitly so that an MSE of exactly 0 is still displayed
        has_train_mse = train_mse is not None
        print('+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +' + (' + + + + + + +' if has_train_mse else ''))
        print(f'+ Epoch: {epoch} took {minutes:3} min, {seconds:2} sec                             ')
        if max_memory_usage is not None:
            print(f'+ Maximum memory usage: {max_memory_usage:5} MiB')
        print(f'+ (Training)   Loss: {train_loss:6.4}  |  Acc: {train_acc:6.4}  |  F1: {train_f1_score:6.4}  ' + (f'|  MSE: {train_mse:.4}' if has_train_mse else ''))
        print(f'+ (Validation) Loss: {val_loss:6.4}  |  Acc: {val_acc:6.4}  |  F1: {val_f1_score:6.4}  ' + (f'|  MSE: {val_mse:.4}' if val_mse is not None else ''))
        print('+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +' + (' + + + + + + +' if has_train_mse else ''))

        # saving metrics of the current epoch
        current_epoch_scores = {
            'epoch': epoch, 'time': round(end_time/60,2),
            'train_loss': train_loss, 'train_acc': train_acc, 'train_precision': train_precision,
            'train_recall': train_recall, 'train_f1_score': train_f1_score, 'train_mse': train_mse,
            'val_loss': val_loss, 'val_acc': val_acc, 'val_precision': val_precision,
            'val_recall': val_recall, 'val_f1_score': val_f1_score, 'val_mse': val_mse,
            'test_loss': test_loss, 'test_acc': test_acc, 'test_precision': test_precision,
            'test_recall': test_recall, 'test_f1_score': test_f1_score, 'test_mse': test_mse,
            'memory': max_memory_usage,
        }
        for metric in metric_names:
            metric_scores[metric].append(current_epoch_scores[metric])

        # saving model weights
        if mixed_precision:
            checkpoint = {'model': model.state_dict(),
                          'optimizer': optimizer.state_dict(),
                          'amp': amp.state_dict()}

            torch.save(checkpoint, f'weights/amp_checkpoint_{deviation_case}_epoch{epoch}.pt')
        else:
            torch.save(model.state_dict(), f'weights/model_weights_{deviation_case}_epoch{epoch}.pt')

        # Saving the scores collected so far after every epoch (the file is overwritten each
        # time), so that the metrics of finished epochs survive a crash in a later epoch.
        # After the last epoch the file holds the scores of all epochs.
        results = pd.DataFrame(metric_scores)
        results.to_csv(f'scores/metric_scores_{deviation_case}.csv', index=False)
            
            

From the GitHub discussion regarding gradient overflow: "Occasionally seeing a message like 'overflow detected, skipping step, reducing loss scale' is normal behavior with dynamic loss scaling, and it usually happens in the first few iterations because Amp begins by trying a high loss scale." The "Gradient overflow. Skipping step ..." messages in the training output below are therefore expected.

### Rerunning model to decrease variance due to randomness 

In [None]:
### Number of the current run (determines the experiment name and the seed) ###
run = 2  ###
############
# number of training epochs
num_epochs = 3
### name of the experiment, used in the file names of the saved weights and scores
deviation_case = f'allsides_full_rerun_{run}'

# a different seed for every run
seed = 19 + run # =20/21/22

# NOTE(review): model and optimizer are created in an earlier cell, i.e. before this seed is
# set and independently of `run` - re-run that cell to start every run from a fresh model.
torch.manual_seed(seed)
model_training_fct(deviation_case, num_epochs, seed)

-------------------------------------------
  100 of 12345 batches done after   1 min 16 sec
-------------------------------------------
loss:  1.528   |   acc: 0.3275
-------------------------------------------
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
-------------------------------------------
 5000 of 12345 batches done after  63 min 31 sec
-------------------------------------------
loss: 0.7675   |   acc: 0.7234
-------------------------------------------
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
-------------------------------------------
10000 of 12345 batches done after 127 min  2 sec
-------------------------------------------
loss: 0.6345   |   acc: 0.7721
---------

In [None]:
### Stopping instance
# Shell command (IPython `!` syntax): stops the compute instance `t4-instance`
# in zone europe-west4-c via the gcloud command line tool
! gcloud compute instances stop t4-instance --zone=europe-west4-c