# BiLSTM benchmark model

## Setting up environment

In [None]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error
from apex import amp

In [None]:
### Getting GPU type
# The device name can only be queried when CUDA is present; asking for it
# unconditionally raises on CPU-only machines, so check availability first.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print('Is available')
else:
    print('is not available')

In [None]:
# repository root: the parent of the directory the notebook was started from
repo_path = os.path.split(os.getcwd())[0]
# make the deep learning models directory the working directory
os.chdir(os.path.join(repo_path, 'deep_learning_models'))

## Loading and preparing data

In [None]:
##### Loading tensors
# the tensor files are addressed by bare file name, so switch to the data directory first
os.chdir(os.path.join(repo_path, 'data_preparation', 'allsides_data'))

# bias labels of the training, validation, and test split
bias_train, bias_val, bias_test = map(
    torch.load,
    ('allsides_bias_train.pt', 'allsides_bias_val.pt', 'allsides_bias_test.pt'),
)

# text tensors of the training, validation, and test split
# NOTE(review): only the training file carries the 'duplicates_removed' prefix -- confirm
# that it is row-aligned with 'allsides_bias_train.pt'
text_train, text_val, text_test = map(
    torch.load,
    ('allsides_duplicates_removed_contents_text_train.pt',
     'allsides_contents_text_val.pt',
     'allsides_contents_text_test.pt'),
)

# back to the deep learning models directory
os.chdir(os.path.join(repo_path, 'deep_learning_models'))

In [None]:
### removing news aggregators, tabloids, and wrongly labeled source from tensors
os.chdir(os.path.join(repo_path, 'data_preparation','allsides_data'))

# source arrays of the three splits, flattened to one dimension
allsides_source_train = np.load('allsides_source_train.npy', allow_pickle=True).flatten()
allsides_source_val = np.load('allsides_source_val.npy', allow_pickle=True).flatten()
allsides_source_test = np.load('allsides_source_test.npy', allow_pickle=True).flatten()

# sources to be removed:
wrongly_labeled = ['RightWingWatch']
news_aggregators = ['Drudge Report', 'Real Clear Politics', 'Yahoo News', 'RightWingWatch'] 
tabloids = ['New York Daily News', 'Daily Mail', 'New York Post'] 
unwanted_sources = wrongly_labeled + news_aggregators + tabloids


def _unwanted_mask(sources):
    # True at every position whose source equals one of the unwanted sources
    return np.logical_or.reduce([sources == unwanted for unwanted in unwanted_sources])


# boolean arrays marking unwanted sources
boolean_array_train = _unwanted_mask(allsides_source_train)
boolean_array_val = _unwanted_mask(allsides_source_val)
boolean_array_test = _unwanted_mask(allsides_source_test)

# complements: True for every entry that is kept
inverted_boolean_array_train = ~boolean_array_train
inverted_boolean_array_val = ~boolean_array_val
inverted_boolean_array_test = ~boolean_array_test

# apply the keep-masks to bias, text, and sources of each split
bias_train, text_train, allsides_source_train = (
    entries[inverted_boolean_array_train]
    for entries in (bias_train, text_train, allsides_source_train)
)
bias_val, text_val, allsides_source_val = (
    entries[inverted_boolean_array_val]
    for entries in (bias_val, text_val, allsides_source_val)
)
bias_test, text_test, allsides_source_test = (
    entries[inverted_boolean_array_test]
    for entries in (bias_test, text_test, allsides_source_test)
)

os.chdir(os.path.join(repo_path, 'deep_learning_models'))

In [None]:
### Creating training, validation, and test sets for pytorch model
# every dataset pairs the text tensor of a split with the bias labels of the same split
train_set, val_set, test_set = (
    TensorDataset(texts, labels)
    for texts, labels in ((text_train, bias_train),
                          (text_val, bias_val),
                          (text_test, bias_test))
)

## Model Classes

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """
    Compute 'Scaled Dot Product Attention' from 'Attention is all you need'

    query, key, value: tensors whose last dimension holds the features; the last two dimensions 
                       of key are transposed for the dot product with query
    mask: optional tensor broadcastable to the attention scores; positions where mask == 0 
          receive (numerically) zero attention weight
    dropout: optional callable (e.g. nn.Dropout) applied to the attention weights

    Returns the attention-weighted values and the attention weights
    """  
    hidden_size = query.size(-1)
    attention_scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(hidden_size)
    if mask is not None:
        # Use the smallest finite value of the score dtype instead of a fixed constant, so the 
        # fill value neither overflows in half precision nor yields NaN for fully masked rows
        fill_value = torch.finfo(attention_scores.dtype).min
        attention_scores = attention_scores.masked_fill(mask == 0, fill_value)
    attention_weights = F.softmax(attention_scores, dim = -1)
    if dropout is not None:
        attention_weights = dropout(attention_weights)
    return torch.matmul(attention_weights, value), attention_weights

In [None]:
class LSTMBlock(nn.Module):
    """
    Builds block consisting of an LSTM and a fully connected layer

    Optionally applies single-headed attention (following Merity 2019) to the LSTM output. 
    Unless the block is the last layer, a "boom" feed-forward layer maps the output back to hidden_size.
    """
    def __init__(self, hidden_size, dropout_prob, apply_attention=False, last_layer=False, bidirectional=True):
        """
        hidden_size: Number of hidden neurons in LSTM and number of feature inputs (size of embedding/size of 
                     hidden layer output in stacked LSTM )
        dropout_prob: Dropout probability used on the attention weights and inside the "boom" layer
        apply_attention: Indicates whether to apply attention to the LSTM output
        last_layer: Indicates whether to add "boom" layer (False) or not (True)
        bidirectional: Indicates whether the LSTM runs in both directions
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.directions = 1 + bidirectional
        self.last_layer = last_layer
        self.apply_attention = apply_attention
        
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=bidirectional)

        self.layer_norm_input = nn.LayerNorm(hidden_size)
        self.layer_norm_lstm = nn.LayerNorm(self.directions*hidden_size)
        
        if apply_attention:
            # NOTE: the spelling "querry" is kept on purpose, the attribute names are state_dict keys 
            # of already saved checkpoints
            self.layer_norm_querry = nn.LayerNorm(self.directions*hidden_size, eps=1e-12)
            self.layer_norm_key = nn.LayerNorm(self.directions*hidden_size)
            self.layer_norm_value = nn.LayerNorm(self.directions*hidden_size)

            self.qs = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))
            self.ks = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))
            self.vs = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))

            self.querry_projection = nn.Linear(self.directions*hidden_size,self.directions*hidden_size)
            self.linear_over_param = nn.Linear(self.directions*hidden_size, 2*self.directions*hidden_size) 

        # Dropout is used by the attention step as well as by the "boom" layer, so it has to exist 
        # for every configuration (it holds no parameters, the state_dict stays unchanged)
        self.dropout = nn.Dropout(dropout_prob)

        if not last_layer: 
            self.boom_a = nn.Linear(hidden_size*self.directions,4*self.directions*hidden_size) 
            self.activation = nn.GELU() 
            self.boom_b = nn.Linear(4*self.directions*hidden_size, hidden_size) 
    
    def forward(self, feature_input):
        """
        feature_input: tensor of shape (batch, seq_len, hidden_size)

        Returns (lstm_out, last_hidden) if the block is the last layer, 
        otherwise the "boom" layer output of shape (batch, seq_len, hidden_size)
        """
        ### Adjust batch size in case of last batch being shorter 
        current_batch_size = feature_input.shape[0]
        # Initilize hidden and cell state on the device of the input (no dependency on a global device)
        hidden_0 = torch.zeros(self.directions, current_batch_size, self.hidden_size, 
                               device=feature_input.device)
        # Applying layer normalization to input
        feature_input = self.layer_norm_input(feature_input)
        # LSTM layer with embeddings/hidden-state inputs 
        lstm_out, (last_hidden, _) = self.lstm(feature_input, (hidden_0, hidden_0))

        if self.apply_attention:
            # Taken from Merity (2019):
            # matrix multiplication and layer normalization on querry
            querry = self.layer_norm_querry(self.querry_projection(lstm_out))
            # only layer normalization on key and value
            key = self.layer_norm_key(lstm_out)
            value = self.layer_norm_value(lstm_out)
            # activation of parameter vectors
            qs, ks, vs = torch.sigmoid(self.qs), torch.sigmoid(self.ks), torch.sigmoid(self.vs) 
            # over parameterizing of value parameter vector, using forget gate and candidate (Merity 2019, 6.4)
            candidate, forget = self.linear_over_param(vs).split(self.directions*self.hidden_size, dim=-1) 
            vs = torch.sigmoid(forget) * torch.tanh(candidate) 
            # multiplying parameter vectors with querry, key, and value respectively
            q, k, v = qs*querry, ks*key, vs*value 
            # apply scaled dot product attention (the attention weights are not needed here)
            lstm_out, _ = attention(q, k, v, dropout=self.dropout)

        # Applying layer normalization to lstm output
        lstm_out = self.layer_norm_lstm(lstm_out)

        if self.last_layer:
            return lstm_out, last_hidden

        # big fully connected layer taking shape(batch, seq_len, num_directions * hidden_size) 
        # and returning shape(batch, seq_len, hidden_size)
        return self.boom_b(self.dropout(self.activation(self.boom_a(lstm_out))))

In [None]:
class Model(nn.Module):
    """
    Stacked LSTM classifier: embedding layer, num_layers LSTM blocks, and a fully connected classifier 
    that is applied to the last hidden state of the final LSTM block
    """
    def __init__(self, hidden_size, num_labels, num_layers, 
                 vocabulary_size, dropout_prob = 0.1, bidirectional = True, attention_layer=False):
        """
        hidden_size: Number or hidden neurons in LSTM, also used for embedding size
        num_labels: Number of target labels
        num_layers: Number of stacked LSTM blocks (at least 1)
        vocabulary_size: Number of rows of the embedding matrix
        dropout_prob: Dropout probability used in the blocks and in the classifier
        bidirectional: Indicates whether the LSTMs run in both directions
        attention_layer: Indicates whether the second last block applies attention
        """
        super().__init__()
        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.num_layers = num_layers
        self.directions = 1 + bidirectional
        self.attention_layer = attention_layer
        
        self.embedding = nn.Embedding(vocabulary_size, hidden_size, padding_idx=0)

        # The blocks have to use the same directionality the classifier input size is derived from, 
        # therefore bidirectional is handed down to every block
        self.blocks = nn.ModuleList()
        for i in range(num_layers):
            # last layer
            if i==num_layers-1:
                self.blocks.append(LSTMBlock(hidden_size, dropout_prob=0, last_layer=True, 
                                             bidirectional=bidirectional)) 
            # second last layer with attention
            elif i==num_layers-2:
                self.blocks.append(LSTMBlock(hidden_size, dropout_prob=dropout_prob, 
                                             apply_attention=self.attention_layer, 
                                             bidirectional=bidirectional))
            # other layers
            else:
                self.blocks.append(LSTMBlock(hidden_size, dropout_prob=dropout_prob, 
                                             bidirectional=bidirectional)) 

        
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier_a = nn.Linear(hidden_size*self.directions,4*self.directions*hidden_size) 
        self.activation = nn.GELU() 
        self.classifier_b = nn.Linear(4*self.directions*hidden_size, hidden_size) 
        self.classifier_c = nn.Linear(hidden_size, num_labels) 
        
    def forward(self, text):
        """
        text: tensor of token indices that is fed to the embedding layer

        Returns the classifier output with num_labels values per sample
        """
        block_out = self.embedding(text)

        ### LSTM + "Boom"-layer blocks
        # all blocks but the last one return a single tensor
        for block in self.blocks[:-1]:
            block_out = block(block_out)
        # the last block returns (lstm_out, last_hidden); this also covers a model with a single block
        _, last_hidden = self.blocks[-1](block_out)

        # adjust last hidden state output from (directions, batch_size, hidden_size) to shape 
        # (batch_size, directions*hidden_size), i.e. concatenating the directions
        last_hidden = torch.cat(tuple(last_hidden), dim=1) 
        
        ### Classifier layer
        output = self.classifier_b(self.dropout(self.activation(self.classifier_a(last_hidden))))
        output = self.classifier_c(output)
        return output

## Training Function

In [None]:
def train_fct(train_set, batch_size, optimizer, return_mse=True, batch_feedback=500, first_check=10, mixed_precision=False, 
              save_memory_usage=False):

    start_time = time.time()
    # Setting model to train mode (so dropout is applied)
    model.train()
    # creating iterable dataset devided into batches and shuffled
    data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # tracking batches, loss, accuracy
    total_batch_count = int(len(train_set)/batch_size)
    batch_counter = 0
    train_loss = 0
    train_correctly_specified = 0
    train_predicted_values = []
    train_true_values = []
    
    # Tracking memory usage
    if save_memory_usage:
        ! nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -f memory_usage.csv 

    # looping over batches
    for text, label in data:
        # sending tensors to GPU
        text, label = text.to(device), label.to(device)
        # clearing gradients
        optimizer.zero_grad()
        # run through model
        output = model(text)
        # calculating loss    
        loss = loss_fct(output, label)
        # backpropagation
        if mixed_precision:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        
        # updating weights
        optimizer.step()
 
        # loss and metrices messures
        train_loss += loss.item()
        train_correctly_specified += (output.argmax(1) == label).sum().item()
        
        train_predicted_values.append(output.argmax(1))
        train_true_values.append(label)
        
        batch_counter += 1
        
        if (batch_counter % batch_feedback == 0) or (batch_counter == first_check):
            time_so_far = time.time() - start_time
            minutes = int(time_so_far // 60)
            seconds = int(time_so_far % 60)
            average_progress_loss = train_loss/batch_counter
            progress_acc = train_correctly_specified/(batch_counter*batch_size)  
            print('-------------------------------------------------')
            print(f'{batch_counter:5} of {total_batch_count:5} batches done after {minutes:3} min {seconds:2} sec')
            print('-------------------------------------------------')
            print(f'loss: {average_progress_loss:6.4}   |   acc: {progress_acc:6.4}')
            print('-------------------------------------------------')
            #adding memory value
            if save_memory_usage:
                ! nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits >> memory_usage.csv

    
    # loss
    average_total_loss = train_loss/(len(train_set)/batch_size)
    # accuracy
    total_accuracy = train_correctly_specified/len(train_set) 
    # Predicted and true values
    train_predicted_values = torch.cat(train_predicted_values).cpu().numpy()
    train_true_values = torch.cat(train_true_values).cpu().numpy()
    # Precision
    train_precision = precision_score(train_true_values, train_predicted_values, average='macro')
    # Recall
    train_recall = recall_score(train_true_values, train_predicted_values, average='macro')
    # F1 score
    train_f1_score = f1_score(train_true_values, train_predicted_values, average='macro')
    # Mean Squared Error
    if return_mse:
        train_mse = mean_squared_error(train_true_values,train_predicted_values)
    else: 
        train_mse = None
    
    # Loading memory usage to get maxium
    if save_memory_usage:
        memory_usage = np.loadtxt('memory_usage.csv', dtype='int', delimiter = ',') # csv-file name
        max_memory_usage = int(np.max(memory_usage))
    else:
        max_memory_usage = None

    return average_total_loss, total_accuracy, train_precision, train_recall, train_f1_score, train_mse, max_memory_usage
 

In [None]:
def num_parameters(model):
    """
    Counts the elements of all parameter tensors of model
    """
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total

## Validation/Testing function

In [None]:
##### Function for validation after 1 epoch of training
def val_fct(val_set, batch_size, return_mse=True):
    """
    Evaluates the module-level `model` on val_set without updating any weights.

    Relies on the module-level names `model`, `loss_fct`, and `device`.

    val_set: dataset yielding (text, label) pairs
    batch_size: number of samples per batch
    return_mse: whether to compute the mean squared error of predicted vs. true labels

    Returns: average loss per batch, accuracy, macro precision, macro recall, macro F1 score, MSE (or None)
    """
    print('----------- Validation/Test Start -----------')
    # Setting model to evaluation mode (dropout is not applied)
    model.eval()
    # creating iterable dataset devided into batches, not shuffeled
    data = DataLoader(val_set, batch_size = batch_size)
    # setting up loss and accuracy variables
    val_loss = 0
    val_predicted_values = []
    val_true_values = []
    # no gradient calculation during validation
    with torch.no_grad():
        # looping over batches
        for text, label in data:
            text, label = text.to(device), label.to(device)
            output = model(text)
            loss = loss_fct(output, label)

            val_loss += loss.item()
            val_predicted_values.append(output.argmax(1))
            val_true_values.append(label)
    
    # loss: val_loss is a sum of per-batch losses, so it is averaged over the number of batches 
    # (the fractional value len(val_set)/batch_size overstates the average when the last batch is shorter)
    average_val_loss = val_loss/len(data)
    # true and predicted values
    val_predicted_values = torch.cat(val_predicted_values).cpu().numpy()
    val_true_values = torch.cat(val_true_values).cpu().numpy()
    # Accuracy
    val_accuracy = (val_predicted_values==val_true_values).sum().item()/len(val_set) 
    # Precision
    val_precision = precision_score(val_true_values, val_predicted_values, average='macro')
    # Recall
    val_recall = recall_score(val_true_values, val_predicted_values, average='macro')
    # F1 score
    val_f1_score = f1_score(val_true_values, val_predicted_values, average='macro')
    # Mean squared error
    if return_mse:
        val_mse = mean_squared_error(val_true_values,val_predicted_values)
    else:
        val_mse = None
        
    return average_val_loss, val_accuracy, val_precision, val_recall, val_f1_score, val_mse
    

In [None]:
### Device to run model on: the GPU when CUDA is available, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Model inputs
hidden_size = 512          # hidden neurons per LSTM, also the embedding size
num_labels = 5             # number of target labels
num_layers = 4             # number of stacked LSTM blocks
vocabulary_size = 30522    # rows of the embedding matrix
dropout_prob = 0.1
bidirectional = True
attention_layer = True     # attention in the second last block

### Hyperparameters
# batch size; 16 is an alternative to save memory
batch_size = 64
# one learning rate per epoch: three epochs with 2e-5 followed by three epochs with 1e-5
learning_rates_list = [2e-5] * 3 + [1e-5] * 3

### Use of nvidia apex for mixed precision calculations
mixed_precision = True

In [None]:
##### Build the model from the configured inputs and move it to the selected device
model = Model(
    hidden_size=hidden_size,
    num_labels=num_labels,
    num_layers=num_layers,
    vocabulary_size=vocabulary_size,
    dropout_prob=dropout_prob,
    bidirectional=bidirectional,
    attention_layer=attention_layer,
)
model = model.to(device)

### Loss function: cross entropy, placed on the same device as the model
loss_fct = nn.CrossEntropyLoss()
loss_fct = loss_fct.to(device)

In [None]:
def model_training_fct(deviation_case, num_epochs, seed):
    '''
    Function to train model for a given number of epochs and saving all necessery figures and model weights

    Relies on the module-level names `model`, `learning_rates_list`, `mixed_precision`, `batch_size`, 
    `train_set`, `val_set`, and `test_set`.

    deviation_case: name of the experiment, used in the file names of the saved weights and scores
    num_epochs: number of epochs to train
    seed: seed for the torch random number generators (skipped if None)

    Model weights are saved after every epoch to 'dl_benchmark_weights/', the metric scores are saved 
    after the final epoch to 'dl_benchmark_scores/'.
    '''
    global model
    # making the run reproducible from the seed handed to this function
    if seed is not None:
        torch.manual_seed(seed)

    ### Dictionary to save metrices (key order defines the column order of the saved csv file)
    metric_names = ['epoch', 'time', 
                    'train_loss', 'train_acc', 'train_precision', 'train_recall', 'train_f1_score', 'train_mse',
                    'val_loss', 'val_acc', 'val_precision', 'val_recall', 'val_f1_score', 'val_mse',
                    'test_loss', 'test_acc', 'test_precision', 'test_recall', 'test_f1_score', 'test_mse', 'memory']
    metric_scores = {metric: [] for metric in metric_names}

    print(f'--- Number of parameters: {num_parameters(model):,} ---') 

    if num_epochs < 1:
        return

    # creating the output directories up front, so a missing directory does not abort the run 
    # after an epoch of training
    os.makedirs('dl_benchmark_weights', exist_ok=True)
    os.makedirs('dl_benchmark_scores', exist_ok=True)

    # Optimizer and amp are set up once: amp.initialize is meant to be called a single time, and 
    # keeping one optimizer preserves the Adam moment estimates across epochs. 
    # The learning rate of each epoch is set on the parameter groups below.
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rates_list[0])
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1") 

    for epoch in range(1,num_epochs+1):
        epoch_start_time = time.time()
        # choose learning rate for this epoch (the last rate is reused if the list is shorter than num_epochs)
        learning_rate = learning_rates_list[min(epoch, len(learning_rates_list))-1]
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate
        print(f'+ Learning rate used in epoch {epoch}: {learning_rate} +')

        # Training for 1 epoch
        train_loss, train_acc, train_precision, train_recall, \
        train_f1_score, train_mse, max_memory_usage = train_fct(train_set, 
                                                                batch_size,
                                                                optimizer,
                                                                batch_feedback=500, 
                                                                first_check=100, 
                                                                mixed_precision = mixed_precision,
                                                                save_memory_usage = True)     
        # Validation
        val_loss, val_acc, val_precision, val_recall, val_f1_score, val_mse = val_fct(val_set, batch_size)
        
        # Testing
        test_loss, test_acc, test_precision, test_recall, test_f1_score, test_mse = val_fct(test_set, batch_size)
        
        # Display progress
        end_time = time.time() - epoch_start_time
        minutes = int(end_time // 60)
        seconds = int(end_time % 60)
        print('+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +')
        print(f'+ Epoch: {epoch} took {minutes:3} min, {seconds:2} sec                             +')
        # the memory usage is None if it was not recorded
        if max_memory_usage is not None:
            print(f'+ Maximum memory usage: {max_memory_usage:5} MiB                           +')
        print(f'+ (Training)   Loss: {train_loss:6.4}  |  Acc: {train_acc:6.4}  |  F1: {train_f1_score:6.4}  +')
        print(f'+ (Validation) Loss: {val_loss:6.4}  |  Acc: {val_acc:6.4}  |  F1: {val_f1_score:6.4}  +')
        print('+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +')

        # saving metrices
        current_epoch_score_values = [epoch, round(end_time/60,2), 
                                      train_loss, train_acc, train_precision, train_recall, train_f1_score, train_mse,
                                      val_loss, val_acc, val_precision, val_recall, val_f1_score, val_mse,
                                      test_loss, test_acc, test_precision, test_recall, test_f1_score, test_mse, 
                                      max_memory_usage]
        for metric,value in zip(metric_names, current_epoch_score_values):
            metric_scores[metric].append(value)

        # saving model weights 
        if mixed_precision:
            checkpoint = {'model': model.state_dict(),
                          'optimizer': optimizer.state_dict(),
                          'amp': amp.state_dict()}

            torch.save(checkpoint, f'dl_benchmark_weights/amp_checkpoint_{deviation_case}_epoch{epoch}.pt')
        else:
            torch.save(model.state_dict(), f'dl_benchmark_weights/model_weights_{deviation_case}_epoch{epoch}.pt')

        # saving final scores
        if epoch==num_epochs:
            results = pd.DataFrame(metric_scores)
            results.to_csv(f'dl_benchmark_scores/metric_scores_{deviation_case}.csv', index=False)

In [None]:
### number of this run; it determines the experiment name and the seed
run = 1
### number of training epochs
num_epochs = 5
### seed derived from the run number: 20, 21, 22 for run 1, 2, 3
seed = run + 19
### name experiment
deviation_case = f'dl_benchmark_allsides_all_removed_rerun_{run}'

### seed the torch random number generators, then train
torch.manual_seed(seed)
model_training_fct(deviation_case=deviation_case, num_epochs=num_epochs, seed=seed)

In [None]:
### Stopping instance
# IPython shell escape (only works inside a notebook/IPython kernel): asks gcloud to stop 
# the compute instance 't4-instance' in zone europe-west4-c
! gcloud compute instances stop t4-instance --zone=europe-west4-c