# Predicting on the Semantic Evaluation (SemEval) 2019 dataset

In [1]:
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error

In [2]:
# Parent directory of the current working directory (head part of the cwd path).
repo_path = os.path.split(os.getcwd())[0]

## Choose model type

In [12]:
# --- Model selection ---------------------------------------------------
# Architecture to evaluate: 'bilstm' or 'bert'.
model_type = 'bilstm'

# --- Weights selection -------------------------------------------------
# Name fragment used to build the checkpoint file names. Alternatives:
#   'allsides_aggregators_tabloids_duplicates_removed'
#   'allsides_full'
file_name = 'dl_benchmark_allsides_all_removed'

## Load data

In [5]:
# Loading the pre-built SemEval tensors (labels, token ids, attention masks).
# map_location='cpu' keeps the tensors on the CPU regardless of the device they were
# saved from: the evaluation code calls `.numpy()` on the labels (which fails for CUDA
# tensors), and the batches are moved to the model device inside the prediction loop.
bias_semeval = torch.load('semeval_bias_tensor.pt', map_location='cpu')
text_semeval = torch.load('semeval_contents_text_tensor.pt', map_location='cpu')
mask_semeval = torch.load('semeval_contents_mask_tensor.pt', map_location='cpu')

In [13]:
# BERT additionally consumes the attention mask; the BiLSTM only needs the token ids.
# The label tensor always comes last.
if model_type == 'bert':
    dataset_tensors = (text_semeval, mask_semeval, bias_semeval)
else:
    dataset_tensors = (text_semeval, bias_semeval)
semeval_set = TensorDataset(*dataset_tensors)

## Model classes

In [14]:
### BiLSTM subclasses/functions
def attention(query, key, value, mask=None, dropout=None):
    """
    Compute 'Scaled Dot Product Attention' from 'Attention is all you need'

    query, key, value: Tensors whose last dimension is the feature size. Scores are computed as
                       query @ key^T over the last two dimensions and scaled by sqrt(feature size)
    mask: Optional tensor broadcastable to the score shape (..., query_len, key_len). Positions
          where mask == 0 are excluded from the softmax (they get zero attention weight)
    dropout: Optional callable (e.g. nn.Dropout) applied to the attention weights

    Returns a tuple (attended values, attention weights)
    """
    hidden_size = query.size(-1)
    attention_scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(hidden_size)
    if mask is not None:
        # Use the most negative finite value instead of -inf: it is representable in every
        # floating dtype and a fully masked row yields a uniform distribution rather than NaN.
        masked_value = torch.finfo(attention_scores.dtype).min
        attention_scores = attention_scores.masked_fill(mask == 0, masked_value)
    attention_weights = F.softmax(attention_scores, dim=-1)
    if dropout is not None:
        attention_weights = dropout(attention_weights)
    return torch.matmul(attention_weights, value), attention_weights

class LSTMBlock(nn.Module):
    """
    Builds block consisting of an LSTM and a fully connected layer

    Data flow: layer norm on the input -> LSTM -> (optional) single-headed attention over the
    LSTM outputs -> layer norm -> (unless last layer) "boom" feed-forward layer that maps the
    (directions * hidden_size) features back to hidden_size.
    """
    def __init__(self, hidden_size, dropout_prob, apply_attention=False, last_layer=False, bidirectional=True):
        """
        hidden_size: Number of hidden neurons in LSTM and number of feature inputs (size of embedding/size of 
                     hidden layer output in stacked LSTM )
        dropout_prob: Dropout probability used in the "boom" layer and on the attention weights
        apply_attention: Whether to apply attention on the LSTM outputs
        last_layer: Indicates whether to add "boom" layer (False) or not (True)
        bidirectional: Whether the LSTM runs in both directions
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.directions = 1 + bidirectional
        self.last_layer = last_layer
        self.apply_attention = apply_attention
        
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=bidirectional)

        self.layer_norm_input = nn.LayerNorm(hidden_size)
        self.layer_norm_lstm = nn.LayerNorm(self.directions*hidden_size)

        # Always created (it holds no parameters, so checkpoints are unaffected): the attention
        # path needs it even when the block is the last layer and has no "boom" layer.
        self.dropout = nn.Dropout(dropout_prob)
        
        # NOTE: attribute names (including the "querry" spelling) are state_dict keys of
        # existing checkpoints and must not be renamed.
        if apply_attention:
            self.layer_norm_querry = nn.LayerNorm(self.directions*hidden_size, eps=1e-12)
            self.layer_norm_key = nn.LayerNorm(self.directions*hidden_size)
            self.layer_norm_value = nn.LayerNorm(self.directions*hidden_size)

            # learned scaling vectors for query, key and value
            self.qs = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))
            self.ks = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))
            self.vs = nn.Parameter(torch.zeros(size=(1, 1, self.directions*hidden_size), dtype=torch.float))

            self.querry_projection = nn.Linear(self.directions*hidden_size,self.directions*hidden_size)
            self.linear_over_param = nn.Linear(self.directions*hidden_size, 2*self.directions*hidden_size) 

        if not last_layer: 
            self.boom_a = nn.Linear(hidden_size*self.directions,4*self.directions*hidden_size) 
            self.activation = nn.GELU() 
            self.boom_b = nn.Linear(4*self.directions*hidden_size, hidden_size) 
    
    def forward(self, feature_input):
        """
        feature_input: Tensor of shape (batch, seq_len, hidden_size)

        Returns (lstm_out, last_hidden) if this is the last layer, with lstm_out of shape
        (batch, seq_len, directions * hidden_size) and last_hidden of shape
        (directions, batch, hidden_size); otherwise the "boom" layer output of shape
        (batch, seq_len, hidden_size).
        """
        ### Adjust batch size in case of last batch being shorter 
        current_batch_size = feature_input.shape[0]
        # Initialize hidden and cell state with zeros on the same device/dtype as the input
        # (no dependency on a module-level `device` variable)
        hidden_0 = feature_input.new_zeros(self.directions, current_batch_size, self.hidden_size)
        # Applying layer normalization to input
        feature_input = self.layer_norm_input(feature_input)
        # LSTM layer with embeddings/hidden-state inputs 
        lstm_out, (last_hidden,last_cell) = self.lstm(feature_input, (hidden_0,hidden_0))

        if self.apply_attention:
            # Taken from Merity (2019):
            # matrix multiplication and layer normalization on query
            querry = self.layer_norm_querry(self.querry_projection(lstm_out))
            # only layer normalization on key and value
            key = self.layer_norm_key(lstm_out)
            value = self.layer_norm_value(lstm_out)
            # activation of parameter vectors
            qs, ks, vs = torch.sigmoid(self.qs), torch.sigmoid(self.ks), torch.sigmoid(self.vs) 
            # over parameterizing of value parameter vector, using forget gate and candidate (Merity 2019, 6.4)
            candidate, forget = self.linear_over_param(vs).split(self.directions*self.hidden_size, dim=-1) 
            vs = torch.sigmoid(forget) * torch.tanh(candidate) 
            # multiplying parameter vectors with query, key, and value respectively
            q, k, v, = qs*querry, ks*key, vs*value 
            # apply scaled dot product attention
            lstm_out, attention_weights = attention(q,k,v, dropout=self.dropout)

        # Applying layer normalization to lstm output
        lstm_out = self.layer_norm_lstm(lstm_out)

        if self.last_layer:
            return lstm_out, last_hidden
        else:
            # big fully connected layer taking shape(batch, seq_len, num_directions * hidden_size) 
            # and returning shape(batch, seq_len, hidden_size)
            boom_out = self.boom_b(self.dropout(self.activation(self.boom_a(lstm_out))))
            return boom_out

### Choosing whether bert or bilstm is the model used
if model_type == 'bert':
    class Model(nn.Module):
        """
        BERT encoder followed by a classification head.
        """
        def __init__(self, hidden_size, num_labels, droput_prob, bert_model_module, output_attentions=False, pooled_output = True):
            """
            hidden_size: Size of the BERT hidden states (input size of the classification head)
            num_labels: Number of target labels
            droput_prob: Dropout probability applied before the classifier (parameter name kept
                         as is for backward compatibility with keyword callers)
            bert_model_module: BERT module used as encoder
            output_attentions: If True, forward additionally returns bert_out[2]
            pooled_output: If True classify on bert_out[1], otherwise on the averaged last layer
            """
            super().__init__()
            self.hidden_size = hidden_size
            self.num_labels = num_labels
            self.output_attentions = output_attentions
            self.pooled_output = pooled_output

            self.bert = bert_model_module
            # use the constructor argument, not a module-level variable of similar name
            self.dropout = nn.Dropout(droput_prob)
            self.linear = nn.Linear(hidden_size,hidden_size)
            self.tanh = nn.Tanh()

            self.classifier_layer = nn.Linear(hidden_size, num_labels) # The values are initialized from U(−sqrt(k),sqrt(k)), where k=1/in_features

        def forward(self, text, mask):
            """
            text: Token ids passed to BERT as input_ids
            mask: Attention mask passed to BERT

            Returns the classifier output, or a tuple (classifier output, bert_out[2]) if
            output_attentions is True.
            """
            # token_type_ids and position_ids are created automatically
            bert_out = self.bert(input_ids = text, attention_mask = mask)

            if self.pooled_output:
                ### Pooled Output
                # bert_out[1]: presumably BERT's pooled CLS output (linear layer + TanH) -- confirm
                pooled_out = bert_out[1]
                # Applying dropout
                pooled_out = self.dropout(pooled_out)

                out = self.classifier_layer(pooled_out)
            else:
                ### Last Layer average
                # summing up over sequence length and divide by unmasked sequence length 
                # resulting in tensor with shape (batch_size,hidden_size)
                # NOTE(review): the sum runs over all positions, including masked ones, while the
                # divisor only counts unmasked positions. Kept unchanged so that weights trained
                # with this computation still behave the same.
                last_layer = torch.sum(bert_out[0], dim=1)/torch.sum(mask,dim=1).reshape([len(mask),1])
                last_layer = self.tanh(self.linear(last_layer))
                last_layer = self.dropout(last_layer)
                out = self.classifier_layer(last_layer)

            # Returning attention layer outputs if set True
            # (assumes the BERT module is configured to return attentions as third output -- confirm)
            if self.output_attentions:
                out = out, bert_out[2]

            return out
        
else:
    class Model(nn.Module):
        """
        Stack of LSTMBlocks on top of an embedding layer, followed by a classification head
        applied to the last hidden state of the final LSTM.
        """
        def __init__(self, seq_length, hidden_size, num_labels, num_layers, 
                     vocabulary_size, dropout_prob = 0.1, bidirectional = True, attention_layer=False):
            """
            seq_length: Length of input sequence (Text) NOT USED
            hidden_size (==embedding_size): Number or hidden neurons in LSTM, also used for embedding size
            num_labels: Number of target labels
            num_layers: Number of stacked LSTMBlocks
            vocabulary_size: Number of rows of the embedding matrix
            dropout_prob: Dropout probability used in the blocks and the classifier
            bidirectional: Whether the LSTMs run in both directions
            attention_layer: Whether the second last block applies attention
            """
            super().__init__()
            self.hidden_size = hidden_size
            self.num_labels = num_labels
            self.num_layers = num_layers
            self.directions = 1 + bidirectional
            self.attention_layer = attention_layer

            self.embedding = nn.Embedding(vocabulary_size, hidden_size, padding_idx=0)

            # `bidirectional` is forwarded to every block so that the block output sizes agree
            # with self.directions used by the classifier below.
            self.blocks = nn.ModuleList()
            for i in range(num_layers):
                # last layer
                if i==num_layers-1:
                    self.blocks.append(LSTMBlock(hidden_size, dropout_prob=0, last_layer=True,
                                                 bidirectional=bidirectional)) 
                # second last layer with attention
                elif i==num_layers-2:
                    self.blocks.append(LSTMBlock(hidden_size, dropout_prob=dropout_prob,
                                                 apply_attention=self.attention_layer,
                                                 bidirectional=bidirectional))
                # other layers
                else:
                    self.blocks.append(LSTMBlock(hidden_size, dropout_prob=dropout_prob,
                                                 bidirectional=bidirectional)) 


            self.dropout = nn.Dropout(dropout_prob)
            self.classifier_a = nn.Linear(hidden_size*self.directions,4*self.directions*hidden_size) 
            self.activation = nn.GELU() # nn.Tanh()
            self.classifier_b = nn.Linear(4*self.directions*hidden_size, hidden_size) 
            self.classifier_c = nn.Linear(hidden_size, num_labels) 

        def forward(self, text):
            """
            text: Tensor of token ids, shape (batch_size, seq_len)

            Returns the classifier output of shape (batch_size, num_labels)
            """
            ### Embeddings
            block_out = self.embedding(text)

            ### LSTM + "Boom"-layer blocks
            # every block but the last returns a single tensor that feeds the next block
            for block in self.blocks[:-1]:
                block_out = block(block_out)
            # the last block returns (lstm_out, last_hidden); this also covers the single-layer case
            lstm_out, last_hidden = self.blocks[-1](block_out)

            if self.directions==2:
                # adjust last hidden state output to shape (batch_size,directions*hidden_size), i.e. concatenating both directions
                last_hidden = torch.cat((last_hidden[0,:,:],last_hidden[1,:,:]), dim=1) 
            else:
                # drop the leading direction dimension: (1, batch_size, hidden_size) -> (batch_size, hidden_size)
                last_hidden = last_hidden[0,:,:]

            ### Classifier layer
            output = self.classifier_b(self.dropout(self.activation(self.classifier_a(last_hidden))))
            output = self.classifier_c(output)
            return output

## Preparing model

In [15]:
### Device to run model on, either GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Model inputs shared by both architectures
num_labels = 5
dropout_prob = 0.1

if model_type != 'bert':
    ### BiLSTM: model inputs
    seq_length = 500
    hidden_size = 512
    num_layers = 4
    vocabulary_size = 30522
    bidirectional = True
    attention_layer = True

    ### BiLSTM: hyperparameters
    batch_size = 64

    model = Model(
        seq_length, hidden_size, num_labels, num_layers, vocabulary_size,
        dropout_prob, bidirectional, attention_layer,
    )
    model = model.to(device)
else:
    BertModel = transformers.BertModel

    ### BERT: model inputs
    hidden_size = 768
    cost_sensitive = False

    ### BERT: hyperparameters
    batch_size = 16
    learning_rate = 2e-5

    # pretrained encoder wrapped by the classification model
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    model = Model(hidden_size, num_labels, dropout_prob, bert_model, pooled_output=True)
    model = model.to(device)


## Predicting on SemEval set

In [17]:
# Number of independently trained checkpoints ("reruns") to evaluate
num_runs = 3
# row 0: accuracy, row 1: F1 score; one column per run
results_per_run = np.zeros((2, num_runs))

# Checkpoint folder and epoch suffix depend on the model type
if model_type == 'bert':
    weights_dir, checkpoint_epoch = 'weights', 3
else:
    weights_dir, checkpoint_epoch = 'dl_benchmark_weights', 5

# True SemEval labels as numpy array (moved to CPU first so that .numpy() is always valid)
true_labels = bias_semeval.cpu().numpy()

for run in range(1, num_runs + 1):
    checkpoint_path = os.path.join(repo_path, 'deep_learning_models', weights_dir,
                                   f'amp_checkpoint_{file_name}_rerun_{run}_epoch{checkpoint_epoch}.pt')
    # The loaded checkpoint must be assigned for both model types; map_location allows
    # loading GPU-trained weights on whatever device is used here.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    model.load_state_dict(checkpoint['model'])

    ### Get predicted values 
    model.eval()
    start_time = time.time()

    data = DataLoader(semeval_set, batch_size=batch_size)
    test_predicted_values = []

    with torch.no_grad():
        for batch in data:
            # The label is always the last tensor of a batch; everything before it is model
            # input (text and mask for bert, text only for the bilstm).
            *model_inputs, _ = batch
            output = model(*(tensor.to(device) for tensor in model_inputs))
            test_predicted_values.append(output.argmax(1))

    # fixed-point format; the former ':.1' printed e.g. '2e+01'
    print(f'Done. Took {(time.time()-start_time):.1f} seconds')
    
    test_predicted_values = torch.cat(test_predicted_values).cpu().numpy()
    
    # converting allsides labels (5) to semeval labels (2):
    # classes 0 and 4 are mapped to 1, classes 1, 2 and 3 are mapped to 0
    predicted_values_semeval_labels = np.isin(test_predicted_values, [0, 4]).astype(float)
    
    # accuracy and F1 score of semeval predictions 
    test_accuracy = (predicted_values_semeval_labels == true_labels).sum()/len(predicted_values_semeval_labels)
    test_f1_score = f1_score(true_labels, predicted_values_semeval_labels)
    print(f'Accuracy: {test_accuracy:.4}   F1-Score: {test_f1_score:.4}')
    results_per_run[0,run-1] = test_accuracy
    results_per_run[1,run-1] = test_f1_score
    


Done. Took 2e+01 seconds
Accuracy: 0.6698   F1-Score: 0.5535
Done. Took 2e+01 seconds
Accuracy: 0.6574   F1-Score: 0.5641
Done. Took 2e+01 seconds
Accuracy: 0.6946   F1-Score: 0.6175


In [18]:
# Mean over the runs (columns); index 0 holds the accuracy, index 1 the F1 score.
average_results = results_per_run.mean(axis=1)
mean_accuracy, mean_f1 = average_results
print(f'Acc: {mean_accuracy:.4} and F1: {mean_f1:.4} of {file_name}')

Acc: 0.6739 and F1: 0.5783 of allsides_aggregators_tabloids_duplicates_removed


In [19]:
# creating random predictions according to true class distribution
# (labels are moved to the CPU once; class shares are computed by value, not by position
# in np.unique's output, so they stay correct even if one class is absent)
semeval_labels = bias_semeval.cpu().numpy()
prob_non_partisan = (semeval_labels == 0).sum()/len(semeval_labels)
prob_hyperpartisan = (semeval_labels == 1).sum()/len(semeval_labels)

# row 0: accuracy, row 1: F1 score; one column per random run
random_results_per_run = np.zeros((2,3))

for i in range(3):
    random_predictions = np.random.choice([0,1], size=len(semeval_labels), p=[prob_non_partisan, prob_hyperpartisan])
    # get results for random predictions 
    # (normalized by the number of labels, independent of variables from other cells)
    random_accuracy = (random_predictions==semeval_labels).sum()/len(semeval_labels)
    random_f1_score = f1_score(semeval_labels, random_predictions)
    print(f'Accuracy: {random_accuracy:.4}   F1-Score: {random_f1_score:.4}')
    random_results_per_run[0, i] = random_accuracy
    random_results_per_run[1, i] = random_f1_score
    
average_random_results = np.mean(random_results_per_run, axis=1)
print(f'Acc: {average_random_results[0]:.4} and F1: {average_random_results[1]:.4} of random runs')

Accuracy: 0.4946   F1-Score: 0.3122
Accuracy: 0.5178   F1-Score: 0.3561
Accuracy: 0.5349   F1-Score: 0.3534
Acc: 0.5158 and F1: 0.3406 of random runs
