## Media bias prediction model

### Setting up environment

In [2]:
##### Importing packages
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score

In [1]:
from apex import amp

In [0]:
##### Choosing working directory
#os.chdir('/content/gdrive/My Drive/thesis')

In [3]:
### Getting GPU type
# Query the device name only after confirming CUDA is usable; asking for
# device 0 on a machine without a GPU raises instead of reaching the else branch.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print('Is available')
else:
    print('is not available')

Tesla T4
Is available


### Loading and preparing data

In [4]:
##### Loading tensors
# Labels (used as classification targets by the training/validation functions)
bias_train, bias_val = torch.load('bias_train.pt'), torch.load('bias_val.pt')
# Token id tensors (passed to BERT as input_ids)
text_train, text_val = torch.load('contents_text_train.pt'), torch.load('contents_text_val.pt')
# Attention masks (passed to BERT as attention_mask)
mask_train, mask_val = torch.load('contents_mask_train.pt'), torch.load('contents_mask_val.pt')

# Alternative inputs with the source removed from the text; uncomment to use them instead.
# text_train = torch.load('contents_text_source_removed_train.pt')
# text_val = torch.load('contents_text_source_removed_val.pt')
# mask_train = torch.load('contents_mask_source_removed_train.pt')
# mask_val = torch.load('contents_mask_source_removed_val.pt')

In [5]:
##### Creating training and validation sets for pytorch models
# Each dataset item is a (text, mask, bias) triple, in that order.
train_set, val_set = (
    TensorDataset(*tensors)
    for tensors in (
        (text_train, mask_train, bias_train),
        (text_val, mask_val, bias_val),
    )
)


### Model Class

In [6]:
##### Create Model class

class Model(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        hidden_size: Size of the pooled BERT output; input width of the classifier layer.
        num_labels: Number of classes; output width of the classifier layer.
        droput_prob: Dropout probability applied to the pooled output. The
            (misspelled) parameter name is kept so existing callers keep working.
        bert_model_module: Module called as ``bert(input_ids=..., attention_mask=...)``.
            Its output is indexed: item 1 is used as the pooled output and, when
            ``output_attentions`` is True, item 2 is returned as the attentions.
        output_attentions: If True, ``forward`` returns ``(logits, bert_out[2])``
            instead of only the logits.
    """

    def __init__(self, hidden_size, num_labels, droput_prob, bert_model_module, output_attentions=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.output_attentions = output_attentions

        self.bert = bert_model_module
        # Use the constructor argument; previously this read a notebook-level
        # global named `dropout_prob` and ignored the value passed in.
        self.dropout = nn.Dropout(droput_prob)
        # The values are initialized from U(-sqrt(k), sqrt(k)), where k = 1/in_features
        self.classifier_layer = nn.Linear(hidden_size, num_labels)

    def forward(self, text, mask):
        """Return class logits for a batch (and the attentions if enabled).

        Args:
            text: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.

        Returns:
            The classifier output, or the tuple ``(classifier output, bert_out[2])``
            when ``self.output_attentions`` is True.
        """
        # token_type_ids and position_ids are created automatically
        bert_out = self.bert(input_ids=text, attention_mask=mask)
        # Second BERT output: pooled representation of the sequence
        pooled_out = bert_out[1]
        # Applying dropout
        pooled_out = self.dropout(pooled_out)
        # Add classifier
        out = self.classifier_layer(pooled_out)
        # Also return the attention layer outputs if requested
        if self.output_attentions:
            out = out, bert_out[2]

        return out

### Train Function

In [7]:
##### Function for training of 1 epoch

def train_fct(train_set, batch_size, batch_feedback = 500, first_check = 10, mixed_precision = False):
    """Train the notebook-level ``model`` for one epoch and report metrics.

    Relies on the notebook globals ``model``, ``optimizer``, ``loss_fct`` and
    ``device`` (and ``amp`` when ``mixed_precision`` is True).

    Args:
        train_set: Dataset yielding ``(text, mask, label)`` triples.
        batch_size: Number of samples per batch.
        batch_feedback: Print progress every this many batches.
        first_check: Additionally print progress once after this many batches.
        mixed_precision: If True, backpropagate through ``amp.scale_loss``.

    Returns:
        Tuple ``(average loss, accuracy, macro precision, macro recall, macro F1)``
        over all samples seen in the epoch.
    """
    start_time = time.time()
    # Setting model to train mode (so dropout is applied)
    model.train()
    # creating iterable dataset divided into batches and shuffled
    data = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # tracking batches, samples, loss, accuracy
    batch_counter = 0
    samples_seen = 0
    train_loss = 0  # sum of per-sample losses (batch mean * batch length)
    train_correctly_specified = 0
    train_predicted_values = []
    train_true_values = []
    # looping over batches
    for text, mask, label in data:
        # sending tensors to GPU
        text, mask, label = text.to(device), mask.to(device), label.to(device)
        # clearing gradients
        optimizer.zero_grad()
        # run through model
        output = model(text, mask)
        # calculating loss
        loss = loss_fct(output, label)
        # backpropagation
        if mixed_precision:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # updating weights
        optimizer.step()

        # loss and metric measures; the batch loss is weighted by the actual
        # batch length so a smaller final batch does not distort the averages
        # (assumes loss_fct returns the batch mean, as nn.CrossEntropyLoss() does by default)
        current_batch_size = label.size(0)
        predictions = output.argmax(1)
        train_loss += loss.item() * current_batch_size
        train_correctly_specified += (predictions == label).sum().item()
        samples_seen += current_batch_size

        train_predicted_values.append(predictions)
        train_true_values.append(label)

        # adding to batchcounter
        batch_counter += 1

        if (batch_counter % batch_feedback == 0) or (batch_counter == first_check):
            time_so_far = time.time() - start_time
            minutes = time_so_far // 60
            seconds = time_so_far % 60
            average_progress_loss = train_loss/samples_seen
            progress_acc = train_correctly_specified/samples_seen
            print('---------------------------------------')
            print('%d batches done after %d min %d sec'%(batch_counter,minutes,seconds))
            print('---------------------------------------')
            print('loss: %.4f \t|\tacc: %.4f'%(average_progress_loss, progress_acc))
            print('---------------------------------------')

    # loss (per-sample average over the epoch)
    average_total_loss = train_loss/samples_seen
    # accuracy
    total_accuracy = train_correctly_specified/samples_seen
    # Predicted and true values
    train_predicted_values = torch.cat(train_predicted_values).cpu().numpy()
    train_true_values = torch.cat(train_true_values).cpu().numpy()
    # Precision
    train_precision = precision_score(train_true_values, train_predicted_values, average='macro')
    # Recall
    train_recall = recall_score(train_true_values, train_predicted_values, average='macro')
    # F1 score
    train_f1_score = f1_score(train_true_values, train_predicted_values, average='macro')

    return average_total_loss, total_accuracy, train_precision, train_recall, train_f1_score


### Validation Function

In [8]:
##### Function for validation after 1 epoch of training

def val_fct(val_set, batch_size):
    """Evaluate the notebook-level ``model`` on ``val_set`` without gradient tracking.

    Relies on the notebook globals ``model``, ``loss_fct`` and ``device``.

    Args:
        val_set: Dataset yielding ``(text, mask, label)`` triples.
        batch_size: Number of samples per batch.

    Returns:
        Tuple ``(average loss, accuracy, macro precision, macro recall, macro F1)``.
    """
    # Setting model to evaluation mode (dropout is not applied)
    model.eval()
    # creating iterable dataset divided into batches, not shuffled
    data = DataLoader(val_set, batch_size = batch_size)
    # setting up loss and prediction trackers
    val_loss = 0  # sum of per-sample losses (batch mean * batch length)
    samples_seen = 0
    val_predicted_values = []
    val_true_values = []
    # no gradient calculation during validation
    with torch.no_grad():
        # looping over batches
        for text, mask, label in data:
            text, mask, label = text.to(device), mask.to(device), label.to(device)
            output = model(text, mask)
            loss = loss_fct(output, label)

            # weight by the actual batch length so a smaller final batch does not
            # distort the average (assumes loss_fct returns the batch mean)
            current_batch_size = label.size(0)
            val_loss += loss.item() * current_batch_size
            samples_seen += current_batch_size
            val_predicted_values.append(output.argmax(1))
            val_true_values.append(label)

    # loss (per-sample average)
    average_val_loss = val_loss/samples_seen
    # true and predicted values
    val_predicted_values = torch.cat(val_predicted_values).cpu().numpy()
    val_true_values = torch.cat(val_true_values).cpu().numpy()
    # Accuracy
    val_accuracy = (val_predicted_values == val_true_values).sum().item()/samples_seen
    # Precision
    val_precision = precision_score(val_true_values, val_predicted_values, average='macro')
    # Recall
    val_recall = recall_score(val_true_values, val_predicted_values, average='macro')
    # F1 score
    val_f1_score = f1_score(val_true_values, val_predicted_values, average='macro')

    return average_val_loss, val_accuracy, val_precision, val_recall, val_f1_score
    

### Preparing Model

In [9]:
##### Loading Bert
BertModel = transformers.BertModel
# BertTokenizer = transformers.BertTokenizer
# bert_pretrained_weights = 'bert-base-uncased'

### Device to run model on: GPU when one is available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Model inputs: classifier input width, number of classes, dropout probability
hidden_size, num_labels, dropout_prob = 768, 3, 0.1

### Batch size for training and validation
batch_size = 16

In [10]:
##### Initialize and configure Bert
bert_model = BertModel.from_pretrained('bert-base-uncased')

##### Initialize model and move it to the selected device
model = Model(hidden_size, num_labels, dropout_prob, bert_model)
model = model.to(device)

### Optimizer, choosing learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

### Applying mixed precision to speed up model training
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

### Loss function
loss_fct = nn.CrossEntropyLoss()
loss_fct = loss_fct.to(device)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


### Model Training

In [11]:
### Choosing number of epochs to train
num_epochs = 1

### One history list per tracked metric, for training and validation
train_loss_list, val_loss_list = [], []
train_acc_list, val_acc_list = [], []
train_precision_list, val_precision_list = [], []
train_recall_list, val_recall_list = [], []
train_f1_list, val_f1_list = [], []

for epoch in range(num_epochs):
    epoch_start_time = time.time()
    # Training for 1 epoch
    train_loss, train_acc, train_precision, train_recall, train_f1_score = train_fct(
        train_set,
        batch_size,
        batch_feedback=2000,
        first_check=100,
        mixed_precision=True,
    )
    # Validation
    val_loss, val_acc, val_precision, val_recall, val_f1_score = val_fct(val_set, batch_size)

    # Append this epoch's value to each metric history
    for history, value in (
        (train_loss_list, train_loss),
        (val_loss_list, val_loss),
        (train_acc_list, train_acc),
        (val_acc_list, val_acc),
        (train_precision_list, train_precision),
        (train_recall_list, train_recall),
        (val_precision_list, val_precision),
        (val_recall_list, val_recall),
        (train_f1_list, train_f1_score),
        (val_f1_list, val_f1_score),
    ):
        history.append(value)

    # Wall-clock duration of the epoch, split into whole minutes and remaining seconds
    end = time.time() - epoch_start_time
    minutes, seconds = divmod(end, 60)

    print('Epoch: %d took %d min, %d sec' %((epoch + 1), minutes, seconds))
    print('(Training)   Loss: %.4f  |  Acc: %.4f  |  F1: %.4f' %(train_loss, train_acc, train_f1_score))
    print('(Validation) Loss: %.4f  |  Acc: %.4f  |  F1: %.4f' %(val_loss, val_acc, val_f1_score))


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
---------------------------------------
100 batches done after 1 min 14 sec
---------------------------------------
loss: 0.9439 	|	acc: 0.5175
---------------------------------------
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
---------------------------------------
1000 batches done after 12 min 25 sec
---------------------------------------
loss: 0.7179 	|	acc: 0.6665
---------------------------------------
---------------------------------------
2000 batches done after 24 min 51 sec
---------------------------------------
loss: 0.6093 	|	acc: 0.7255
---------------------------------------
---------------------------------------
3000 batches done after 37 min 17 sec
---------------------------------------
loss: 0.5484 	|	acc: 0.7570
---------------------------------------
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
------------------

From the GitHub Discussion regarding gradient overflow: "Occasionally seeing a message like “overflow detected, skipping step, reducing loss scale” is normal behavior with dynamic loss scaling, and it usually happens in the first few iterations because Amp begins by trying a high loss scale."

In [12]:
### Results
# Header row, then one row of metric histories for training and one for validation
print('\t Loss \t\t Accuracy \t\t Precision \t\tRecall\t\t F1 Score')
for metric_lists in (
    (train_loss_list, train_acc_list, train_precision_list, train_recall_list, train_f1_list),
    (val_loss_list, val_acc_list, val_precision_list, val_recall_list, val_f1_list),
):
    print(*metric_lists)

##### Colab
### source removed, lr 0.00001
# Epoch 1 validation: 0.020134240149200136 0.8713280926768722 0.8558285658209931 0.8843389771463251 0.8677143693730516

# Epoch 2 training: [0.01456093337804093] [0.9087265330046373] [0.9054350576361471] [0.9043001074317857] [0.904841879952636]
# Epoch 2 validation: [0.0153729185697558] [0.9068749138049924] [0.9036977666780507] [0.9054265988915527] [0.9044335065693452]
# Epoch 3 training:[0.16084885144930583] [0.9383738169531264] [0.9353458111355638] [0.935262652754551] [0.9352988023156548]
# Epoch 3 validation: [0.24885340416995333] [0.9111846641842505] [0.900652469742982] [0.9150473003246713] [0.9074495190199744]
# Epoch 4 training: [0.11022477680434647] [0.9591945799644871] [0.9561934008096636] [0.9566272381119848] [0.9564094295953569]
# Epoch 4 validation:[0.27568220538593885] [0.9179768307819611] [0.9112980287133338] [0.9177220353054366] [0.9143656261644293]
# Epoch 5 training:[0.07788220056710543] [0.9718827038116089] [0.9690285752575499] [0.9698152477910802] [0.9694206627426398]
# Epoch 5 validation:[0.2727730815459278] [0.9158047165908151] [0.9057021897800093] [0.9187037798215177] [0.9118923921686556]

### train without, val with source in text
# Epoch 1 training: [0.3971200964916292] [0.8324641853569397] [0.828299376800027] [0.8212768603546242] [0.8245316879281454]
# Epoch 1 validation:[0.4229980396116344] [0.8383671217763067] [0.8349940151450616] [0.828457246274632] [0.8306645900549009]

### lr=0.00002
# Epoch 1 training: [0.37530845116653017] [0.8436223214439637] [0.8381188959089543] [0.8349232930591916] [0.8363652803688476]
# Epoch 2 validation: [0.2957536905301125] [0.8842918218176803] [0.8621790465931204] [0.8923710220376032] [0.875162090160778]

##### GCP
### lr=0.00002, T4, Apex
# Epoch 1 training: [0.38075399172641655] [0.8413855224369473] [0.8365040967251182] [0.8332343787158448] [0.8346977454141338]
# Epoch 1 validation: [0.28176233467110734] [0.8876706661150187] [0.8709227602050201] [0.8956481279037334] [0.8821069317241093]

	 Loss 		 Accuracy 		 Precision 		Recall		 F1 Score
[0.38075399172641655] [0.8413855224369473] [0.8365040967251182] [0.8332343787158448] [0.8346977454141338]
[0.28176233467110734] [0.8876706661150187] [0.8709227602050201] [0.8956481279037334] [0.8821069317241093]


### Saving/Loading Model

#### Without Apex

In [0]:
### Saving model weights
# torch.save(model.state_dict(), 'weights/model_weights_epoch1_lr0.00002.pt')

In [0]:
### Loading model weights
# model.load_state_dict(torch.load('weights/model_weights_train_no_source_val_with_source_epoch1.pt'))

<All keys matched successfully>

#### With Apex

In [13]:
### Saving checkpoint
# Bundle model, optimizer and amp state so training can be resumed later.
checkpoint = {'model': model.state_dict(),
              'optimizer': optimizer.state_dict(),
              'amp': amp.state_dict()}

# torch.save does not create missing directories, so make sure the target exists first.
os.makedirs('weights', exist_ok=True)
torch.save(checkpoint, 'weights/amp_checkpoint.pt')

In [None]:
### Restoring
# Reads the checkpoint dict written by the "Saving checkpoint" cell
# (keys: 'model', 'optimizer', 'amp').
checkpoint = torch.load('weights/amp_checkpoint.pt')

# NOTE(review): `model` and `optimizer` were already passed through amp.initialize
# when the model was set up; running this cell in the same kernel initializes them
# a second time. Presumably this cell is meant for a fresh session in which model
# and optimizer were just re-created -- TODO confirm.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
# Restore the saved model, optimizer and amp state from the checkpoint.
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
amp.load_state_dict(checkpoint['amp'])

In [None]:
### Stopping instance
! gcloud compute instances stop t4-instance --zone=europe-west4-c

### Testing validation adjustments

In [0]:
###### Testing different things in validation
model.eval()
data = DataLoader(val_set, batch_size = batch_size)
test_loss = 0  # sum of the per-batch losses
test_predicted_values = []
test_true_values = []
# Raw model outputs per batch. Collected in a fresh list on every run so that
# re-running the cell never appends to values left over in the kernel.
test_output_batches = []

# no gradient calculation during validation
with torch.no_grad():
    for text, mask, label in data:
        text, mask, label = text.to(device), mask.to(device), label.to(device)

        output = model(text, mask)
        loss = loss_fct(output, label)

        test_output_batches.append(output)
        test_loss += loss.item()
        test_predicted_values.append(output.argmax(1))
        test_true_values.append(label)

# One concatenation after the loop instead of growing a tensor batch by batch
test_probabilities = torch.cat(test_output_batches)


In [0]:
##### For Testing of validation
# true and predicted values: flatten the per-batch tensors into numpy arrays.
# New names are used so the per-batch lists stay intact and this cell can be re-run.
test_predicted_np = torch.cat(test_predicted_values).cpu().numpy()
test_true_np = torch.cat(test_true_values).cpu().numpy()
# loss: test_loss is a sum of per-batch losses, so divide by the number of batches
# (one list entry per batch) rather than the fractional len(val_set)/batch_size
average_test_loss = test_loss/len(test_true_values)
# Accuracy
test_accuracy = (test_predicted_np == test_true_np).sum().item()/len(test_true_np)
# Precision
test_precision = precision_score(test_true_np, test_predicted_np, average='macro')
# Recall
test_recall = recall_score(test_true_np, test_predicted_np, average='macro')
# F1 score
test_f1_score = f1_score(test_true_np, test_predicted_np, average='macro')

# NOTE: this overwrites the raw model outputs with class probabilities, so it must
# run exactly once per run of the loop cell above.
test_probabilities = F.softmax(test_probabilities, dim=1).cpu()

In [0]:
# Average loss, accuracy and macro F1 of the validation test run
print(average_test_loss, test_accuracy, test_f1_score)

0.26215972163737716 0.899669011170873 0.895641877338377


### Exploring and testing model results

In [0]:
# Loading and initializing the tokenizer that matches the pretrained weights
BertTokenizer = transformers.BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




#### Having a look at attention layers
For this to work, `output_attentions=True` must be set both when loading the pretrained BERT weights and when initializing our own model.

In [0]:
### Load model weights with configuration to output attentions
# Keep the weights of the current model: rebinding `model` below would otherwise
# replace the fine-tuned network with pretrained BERT plus a randomly initialized
# classifier layer, and the attention analysis would run on an untrained head.
trained_state_dict = model.state_dict()

bert_model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)

##### Initialize model version that handles attention output
model = Model(hidden_size, num_labels, dropout_prob, bert_model, output_attentions=True).to(device)
model.load_state_dict(trained_state_dict)
# NOTE: `optimizer` still refers to the parameters of the previous model object;
# this model is meant for inspection only, not for further training.


In [0]:
##### Using code from val_fct and adjust it to attention output
model.eval()
data = DataLoader(val_set, batch_size = batch_size)
val_loss = 0
val_predicted_values, val_true_values = [], []

for text, mask, label in data:
    text = text.to(device)
    mask = mask.to(device)
    label = label.to(device)

    with torch.no_grad():
        # the model returns the predictions together with the attention layers
        output = model(text, mask)
        model_output, attentions = output[0], output[1]

        loss = loss_fct(model_output, label)

        val_loss += loss.item()
        val_predicted_values.append(model_output.argmax(1))
        val_true_values.append(label)

    ### Only the attentions of the first batch are needed
    break


In [0]:
### Tokens of the first sequence in the batch
first_sequence_ids = text[0].cpu()
att_tokens = np.array(bert_tokenizer.convert_ids_to_tokens(first_sequence_ids))
### extract a single attention weight matrix
attentions[0][0, 0].cpu() # [layer?][batch,head,sequence,sequence]

#### Sources

In [0]:
# Show digits without scientific display.
# This changes numpy's global print options, so it affects all arrays printed afterwards.
np.set_printoptions(suppress=True)

In [0]:
# Loading sources of the validation set.
# Presumably one source id per validation article, encoded with the source dictionaries below -- TODO confirm.
source_val = np.load('source_val.npy')

In [0]:
# Dictionaries for source conversion
# Source names in id order; the position in this tuple is the source id.
_SOURCE_NAMES = (
    'Addicting Info', 'Al Jazeera', 'Alternet', 'BBC', 'Bearing Arms',
    'Bipartisan Report', 'Breitbart', 'Business Insider', 'CNBC', 'CNN',
    'CNS News', 'Crooks and Liars', 'DC Gazette', 'Daily Beast', 'Daily Kos',
    'Daily Mail', 'Daily Signal', 'Daily Stormer', 'Drudge Report', 'Feministing Blog',
    'FiveThirtyEight', 'Foreign Policy', 'Forward Progessives', 'Fox News', 'Freedom Daily',
    'FrontPage Magazine', 'Hot Air', 'Infowars', 'Investors Business Daily', 'LewRockwell',
    'MSNBC', 'Media Matters for America', 'MotherJones', 'NPR', 'National Review',
    'New York Daily News', 'New York Post', 'New Yorker', 'Newswars', 'Newsweek',
    'PBS', 'Palmer Report', 'Pamela Geller Report', 'Pink News UK', 'Politico',
    'Politicus USA', 'Pravada Report', 'Raw Story', 'Real Clear Politics', 'RedState',
    'Reuters', 'Salon', 'Shadow Proof', 'Shareblue', 'Slate',
    'Talking Points Memo', 'Telesur TV', 'The D.C. Clothesline', 'The Daily Caller', 'The Daily Express',
    'The Daily Mirror', 'The Daily Record', 'The Duran', 'The Gateway Pundit', 'The Hill',
    'The Huffington Post', 'The Intercept', 'The Michelle Malkin Blog', 'The Political Insider', 'The Right Scoop',
    'The Sun', 'The Washington Examiner', 'TheBlaze', 'ThinkProgress', 'True Activist',
    'USA Today', 'Vox', 'Washington Monthly', 'Western Journal', 'Yahoo News',
)
# name -> id
source_dict = {name: source_id for source_id, name in enumerate(_SOURCE_NAMES)}
# id -> name
source_dict_inverse = dict(enumerate(_SOURCE_NAMES))


In [0]:
# Conversion of texts from ids to tokens
# NOTE(review): the argument is a literal placeholder string, not a sequence of token ids;
# presumably it is meant to be replaced with real ids (e.g. one row of a text tensor) before running -- TODO confirm.
bert_tokenizer.convert_ids_to_tokens('Sequence of Tokens')

#### Predicting on other articles (copy-paste)

In [0]:
##### Testing other articles
new_article = 'Donald Trump once lied again! It is incredible how stupid that man is and the similarly stupid GOP keeps backing him. Republican Senator Lindsey Graham said "The Democrats have no idea what they are talking about. '

# Encode the article; the result is a dictionary holding the encoded text and its mask
new_article_dict = bert_tokenizer.encode_plus(
    new_article,
    max_length=500,
    pad_to_max_length=True,
    return_tensors='pt',
    return_token_type_ids=False,
)
# Move both tensors to the device the model runs on
new_text = new_article_dict['input_ids'].to(device)
new_mask = new_article_dict['attention_mask'].to(device)

In [0]:
# Apply model to new article
# Make sure dropout is disabled so the prediction is deterministic
model.eval()
with torch.no_grad():
    output = model(new_text, new_mask)

# A model built with output_attentions=True returns (logits, attentions);
# only the logits are needed for the class probabilities.
logits = output[0] if isinstance(output, tuple) else output

probabilities = F.softmax(logits, dim=1).cpu().numpy().reshape(-1)*100
print('left: %.2f%% center: %.2f%% right: %.2f%%'% (probabilities[0], probabilities[1], probabilities[2]))

left: 95.33% center: 0.30% right: 4.36%
