In [1]:
import torch

# verify if GPU is available
if torch.cuda.is_available(): 
    
    device = torch.device("cuda")    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
else:
    
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [5]:
import pandas as pd
import mysql.connector
import os
from dotenv import load_dotenv



# initialize variables

load_dotenv(override=True)

# mysql credentials
PASSWORD = os.getenv("PASSWORD")
USER = os.getenv("USER")


# df = pd.read_csv("privacy.csv") # REPLACE with data from database (title + content)
# print('Number of training sentences: {:,}\n'.format(df.shape[0]))
# df.sample(10)

In [6]:
# connect to db
import mysql.connector

mydb = mysql.connector.connect(
  host="127.0.0.1",
  user=USER,
  password=PASSWORD,
  database="mpp21"
)

mycursor = mydb.cursor()

In [8]:
# import DATA
# read newspaper info 
import csv

newspapers = {}

with open('./newspapers-collected.csv', 'r') as f:
        reader = csv.reader(f)
        for r in reader:
            newspapers[r[0]] = r[1]

In [19]:
article_ids = {}

with open('./labels.csv', 'r') as f:
        reader = csv.reader(f)
        for r in reader:
            article_ids[r[0]] = r[1]
            
del article_ids["aid"]

In [10]:
df = pd.read_csv("labels.csv") # REPLACE with data from database (title + content)
print('Number of training sentences: {:,}\n'.format(df.shape[0]))
df.sample(10)

Number of training sentences: 433



Unnamed: 0,aid,label
28,TDP_1592,0
17,NYT_1547,0
267,AFR_1961,1
247,USA_1111,0
159,NZH_1826,0
317,TDP_1379,1
366,NYT_1307,1
199,TDP_1129,0
351,DT_2440,1
45,AFR_1277,0


In [37]:
# TEST IF ONLY ABSTRACT
# db methods
def select_article(curs, db, aid):
    
    curs.execute("""SELECT title, content 
                FROM articles 
                WHERE article_id = '""" + aid + """' 
                """)
                #LIMIT 200""") 
    
    arts = {}
    result = curs.fetchall()
    for r in result:
        arts[aid] = {
            "title": r[0],
            "content": r[1],
            "verdict": article_ids[aid]
        }
    
    return arts[aid]

In [218]:
contents = {}

for a in article_ids.keys():
    temp = select_article(mycursor, mydb, a)
    
    contents[a] = {
        "id": a,
        "label": int(temp["verdict"]),
        "text": temp["title"] + " " + temp["content"]
    }


In [219]:
df = pd.DataFrame.from_dict(contents).T

In [220]:
df.sample(10)

Unnamed: 0,id,label,text
USA_1653,USA_1653,0,Clinton team acknowledges missteps;Campaign vo...
DT_1866,DT_1866,0,"Whatever your new year's resolution, there's a..."
NYT_1712,NYT_1712,1,Why surveillance doesn't faze Britain FULL TEX...
AFR_1702,AFR_1702,0,'I didn't want to be a battery hen';Co-working...
TS_1389,TS_1389,0,Lost your cellphone? Check the coffee shop It ...
USA_1476,USA_1476,0,"Rubio, Cruz battle to be alternative to Trump;..."
SMH_1505,SMH_1505,0,"Almost cannonball ruin, but Tate finally retur..."
NYT_1238,NYT_1238,1,Dr. Seuss offers a take on privacy;Disruptions...
NYT_1495,NYT_1495,1,"Close the back doors FULL TEXTIn 2006, a feder..."
USA_1600,USA_1600,1,New rules on wellness programs will let employ...


In [221]:
print('Number of training sentences: {:,}\n'.format(df.shape[0]))


Number of training sentences: 433



In [222]:
sentences = df.text.values
labels = df.label.values

In [223]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [224]:
### TODO: get head + tail of article, or first 512 words

# tokenize sentences

input_ids = []

for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'                        
                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        max_length = 512,#64,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    
    input_ids.append(encoded_sent)
    
print("Sample encoding:")
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

print()
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Sample encoding:
Original:  Fresh twist on British elegance A $30 million renovation at The Langham will replace the chintz with plantation shutters - but many of the original Georgian features loved by guests will stay, writes Samantha Hutchinson.When the BBC spent a year filming behind the scenes at a vaunted hotel in Mayfair, viewers were introduced to some its most enduring guests, many of whom loved the hotel so much they would live in its suites for months on end.While the practice of taking up residence for months at a time may not be a tradition at many of Sydney's coterie of five-star hotels, The Langham reckons it has forged a reputation as a luxury traveller's home away from home.Even at breakfast on a morning in early June, the home-style feeling is on show in the dining room when a guest strolls to the servery in a pair of chequered cashmere pyjama pants and leather and wool slippers."We've always had a reputation for being a little bit English, and we've also had a reputa

In [225]:
from keras.preprocessing.sequence import pad_sequences

# set max sequence length
MAX_LEN = 512#64

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# pad input tokens with value 0 at the end ("post")
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")
print('\nDone.')


Padding/truncating all sentences to 512 values...

Padding token: "[PAD]", ID: 0

Done.


In [226]:
# create attention masks
attention_masks = []

for sent in input_ids:
    
    # tokenID == 0 => padding, mask = 0
    # tokenID > 0 => real token, mask = 1
    att_mask = [int(token_id > 0) for token_id in sent]
    
    attention_masks.append(att_mask)

In [227]:
# split data into train and validation sets
from sklearn.model_selection import train_test_split

# 0.9 training, 0.1 validation

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
# analogously for the masks
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
                                             random_state=2018, test_size=0.1)

In [228]:
import numpy as np

In [229]:
x = []
y = []

for i in train_labels:
    x.append(i)
    
for j in validation_labels:
    y.append(j)
    
x = np.array(x)
y = np.array(y)

In [230]:
# convert inputs and labels into torch tensors

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

#tensor = torch.from_numpy(array.astype(np.uint8))


train_labels = torch.tensor(x)#torch.from_numpy(train_labels.astype(np.uint8)))
validation_labels = torch.tensor(y)#torch.from_numpy(validation_labels.astype(np.uint8)))

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [231]:
# note changes above by adding torch.from_numpy(array.astype(np.uint8))

In [232]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# set batch size fo training; for fine-tuning BERT on a specific task, 16 or 32 is recommended
batch_size = 32

# DataLoader for training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# DataLoader for validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [233]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# load BertForSequenceClassification (pre-trained BERT model with a single linear classif. layer)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # 12-layer BERT model, uncased vocab (TODO: uncase articles)
    # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # number of output labels
                        # 2: binary classification
                        # >2: multi-class   
    output_attentions = False, # return attention weights
    output_hidden_states = False, # return hidden states
)

# run on GPU (uncomment if GPU available on server)
# model.cuda()

In [234]:
# get model's parameters
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [235]:
# AdamW 
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5,
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup

# training epochs: recommended 2-4
epochs = 4

# training steps = batches * epochs
total_steps = len(train_dataloader) * epochs

# create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            # default value in run_glue.py
                                            num_training_steps = total_steps)

In [236]:
import numpy as np

# compute accuracy of predictions
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [237]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
   
    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [238]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# store average loss after each epoch
loss_values = []

for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # full pass over the training set    
    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')    
    t0 = time.time() # measure epoch length
    total_loss = 0 # reset total loss for current epoch
    
    # place model in training mode
    model.train()    

    # for each batch
    for step, batch in enumerate(train_dataloader):        
        
        # updated on progress every 40 batches
        if step % 40 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0) # compute elapsed time
            
            # report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))        
            
        # unpack training batch from DataLoader
        # copy each tensor to GPU ('to' method)
        
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)        
        
        # clear previously calculated gradients before a backward pass
        model.zero_grad()                
        
        # forward pass (i.e., evaluate the model on this training batch)
        # this returns the loss (and not the model output), because we provided the `labels`

        # documentation for model function:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # get loss value
        loss = outputs[0]        
        
        # accumulate training loss over all batches (to compute average loss at the end)
        total_loss += loss.item()     
        
        # backward pass to calculate gradients
        loss.backward()        
        
        # clip the norm of the gradients to 1.0 to prevent the 'exploding gradients' problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)        
        
        # update parameters (the optimizer dictates the update rule based on gradients, learning rate, etc.)
        # take a step using the computed gradient
        optimizer.step() 
        
        # update the learning rate.
        scheduler.step()    
    
    # calculate average loss over training data
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # store the loss value to plot the learning curve
    loss_values.append(avg_train_loss)    
    
    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    

    # after each training epoch, measure performance on validation set
    print()
    print("Running Validation...")    
    t0 = time.time()   
    
    # place model in evaluation mode (dropout layers behave differently than during training)
    model.eval()    
    
    # tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0    
    
    # evaluate data for one epoch
    for batch in validation_dataloader:
        
        # add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # unpack inputs from DataLoader
        b_input_ids, b_input_mask, b_labels = batch
        
        
        # do not compute or store gradients to save memory and speedup validation
        with torch.no_grad():                    
            # forward pass, calculate logit predictions
                # will return logits (not the loss, because we have not provided labels)
            # token_type_ids = segment ids (differentiates between sentence 1 and 2 in 2-sentence tasks)
            
            
            # documentation for model
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # get logits (output) from model
        # values prior to applying an activation function (e.g., softmax)
        logits = outputs[0]    
        
        # move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # compute accuracy for current batch 
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # accumulate accuracy
        eval_accuracy += tmp_eval_accuracy        
        # track batches
        nb_eval_steps += 1    
        # report final accuracy for current validation run
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    print("")
print("Training complete!")


Training...

  Average training loss: 0.59
  Training epoch took: 0:07:52

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:14


Training...

  Average training loss: 0.43
  Training epoch took: 0:07:51

Running Validation...
  Accuracy: 0.88
  Validation took: 0:00:15


Training...

  Average training loss: 0.35
  Training epoch took: 0:08:07

Running Validation...
  Accuracy: 0.80
  Validation took: 0:00:14


Training...

  Average training loss: 0.30
  Training epoch took: 0:07:55

Running Validation...
  Accuracy: 0.91
  Validation took: 0:00:14

Training complete!


In [239]:
import plotly.express as px

f = pd.DataFrame(loss_values)
f.columns=['Loss']
f

Unnamed: 0,Loss
0,0.586277
1,0.433447
2,0.349306
3,0.304175


In [244]:
# VALIDATIOOOOOOOOON


validation_ids = {}

with open('./validation.csv', 'r') as f:
        reader = csv.reader(f)
        for r in reader:
            validation_ids[r[0]] = r[1]
            
del validation_ids["aid"]


In [247]:
# TEST IF ONLY ABSTRACT
# db methods
def select_article2(curs, db, aid):
    
    curs.execute("""SELECT title, content 
                FROM articles 
                WHERE article_id = '""" + aid + """' 
                """)
                #LIMIT 200""") 
    
    arts = {}
    result = curs.fetchall()
    for r in result:
        arts[aid] = {
            "title": r[0],
            "content": r[1],
            "verdict": validation_ids[aid]
        }
    
    return arts[aid]

In [248]:
validation_contents = {}

for a in validation_ids.keys():
    temp = select_article2(mycursor, mydb, a)
    
    validation_contents[a] = {
        "id": a,
        "label": int(temp["verdict"]),
        "text": temp["title"] + " " + temp["content"]
    }

In [249]:
validation_df = pd.DataFrame.from_dict(validation_contents).T

In [289]:
print('Number of test documents: {:,}\n'.format(validation_df.shape[0]))

Number of test documents: 6



In [290]:
import pandas as pd



sentences = validation_df.text.values #df
labels = validation_df.label.values

# tokenize documents
input_ids = []

# encode documents (as during training)
for sent in sentences:
    encoded_sent = tokenizer.encode(
                        sent,  
                        max_length = 512,#64,          # Truncate all sentences.

                        add_special_tokens = True, 
                   )
    
    input_ids.append(encoded_sent)

# pad inputs
input_ids = pad_sequences(input_ids, maxlen=512, #MAX_LEN 
                          dtype="long", truncating="post", padding="post")

# create attention masks
attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
print(attention_masks[0])
    
# convert to tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

z = []

for i in labels:
    z.append(i)

z = np.array(z)

prediction_labels = torch.tensor(z)#labels)

# set batch size.  
batch_size = 32  

# create the DataLoader
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

In [291]:
# prediction on test set

print('Predicting labels for {:,} test documents...'.format(len(prediction_inputs)))

# place model in evaluation mode
model.eval()

# tracking variables 
predictions , true_labels = [], []

# predict 
for batch in prediction_dataloader:
    # add batch to GPU (if available)
    batch = tuple(t.to(device) for t in batch)
  
    # unpack inputs from DataLoader
    b_input_ids, b_input_mask, b_labels = batch
  
    # don't compute/store gradients to save memory and speed up prediction

    with torch.no_grad():
        # forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask) 
        
    logits = outputs[0]  
    
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    
print('\nDONE.')

Predicting labels for 6 test documents...

DONE.


In [293]:
print('Privacy articles samples: %d of %d (%.2f%%)' % (validation_df.label.sum(), len(validation_df.label), (validation_df.label.sum() / len(validation_df.label) * 100.0)))

Privacy articles samples: 3 of 6 (50.00%)


In [294]:
from sklearn.metrics import matthews_corrcoef

matthews_set = []

# evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

# for each batch
for i in range(len(true_labels)):
    # predictions are a 2-column ndarray (one for 0, one for 1)
    # pick label with highest value and turn it into 1 or 0

    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
    # calculate and store coef for current batch
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

Calculating Matthews Corr. Coef. for each batch...


In [295]:
# combine the predictions for each batch into a single list of 0s and 1s

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# combine the correct labels for each batch into a single list
flat_true_labels = [item for sublist in true_labels for item in sublist]

# calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

MCC: 0.707


In [296]:
pred_labels_i

array([1, 1, 0, 0, 1, 1])

In [297]:
true_labels

[array([1, 0, 0, 0, 1, 1])]

In [254]:
# function to get privacy label
def is_privacy(d):
    
    # tokenize
    encoded_sent = tokenizer.encode(d, add_special_tokens = True,)
    
    # pad
    input_id = pad_sequences([encoded_sent], maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")
    #print(input_id)
    
    # create attention mask
    seq_mask = [float(i>0) for i in input_id[0]]

    # convert to tensors
    prediction_input = torch.tensor(input_id)
    prediction_mask = torch.tensor([seq_mask])
    output = model(prediction_input, token_type_ids=None, 
                      attention_mask=prediction_mask) 
        
    logits = output[0]  
    
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    
    # store predictions and true labels
    verdict = np.argmax(logits[0])

    return verdict


In [255]:
validation_contents.keys()

dict_keys(['NYT_3713', 'AFR_2248', 'NYT_3314', 'AFR_2251', 'AFR_2256', 'NYT_4136'])

In [281]:
test_a = "AFR_2251"

In [282]:
print(test_a)
print(validation_contents[test_a]["label"])

AFR_2251
0


In [283]:
article_verdict = is_privacy(validation_contents["AFR_2248"]["text"])
article_verdict

1