In [109]:
## Importing the libraries

import pandas as pd
from tqdm import tqdm
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
import argparse
import torch
import numpy as np
import torch.nn as nn
torch.cuda.empty_cache()


In [110]:
def make_dataframe(input_folder, labels_folder=None):
    #MAKE TXT DATAFRAME
    text = []
    
    for fil in tqdm(filter(lambda x: x.endswith('.txt'), os.listdir(input_folder))):

        iD, txt = fil[7:].split('.')[0], open(input_folder +fil, 'r', encoding='utf-8').read() 
        text.append((iD, txt))

    df_text = pd.DataFrame(text, columns=['id','text']).set_index('id')

    df = df_text

    #MAKE LABEL DATAFRAME
    if labels_folder:
        labels = pd.read_csv(labels_folder, sep='\t', header=None)
        labels = labels.rename(columns={0:'id',1:'type'})
        labels.id = labels.id.apply(str)
        labels = labels.set_index('id')

        #JOIN
        df = labels.join(df_text)[['text','type']]

    return df

   



## Load the dataset

In [111]:
## Making the dataframe

folder_train ="../input/semeval-data/train-articles-subtask-1/"
labels_train_fn ="../input/semeval-data/train-labels-subtask-1.txt"
folder_dev = "../input/semeval-data/dev-articles-subtask-1/"

 #Read Data
print('Loading training...')
train = make_dataframe(folder_train, labels_train_fn)
 
print('Loading dev...')
test = make_dataframe(folder_dev)


Loading training...


433it [00:00, 1542.52it/s]


Loading dev...


83it [00:00, 2098.87it/s]


In [112]:
## Example training text

print((train["text"][0]))

Chelsea Handler Admits She’s ‘Very Sexually Attracted to Robert Mueller’

Far-left comedienne Chelsea Handler has admitted she is “very sexually attracted” to FBI Special Counsel Robert Mueller, just hours after he concluded his investigations into supposed collusion between Russia and Donald Trump’s presidential campaign.
On Friday evening, the Justice Department announced that Mueller had concluded his investigations into the matter and that he would not be recommending any more indictments against Trump or anyone else involved in his campaign.
Yet despite the weight of disappointment felt by the majority on the Hollywood left, Chelsea Handler admitted that her obsession with the 74-year-old prosecutor may have been because she found him sexually attractive.
“If I’m being completely honest, I am very sexually attracted to Robert Mueller,” she wrote on Twitter.
“I know it’s not meant to be, but that doesn’t mean I won’t hang a poster of him above my bed.”
If I’m being completely hones

In [113]:
# Example length of article 

print(len(train["text"]))

433


In [114]:
# Import nltk library

import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [115]:
## Here are some puctuations to remove from the dataset


puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


## Clean the text 

def clean_text(x):
    x = str(x)

    for punct in puncts:
       if punct in x:
          x = x.replace(punct, f' {punct} ')

        
    return x

import re

# Remove the numbers from the dataset

def clean_numbers(x):
    if bool(re.search(r'\d', x)):
        x = re.sub('[0-9]{5,}', '#####', x)
        x = re.sub('[0-9]{4}', '####', x)
        x = re.sub('[0-9]{3}', '###', x)
        x = re.sub('[0-9]{2}', '##', x)
    return x    

mispell_dict = {"doesn’t":"does not","i’m":"i am","she’s":"she is","it’s":"it is","ain’t": "is not", "aren’t": "are not","can’t": "cannot", "’cause": "because", "could’ve": "could have", "couldn’t": "could not", "didn’t": "did not",  "doesn’t": "does not", "don’t": "do not", "hadn’t": "had not", "hasn’t": "has not", "haven’t": "have not", "he’d": "he would","he’ll": "he will", "he’s": "he is", "how’d": "how did", "how’d’y": "how do you", "how’ll": "how will", "how’s": "how is",  "I’d": "I would", "I’d've": "I would have", "I’ll": "I will", "I’ll've": "I will have","I’m": "I am", "I’ve": "I have", "i’d": "i would", "i’d’ve": "i would have", "i’ll": "i will",  "i’ll’ve": "i will have","i’m": "i am", "i’ve": "i have", "isn’t": "is not", "it’d": "it would", "it’d’ve": "it would have", "it’ll": "it will", "it’ll've": "it will have","it’s": "it is", "let’s": "let us", "ma’am": "madam", "mayn’t": "may not", "might’ve": "might have","mightn’t": "might not","mightn’’ve": "might not have", "must’ve": "must have", "mustn’t": "must not", "mustn’t’ve": "must not have", "needn’t": "need not", "needn’t’ve": "need not have","o’clock": "of the clock", "oughtn’t": "ought not", "oughtn’t’ve": "ought not have", "shan’t": "shall not", "sha’n’t": "shall not", "shan’t’ve": "shall not have", "she’d": "she would", "she’d’ve": "she would have", "she’ll": "she will", "she’ll’ve": "she will have", "she’s": "she is", "should’ve": "should have", "shouldn’t": "should not", "shouldn’t’ve": "should not have", "so’ve": "so have","so’s": "so as", "this’s": "this is","that’d": "that would", "that’d’ve": "that would have", "that’s": "that is", "there’d": "there would", "there’d’ve": "there would have", "there’s": "there is", "here’s": "here is","they’d": "they would", "they’d’ve": "they would have", "they’ll": "they will", "they’ll've": "they will have", "they’re": "they are", "they’ve": "they have", "to’ve": "to have", "wasn’t": "was not", "we’d": "we would", "we’d’ve": "we would have", "we’ll": "we will", "we’ll’ve": "we will have", "we’re": "we are", "we’ve": "we have", "weren’t": "were not", "what’ll": "what will", "what’ll’ve": "what will have", "what’re": "what are",  "what’s": "what is", "what’ve": "what have", "when’s": "when is", "when’ve": "when have", "where’d": "where did", "where’s": "where is", "where’ve": "where have", "who’ll": "who will", "who’ll’ve": "who will have", "who’s": "who is", "who’ve": "who have", "why’s": "why is", "why’ve": "why have", "will’ve": "will have", "won’t": "will not", "won’t've": "will not have", "would’ve": "would have", "wouldn’t": "would not", "wouldn’t’ve": "would not have", "y’all": "you all", "y’all’d": "you all would","y’all’d’ve": "you all would have","y’all’re": "you all are","y’all’ve": "you all have","you’d": "you would", "you’d’ve": "you would have", "you’ll": "you will", "you’ll’ve": "you will have", "you’re": "you are", "you’ve": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispellings, mispellings_re = _get_mispell(mispell_dict)

## Replace the typical misspelled words and shorthands(like can't= cannot)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)


# For removing punctuations
import string
string.punctuation

from nltk.stem import WordNetLemmatizer


## We lemmaatize the input text to transform similar words into one


#defining the function to remove punctuation
def remove_punc(text):
    wordnet_lemmatizer = WordNetLemmatizer() #defining the object for Lemmatization
    punctuationfree ="".join([i for i in text if i not in string.punctuation])
    punctuationfree = punctuationfree.replace('“', '')
    punctuationfree = punctuationfree.replace("‘", "")
    punctuationfree = punctuationfree.replace("’","")
    punctuationfree = punctuationfree.replace("”","")
    punctuationfree = punctuationfree.replace("—","")
    punctuationfree = punctuationfree.replace("\n"," ")
    return punctuationfree
    

In [116]:
print(mispellings_re)

re.compile("(doesn’t|i’m|she’s|it’s|ain’t|aren’t|can’t|’cause|could’ve|couldn’t|didn’t|don’t|hadn’t|hasn’t|haven’t|he’d|he’ll|he’s|how’d|how’d’y|how’ll|how’s|I’d|I’d've|I’ll|I’ll've|I’m|I’ve|i’d|i’d’ve|i’ll|i’ll)


In [117]:
## Preprocessing the training and test data 


## Convert all to lowercase
train["text"] = train["text"].apply(lambda x: x.lower())
test["text"] = test["text"].apply(lambda x: x.lower())

## Clean the numbers
train["text"] = train["text"].apply(lambda x: clean_numbers(x))
test["text"] = test["text"].apply(lambda x: clean_numbers(x))
    
# Clean spellings and remove short forms
train["text"] = train["text"].apply(lambda x: replace_typical_misspell(x))
test["text"] = test["text"].apply(lambda x: replace_typical_misspell(x))



In [118]:
#train["text"] = train["text"].apply(lambda x: clean_text(x))
#test["text"] = test["text"].apply(lambda x: clean_text(x))

## Remove punctuations
train["text"] = train["text"].apply(lambda x: remove_punc(x))
test["text"] = test["text"].apply(lambda x: remove_punc(x))


## fill up the missing values
X_train = train["text"].fillna("_##_").values
X_test = test["text"].fillna("_##_").values


In [119]:
## Example text after preprocessing

print(train["text"][0])

chelsea handler admits she is very sexually attracted to robert mueller  farleft comedienne chelsea handler has admitted she is very sexually attracted to fbi special counsel robert mueller just hours after he concluded his investigations into supposed collusion between russia and donald trumps presidential campaign on friday evening the justice department announced that mueller had concluded his investigations into the matter and that he would not be recommending any more indictments against trump or anyone else involved in his campaign yet despite the weight of disappointment felt by the majority on the hollywood left chelsea handler admitted that her obsession with the yearold prosecutor may have been because she found him sexually attractive if i am being completely honest i am very sexually attracted to robert mueller she wrote on twitter i know it is not meant to be but that does not mean i will not hang a poster of him above my bed if i am being completely honest i am very sexua

In [120]:
## Pip install the transformers library

!pip install transformers

[0m

In [121]:
## Defining the Bert model to be used for training

from transformers import BertTokenizer, BertModel

# Load pre-trained model tokenizer (vocabulary)
#tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

#from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')


## Initialize the tokenizer from the bert-base-uncased model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')




In [122]:
X_train=X_train.tolist()


In [123]:
type(X_train)

list

In [124]:

X_train[0]

'chelsea handler admits she is very sexually attracted to robert mueller  farleft comedienne chelsea handler has admitted she is very sexually attracted to fbi special counsel robert mueller just hours after he concluded his investigations into supposed collusion between russia and donald trumps presidential campaign on friday evening the justice department announced that mueller had concluded his investigations into the matter and that he would not be recommending any more indictments against trump or anyone else involved in his campaign yet despite the weight of disappointment felt by the majority on the hollywood left chelsea handler admitted that her obsession with the yearold prosecutor may have been because she found him sexually attractive if i am being completely honest i am very sexually attracted to robert mueller she wrote on twitter i know it is not meant to be but that does not mean i will not hang a poster of him above my bed if i am being completely honest i am very sexu

In [125]:
import nltk
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

## Lemmatize the text

X_train_new=[]

for sent in X_train:
  word_list = nltk.word_tokenize(sent)
  lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])
  X_train_new.append(lemmatized_output)


    

In [126]:
X_train_new[0]
X_train=X_train_new


## Tokenize the inputs by adding CLS, SEP, PAD





In [127]:
## Tokenize the sentences 

##(This adds padding to each of the sentences , adds a <CLS> token to the beginning of the text)

X_train_tokenized=tokenizer(X_train,padding=True,truncation=True,return_tensors="pt")


In [128]:
## Example of the token indices for an article

print((X_train_tokenized["input_ids"])[0])


tensor([  101,  9295, 28213, 14456,  2016,  2003,  2200, 12581,  6296,  2000,
         2728, 26774,  2521,  2571,  6199,  2272, 10265, 10087,  9295, 28213,
         5292,  4914,  2016,  2003,  2200, 12581,  6296,  2000,  8495,  2569,
         9517,  2728, 26774,  2074,  3178,  2044,  2002,  5531,  2010,  4812,
         2046,  4011,  8902, 24117,  2090,  3607,  1998,  6221,  8398,  4883,
         3049,  2006,  5958,  3944,  1996,  3425,  2533,  2623,  2008, 26774,
         2018,  5531,  2010,  4812,  2046,  1996,  3043,  1998,  2008,  2002,
         2052,  2025,  2022, 16755,  2075,  2151,  2062, 24265,  2114,  8398,
         2030,  3087,  2842,  2920,  1999,  2010,  3049,  2664,  2750,  1996,
         3635,  1997, 10520,  2371,  2011,  1996,  3484,  2006,  1996,  5365,
         2187,  9295, 28213,  4914,  2008,  2014, 17418,  2007,  1996,  2095,
        11614, 12478,  2089,  2031,  2042,  2138,  2016,  2179,  2032, 12581,
         8702,  2065,  1045,  2572,  2108,  3294,  7481,  1045, 

In [129]:
print((X_train_tokenized["input_ids"])[0].shape[0])

512


In [130]:
train_inputs=X_train_tokenized["input_ids"]

In [131]:
"""
model = BertModel.from_pretrained("bert-base-uncased")
outputs = model(**(X_train_tokenized))
"""

'\nmodel = BertModel.from_pretrained("bert-base-uncased")\noutputs = model(**(X_train_tokenized))\n'

In [132]:
## Example of the y-variable (Classes)

print(train["type"])

id
833042063       satire
832959523       satire
833039623       satire
833032367       satire
814777937       satire
               ...    
832908978    reporting
832910505       satire
832917532    reporting
832913653    reporting
832917778    reporting
Name: type, Length: 433, dtype: object


In [133]:
## Encode labels as 0,1,2 satire,opinion,reporting


from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
  
print(train['type'].unique())

# Encode labels in column 'type'.
train['type']= label_encoder.fit_transform(train['type'])
  
print(train['type'].unique())



['satire' 'opinion' 'reporting']
[2 0 1]


In [134]:
train_label=train["type"].values

In [135]:
train_label

array([2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [136]:
train_labels = torch.tensor(train_label)


In [137]:
train_labels

tensor([2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [138]:
train_masks=X_train_tokenized['attention_mask']


In [139]:
train_masks[0:5]

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [140]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#Creating the DataLoader which will help us to load data into the GPU/CPU
batch_size = 2

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


# The Bert-Base Model 

In this code we use the Pre-trained bert-base-model trained on English Wikepedia Texts to get some contextual representations of the tokens of the training set. The weights are also loaded from the pre-trained model and is fine-tuned to the specific task that we are trying to accomplish namely news type classification. We run training for about 8 epochs to get the classifier.The Bert model has 12 attention heads and 12 Attention layers. The <CLS> token is used for the classification task with a SoftMax layer attached to it. 

In [141]:
#Loading the pre-trained BERT model from huggingface library

from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import RobertaForSequenceClassification

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 


model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3,   
    output_attentions = False, 
    output_hidden_states = False, )
 
"""
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", 
    
    # Specify number of classes
    num_labels = 3, 
    # Whether the model returns attentions weights
    output_attentions = False,
    # Whether the model returns all hidden-states 
    output_hidden_states = False)
"""
    
  

# Telling the model to run on GPU
model.cuda()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [142]:
# AdamW is an optimizer which is a Adam Optimzier with weight-decay-fix
optimizer = AdamW(model.parameters(),
                  lr = 3e-5, 
                  eps = 1e-8
                )



In [143]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 8

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
#scheduler

In [144]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

    

In [145]:
#Creating the helper function to have a watch on elapsed time

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
    

In [146]:
import tensorflow as tf

# Checking for the GPU
device_name = tf.test.gpu_device_name()
print(device_name)



/device:GPU:0


2022-11-12 12:24:31.052056: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-12 12:24:31.052903: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-12 12:24:31.053475: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-12 12:24:31.054158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-12 12:24:31.054747: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [147]:
device = torch.device("cuda")

In [148]:
device

device(type='cuda')

## Training the Bert Model

In [149]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512


# Set the seed value all over the place to make this reproducible.
seed_val = 100

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Get the weight tensor for every class (0,1,2)
train_label_np=np.array(train_label)

no_0=np.sum(train_label_np==0)
no_1=np.sum(train_label_np==1)
no_2=np.sum(train_label_np==2)

max_no = max(no_0,no_1,no_2)

no_0 = max_no/no_0
no_1 = max_no/no_1       # Weights of each label for the loss function
no_2 = max_no/no_2

weight_list = [no_0,no_1,no_2]
#weight_list=[20,20,20]

weights = torch.tensor(weight_list)
weights=weights.float()
weights= weights.to(device)

# Store the average loss after each epoch so we can plot them.
loss_values = []


# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)


        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        logits = outputs.logits
        
        #print(logits.dtype)
        #print(b_labels.dtype)
        #print(weights.dtype)
       
        
        # Compute your own loss function(to include weights of the classes)
       
        #loss=nn.CrossEntropyLoss(weight=weights,reduction="mean")
        loss=outputs[0]
        
        #loss_value=loss(logits,b_labels)
        
        

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

        # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
        
print("")
print("Training complete!")   

env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512

Training...
  Batch    40  of    217.    Elapsed: 0:00:06.
  Batch    80  of    217.    Elapsed: 0:00:11.
  Batch   120  of    217.    Elapsed: 0:00:17.
  Batch   160  of    217.    Elapsed: 0:00:22.
  Batch   200  of    217.    Elapsed: 0:00:28.

  Average training loss: 0.63
  Training epoch took: 0:00:30

Training...
  Batch    40  of    217.    Elapsed: 0:00:06.
  Batch    80  of    217.    Elapsed: 0:00:11.
  Batch   120  of    217.    Elapsed: 0:00:17.
  Batch   160  of    217.    Elapsed: 0:00:22.
  Batch   200  of    217.    Elapsed: 0:00:28.

  Average training loss: 0.62
  Training epoch took: 0:00:30

Training...
  Batch    40  of    217.    Elapsed: 0:00:06.
  Batch    80  of    217.    Elapsed: 0:00:11.
  Batch   120  of    217.    Elapsed: 0:00:17.
  Batch   160  of    217.    Elapsed: 0:00:22.
  Batch   200  of    217.    Elapsed: 0:00:28.

  Average training loss: 0.60
  Training epoch took: 0:00:30

Training...
  Batc

In [150]:
torch.cuda.memory_stats(device)

OrderedDict([('active.all.allocated', 8103307),
             ('active.all.current', 2425),
             ('active.all.freed', 8100882),
             ('active.all.peak', 2666),
             ('active.large_pool.allocated', 4240375),
             ('active.large_pool.current', 900),
             ('active.large_pool.freed', 4239475),
             ('active.large_pool.peak', 1060),
             ('active.small_pool.allocated', 3862932),
             ('active.small_pool.current', 1525),
             ('active.small_pool.freed', 3861407),
             ('active.small_pool.peak', 1727),
             ('active_bytes.all.allocated', 33701870629376),
             ('active_bytes.all.current', 5367720960),
             ('active_bytes.all.freed', 33696502908416),
             ('active_bytes.all.peak', 6699355648),
             ('active_bytes.large_pool.allocated', 33595697135616),
             ('active_bytes.large_pool.current', 5361631232),
             ('active_bytes.large_pool.freed', 33590335504384),
 

In [151]:
X_test=X_test.tolist()

In [152]:
X_test[0]

'flashback howard dean incorrectly predicts mueller will indict jared kushner  appearing november 5th  on msnbc former gov howard dean dvt wrongly predicted special counsel robert mueller would indict president donald trumps soninlaw jared kushner for money laundering as part of his investigation into possible collusion between the trump campaign and russia during the  presidential election we believe we may well have a criminal in the white house howard dean told host keir simmonscertainly he has a special interest in the trump family and their investments and we think there is substantial likelihood that he has laundered money that is what his associates have been charged with and so this is a very serious matteri do not think this about politics actually dean described mueller as a straight shooter and then predicted that he would target kushner after indicting former national security adviser michael flynn and his son michael flynn jrthe next step is going to be the trump family it

In [153]:
X_test_tokenized=tokenizer(X_test,padding=True,truncation=True,return_tensors="pt")


In [154]:
print((X_test_tokenized["input_ids"])[0])


tensor([  101, 21907,  4922,  4670, 19721, 16014,  2015, 26774,  2097, 27427,
         2594,  2102,  8334, 13970,  4095,  3678,  6037,  2281,  4833,  2006,
         5796, 28957,  2280, 18079,  4922,  4670,  1040,  2615,  2102, 29116,
        10173,  2569,  9517,  2728, 26774,  2052, 27427,  2594,  2102,  2343,
         6221,  8398,  2015,  2365,  2378, 14919,  8334, 13970,  4095,  3678,
         2005,  2769, 28289,  2004,  2112,  1997,  2010,  4812,  2046,  2825,
         8902, 24117,  2090,  1996,  8398,  3049,  1998,  3607,  2076,  1996,
         4883,  2602,  2057,  2903,  2057,  2089,  2092,  2031,  1037,  4735,
         1999,  1996,  2317,  2160,  4922,  4670,  2409,  3677, 26679,  2099,
        13672, 17119, 18249,  2135,  2002,  2038,  1037,  2569,  3037,  1999,
         1996,  8398,  2155,  1998,  2037, 10518,  1998,  2057,  2228,  2045,
         2003,  6937, 16593,  2008,  2002,  2038, 21360, 11563,  2098,  2769,
         2008,  2003,  2054,  2010,  9228,  2031,  2042,  5338, 

In [155]:
test_inputs=X_test_tokenized["input_ids"]


In [156]:
test_masks=X_test_tokenized['attention_mask']


# Predicting on the Validation Set

In [157]:
# Set the batch size.  
batch_size = 2  

# Create the DataLoader.
prediction_data = TensorDataset(test_inputs, test_masks)

prediction_dataloader = DataLoader(prediction_data,shuffle=False, batch_size=batch_size)

print("No of test sentences",len(X_test))

No of test sentences 83


In [158]:
## Predicting on the Validation set

#Evaluating our model on the test set

#Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

# Put model in evaluation mode
model.eval()

predictions=[]

for batch in prediction_dataloader:
  # Add batch to GPU

  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)


Predicting labels for 83 test sentences...


In [159]:
predictions

[array([[-0.41775686,  0.11189505,  0.0826977 ],
        [-2.519041  ,  1.3257946 ,  1.2903202 ]], dtype=float32),
 array([[-2.125192  ,  1.1661922 ,  1.1847858 ],
        [-1.5541991 ,  1.4813188 ,  0.17173551]], dtype=float32),
 array([[-1.4593611 ,  1.1412126 ,  0.10558085],
        [-0.5473129 ,  0.19493614,  0.0763908 ]], dtype=float32),
 array([[-2.263418 ,  1.2279042,  1.2296252],
        [-2.0402064,  1.072484 ,  1.1377082]], dtype=float32),
 array([[-2.215379 ,  1.3910235,  0.9797152],
        [-2.443019 ,  1.3671554,  1.2740847]], dtype=float32),
 array([[-2.2742603,  1.1852673,  1.3167689],
        [-2.2773387,  1.3456517,  1.0155753]], dtype=float32),
 array([[ 5.580712 , -2.0270498, -3.678043 ],
        [-2.3739486,  1.266412 ,  1.2665095]], dtype=float32),
 array([[-2.2085266 ,  1.3779161 ,  0.9420226 ],
        [-1.3465374 ,  1.4811412 ,  0.02207996]], dtype=float32),
 array([[-2.579902  ,  1.4854283 ,  1.1201614 ],
        [-2.1096776 ,  1.3426871 ,  0.83612484]], dtype

In [160]:
pred_labels=[]
for i in range(len(predictions)):
  
  # The predictions for this batch are a 3-column ndarray (one column for "0" 
  # ,one column for "1",one for "2"). Pick the label with the highest value and turn this
  # in to a list of 0s,1s,2s.

  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  pred_labels.append(pred_labels_i)




In [161]:
pred_labels

[array([1, 1]),
 array([2, 1]),
 array([1, 1]),
 array([2, 2]),
 array([1, 1]),
 array([2, 1]),
 array([0, 2]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([2, 1]),
 array([1, 2]),
 array([1, 1]),
 array([1, 1]),
 array([0, 0]),
 array([1, 1]),
 array([1, 2]),
 array([1, 1]),
 array([2, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 2]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([2, 1]),
 array([1, 1]),
 array([0, 2]),
 array([1, 1]),
 array([2, 1]),
 array([1, 2]),
 array([1, 1]),
 array([1, 1]),
 array([1, 1]),
 array([2, 0]),
 array([1, 2]),
 array([1])]

In [162]:
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

flat_predictions

array([1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0, 2,
       1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 0, 1, 2, 1])

In [163]:
train.head(10)

Unnamed: 0_level_0,text,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
833042063,chelsea handler admits she is very sexually at...,2
832959523,how theresa may botched those were the times…...,2
833039623,robert mueller iii rests his casedems never wi...,2
833032367,robert mueller not recommending any more indic...,2
814777937,the far right is trying to coopt the yellow ve...,2
821744708,special place in hell for those who promoted b...,2
833036489,bill maher says he doesnt need mueller report ...,2
707566605,madagascar outbreak it is inevitable the plagu...,0
708561738,how do you like paying for sexual harassment s...,2
782086447,former apostolic nuncio to the united states a...,0


In [164]:
test.head(7)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
833024133,flashback howard dean incorrectly predicts mue...
814371058,brexit nigerians in london hope for split bri...
815858385,british yellow vest who called proeu mp a nazi...
832941978,adam schiff rejects reports that mueller indic...
833021113,roger stone associate jerome corsi celebrates ...
833067493,don trump jr uses maury povich paternity meme ...
813552066,you insult us ambassador woody johnson flagran...


In [165]:
batch_size = 2  

# Create the DataLoader.
train_prediction_data = TensorDataset(train_inputs, train_masks)


train_prediction_dataloader = DataLoader(train_prediction_data, shuffle=False, batch_size=batch_size)

print("No of train sentences",len(X_train))


No of train sentences 433


In [166]:
train_inputs[0]

tensor([  101,  9295, 28213, 14456,  2016,  2003,  2200, 12581,  6296,  2000,
         2728, 26774,  2521,  2571,  6199,  2272, 10265, 10087,  9295, 28213,
         5292,  4914,  2016,  2003,  2200, 12581,  6296,  2000,  8495,  2569,
         9517,  2728, 26774,  2074,  3178,  2044,  2002,  5531,  2010,  4812,
         2046,  4011,  8902, 24117,  2090,  3607,  1998,  6221,  8398,  4883,
         3049,  2006,  5958,  3944,  1996,  3425,  2533,  2623,  2008, 26774,
         2018,  5531,  2010,  4812,  2046,  1996,  3043,  1998,  2008,  2002,
         2052,  2025,  2022, 16755,  2075,  2151,  2062, 24265,  2114,  8398,
         2030,  3087,  2842,  2920,  1999,  2010,  3049,  2664,  2750,  1996,
         3635,  1997, 10520,  2371,  2011,  1996,  3484,  2006,  1996,  5365,
         2187,  9295, 28213,  4914,  2008,  2014, 17418,  2007,  1996,  2095,
        11614, 12478,  2089,  2031,  2042,  2138,  2016,  2179,  2032, 12581,
         8702,  2065,  1045,  2572,  2108,  3294,  7481,  1045, 

## Looking at the Train Set Predictions

In [167]:
## Predicting on the Training set

#Evaluating our final model on the training set

#Prediction on training set

print('Predicting labels for {:,} training sentences...'.format(len(train_inputs)))

# Put model in evaluation mode
model.eval()

train_predictions=[]

for batch in train_prediction_dataloader:
  # Add batch to GPU

  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  train_predictions.append(logits)



Predicting labels for 433 training sentences...


In [168]:
print(b_input_ids)

tensor([[  101,  3189,  2053,  2047, 24265,  2097,  2272,  2013,  2569,  9517,
          2728, 26774,  2015, 15113,  2053,  2047, 24265,  2097,  2272,  2013,
          2569,  9517,  2728, 26774,  2015,  3607, 15113,  2044,  2002,  7864,
          2010,  2345,  3189,  2000,  4905,  2236,  2520, 19820,  3674, 13307,
          2024,  7316,  1037,  3026,  3425,  2533,  2880,  2409,  1996,  2899,
          2695,  2008, 26774,  5292,  2025,  6749,  2151,  2582, 24265,  1996,
          4812,  2003,  3143,  3425,  2533,  3764, 26760, 20778, 14884,  2072,
         13970,  5051,  2278,  2409,  1996,  3780,  1996, 26774, 15113,  2387,
          2195,  2306,  8398,  8753, 21801,  2021,  2025,  8398,  2155,  2030,
          8398,  2370,  2070,  3862,  3275,  2040,  2031, 12254,  5905,  1999,
          3715, 29217,  2013, 26774,  2015,  4812,  2024,  8398,  2280,  3167,
          4905,  2745,  9946,  8398,  2280,  3049,  3472,  2703, 24951, 13028,
          1998,  2280,  2317,  2160,  2120,  3036, 1

In [169]:
print(train_inputs[432])

tensor([  101,  3189,  2053,  2047, 24265,  2097,  2272,  2013,  2569,  9517,
         2728, 26774,  2015, 15113,  2053,  2047, 24265,  2097,  2272,  2013,
         2569,  9517,  2728, 26774,  2015,  3607, 15113,  2044,  2002,  7864,
         2010,  2345,  3189,  2000,  4905,  2236,  2520, 19820,  3674, 13307,
         2024,  7316,  1037,  3026,  3425,  2533,  2880,  2409,  1996,  2899,
         2695,  2008, 26774,  5292,  2025,  6749,  2151,  2582, 24265,  1996,
         4812,  2003,  3143,  3425,  2533,  3764, 26760, 20778, 14884,  2072,
        13970,  5051,  2278,  2409,  1996,  3780,  1996, 26774, 15113,  2387,
         2195,  2306,  8398,  8753, 21801,  2021,  2025,  8398,  2155,  2030,
         8398,  2370,  2070,  3862,  3275,  2040,  2031, 12254,  5905,  1999,
         3715, 29217,  2013, 26774,  2015,  4812,  2024,  8398,  2280,  3167,
         4905,  2745,  9946,  8398,  2280,  3049,  3472,  2703, 24951, 13028,
         1998,  2280,  2317,  2160,  2120,  3036, 11747,  2745, 

In [170]:
train_predictions

[array([[-1.4365268 ,  1.4984101 ,  0.07322286],
        [-0.5493527 ,  0.19064377,  0.21301213]], dtype=float32),
 array([[-0.29887602,  0.04576249,  0.00530763],
        [-1.3622594 ,  1.485631  ,  0.03222483]], dtype=float32),
 array([[-2.525451  ,  1.6536273 ,  0.88541156],
        [-1.2913784 ,  0.6269392 ,  0.54168355]], dtype=float32),
 array([[-1.388296  ,  1.4923276 ,  0.04677911],
        [ 5.618216  , -2.1153097 , -3.588841  ]], dtype=float32),
 array([[-0.962977  ,  0.79522026,  0.07264277],
        [ 5.6306834 , -2.1049752 , -3.6007624 ]], dtype=float32),
 array([[ 5.6189237 , -2.1062634 , -3.5711184 ],
        [ 1.9873694 , -0.11316261, -2.1579328 ]], dtype=float32),
 array([[-1.5207295 ,  1.7511055 , -0.25510335],
        [ 5.626681  , -2.0573573 , -3.646981  ]], dtype=float32),
 array([[ 5.6059203 , -2.0663013 , -3.6086245 ],
        [-1.7707437 ,  1.679633  ,  0.01031851]], dtype=float32),
 array([[ 5.6087523, -2.1024652, -3.563128 ],
        [ 5.602485 , -2.1068351, -

In [171]:
flat_train_predictions = [item for sublist in train_predictions for item in sublist]
flat_train_predictions = np.argmax(flat_train_predictions, axis=1).flatten()

flat_train_predictions

array([1, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

#  The Final Predictions on the Validation Set

In [172]:
## Convert flat_predictions to "str"

flat_predictions=label_encoder.inverse_transform(flat_predictions)
flat_predictions

array(['reporting', 'reporting', 'satire', 'reporting', 'reporting',
       'reporting', 'satire', 'satire', 'reporting', 'reporting',
       'satire', 'reporting', 'opinion', 'satire', 'reporting',
       'reporting', 'reporting', 'reporting', 'reporting', 'reporting',
       'reporting', 'reporting', 'reporting', 'reporting', 'satire',
       'reporting', 'reporting', 'satire', 'reporting', 'reporting',
       'reporting', 'reporting', 'opinion', 'opinion', 'reporting',
       'reporting', 'reporting', 'satire', 'reporting', 'reporting',
       'satire', 'reporting', 'reporting', 'reporting', 'reporting',
       'reporting', 'reporting', 'reporting', 'reporting', 'satire',
       'reporting', 'reporting', 'reporting', 'reporting', 'reporting',
       'reporting', 'reporting', 'reporting', 'reporting', 'reporting',
       'satire', 'reporting', 'reporting', 'reporting', 'opinion',
       'satire', 'reporting', 'reporting', 'satire', 'reporting',
       'reporting', 'satire', 'reportin

In [173]:
out_fn="output-subtask1-dev_bert-en5.txt"

out = pd.DataFrame(flat_predictions, test.index)
out.to_csv(out_fn, sep='\t', header=None)
print('Results on: ', out_fn)


Results on:  output-subtask1-dev_bert-en5.txt


# Save the Model using torch.save()

In [174]:
state = {
    'epoch': epochs,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    
}


In [177]:
import shutil
def save_ckp(state,checkpoint_path):
    """
    state: checkpoint we want to save
    checkpoint_path: path to save checkpoint
    
    """
    f_path = checkpoint_path

    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    

In [178]:
save_ckp(state,"finalcheckpoint_semeval_best.pt")

# Conclusions

The Bert-Model is able to perform appreciably on the Validation Set with a macro-f1 score of 0.4127 and a micro-f1 score of 0.60241 which is ranked 2nd in the leaderboards at the present moment. One of the things to note is that the class label "satire" is quite underrepresented in the training dataset as a result of which it may be difficult to learn. To counter this an area to look at are loss functions that are able to capture the unbalanced nature of the dataset. We could also look at ensemble models to make our predictions. We should also make an effort to look at the self attention weights between the words to understand and explain the relationships captured between them.