# Publaynet BERT Classifier





## Environment Setup
Import the key libraries and set up the working environment.

In [None]:
!pip install transformers==4.2.2

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
#from transformers.configuration_bert import BertConfig
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd

# Authenticate
# Module-level PyDrive client; stays None until authenticate() has run.
# NOTE(review): this rebinds the name `drive`, which an earlier cell imports
# from google.colab, so `drive.mount` is unavailable after this cell runs.
drive = None
def authenticate():
  """Authenticate the Colab user and store a GoogleDrive client in the global `drive`."""
  global drive
  
  # Colab authentication step for the current user
  auth.authenticate_user()
  gauth = GoogleAuth()
  # Hand the application-default credentials to PyDrive
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
  """Fetch files from Google Drive into the working directory.

  Args:
    fileIds: iterable of pairs where element 0 is the local file name to
      write and element 1 is the Drive file id to read.
  """
  # Refreshes the module-level `drive` client on every call
  authenticate()

  for entry in fileIds:
    local_name = entry[0]
    drive_file_id = entry[1]
    remote_file = drive.CreateFile({"id": drive_file_id})
    remote_file.GetContentFile(local_name)

## Get Training and Validation Dataset

In [None]:
# Download each dataset only when it is not already present locally, so that
# re-running this cell does not fetch the files again. The two files are
# requested through separate downloadFiles calls rather than one combined call.
import os

for _dataset_name, _drive_file_id in (
    ("testing_dataset.pkl", "1fktW64hxcjCXreMTv_2pAxSe4Nt083z3"),
    ("training_dataset.pkl", "1td4mF-QxrwKF125xR5DWflGqcwn0z1LP"),
):
    # An explicit existence check replaces the open()/bare-except probe, which
    # left the opened file handle unclosed and swallowed unrelated errors.
    if not os.path.isfile(_dataset_name):
        downloadFiles([[_dataset_name, _drive_file_id]])

In [None]:
# Load the downloaded datasets into DataFrames.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code;
# only load pickle files that come from a trusted source.
df_train = pd.read_pickle('training_dataset.pkl')
df_test = pd.read_pickle('testing_dataset.pkl')

In [None]:
df_train.head()

In [None]:
# Setting up the device for GPU usage
from torch import cuda
# Device string used by the later `.to(device)` calls: 'cuda' when a GPU is
# available, otherwise 'cpu'.
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
df_train.head(5)

In [None]:
# Columns of the training frame kept for modelling: the raw text, the label
# and the feature columns.
_train_columns = [
    'text',
    'label',
    'near_visual_feature',
    'gcn_near_char_density',
    'gcn_near_char_number',
    'level1_parse_emb',
    'level2_parse_emb',
    'gcn_near_token_density',
    'density',
    'visual_feature',
    'gcn_bert_predicted',
]
new_df = df_train[_train_columns]

In [None]:
# Same column subset as the training frame, taken from the test frame.
_test_columns = [
    'text',
    'label',
    'near_visual_feature',
    'gcn_near_char_density',
    'gcn_near_char_number',
    'level1_parse_emb',
    'level2_parse_emb',
    'gcn_near_token_density',
    'density',
    'visual_feature',
    'gcn_bert_predicted',
]
new_df_test = df_test[_test_columns]

## Data Preprocessing

In [None]:
from transformers import BertTokenizer
import torch

# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Defining some key variables that will be used later on in the training
# Maximum token length passed to the tokenizer for each text
MAX_LEN = 100
TRAIN_BATCH_SIZE = 16
# Evaluation batches are twice the training batch size
VALID_BATCH_SIZE = TRAIN_BATCH_SIZE*2
# EPOCHS = 1
# NOTE(review): the optimizer cell passes the literal lr=1e-05 rather than
# this constant — confirm which value is intended.
LEARNING_RATE = 2e-05
# Change the pre-trained bert model
#tokenizer = BertTokenizer.from_pretrained('roberta-base') #Cased 

In [None]:
class SentimentData(Dataset):
    """Map-style dataset pairing tokenized text with per-row feature columns.

    Each item is a dict of tensors: the token ids, attention mask and token
    type ids produced by the tokenizer, the label (`targets`), and one float
    tensor per feature column read from the dataframe.

    Args:
        dataframe: DataFrame with the columns `text`, `label`,
            `near_visual_feature`, `visual_feature`, `gcn_bert_predicted`,
            `level1_parse_emb`, `level2_parse_emb`, `gcn_near_char_density`,
            `gcn_near_char_number` and `density`.
        tokenizer: object exposing `encode_plus` (e.g. a BertTokenizer).
        max_len: token length every text is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.gcn_visual_feature = dataframe.near_visual_feature
        self.visual_feature = dataframe.visual_feature
        self.gcn_bert_base = dataframe.gcn_bert_predicted
        self.parsing1 = dataframe.level1_parse_emb
        self.parsing2 = dataframe.level2_parse_emb
        self.char_density = dataframe.gcn_near_char_density
        self.char_number = dataframe.gcn_near_char_number
        self.density = dataframe.density
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position `index`.

        Rows are looked up by position (`.iloc`), so the dataset also works
        with dataframes whose index is not a clean 0..n-1 range (for example
        a filtered or sampled frame that was not `reset_index`ed).
        """
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space
        text = " ".join(text.split())

        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True`; every sample comes back with exactly
        # `max_len` tokens.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for backward compatibility; the training and
            # evaluation loops cast targets to long before computing the loss.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
            'density': torch.tensor(self.density.iloc[index], dtype=torch.float),
            'gcn_bert_base': torch.tensor(self.gcn_bert_base.iloc[index], dtype=torch.float),
            'char_density': torch.tensor(self.char_density.iloc[index], dtype=torch.float),
            'char_number': torch.tensor(self.char_number.iloc[index], dtype=torch.float),
            'visual_feature': torch.tensor(self.visual_feature.iloc[index], dtype=torch.float),
            'parsing1': torch.tensor(self.parsing1.iloc[index], dtype=torch.float),
            'parsing2': torch.tensor(self.parsing2.iloc[index], dtype=torch.float),
            'gcn_visual_feature': torch.tensor(self.gcn_visual_feature.iloc[index], dtype=torch.float),
        }

In [None]:
# Fraction of new_df used for training; 1 keeps every row, so sample() only
# shuffles the frame (seeded for reproducibility).
train_size = 1
shuffled_rows = new_df.sample(frac=train_size, random_state=200)
train_data = shuffled_rows.reset_index(drop=True)

# Report the shapes of the frames involved
print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"TEST Dataset: {new_df_test.shape}")

# Wrap both frames as datasets sharing the tokenizer and maximum length
training_set = SentimentData(train_data, tokenizer, MAX_LEN)
test_set = SentimentData(new_df_test, tokenizer, MAX_LEN)

In [None]:
# Loader settings: training batches are shuffled, evaluation batches are not.
train_params = dict(
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
test_params = dict(
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

training_loader = DataLoader(training_set, **train_params)
vali_loader = DataLoader(test_set, **test_params)

## Define the proposed classifiers

In [None]:
class RobertaClass(torch.nn.Module):
    """4-way classifier fusing a BERT [CLS] embedding with precomputed features.

    Despite the name, the encoder loaded here is 'bert-base-uncased'. The
    text branch max-pools the encoder's first-token embedding with a
    precomputed embedding (`bert_cls`); the visual branch max-pools a
    projected visual vector with a second visual vector. Both branches are
    concatenated and passed through two dense layers before the final
    4-class linear layer.
    """

    def __init__(self):
        super(RobertaClass, self).__init__()
        # Hidden sizes for reference when swapping the encoder:
        #   bert-base-cased / roberta-base: 768
        #   bert-large-cased / bert-large-uncased: 1024
        # The Linear layers below are hard-coded for a 768-d encoder.
        self.l1 = AutoModel.from_pretrained("bert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.hidden_cls = torch.nn.Linear(768,768)
        # hidden_parsing and hidden_den are not used in forward(); they are
        # kept so parameter names and initialisation order stay unchanged.
        self.hidden_parsing = torch.nn.Linear(768,768)
        self.hidden_den = torch.nn.Linear(768,768)
        self.hidden_vis = torch.nn.Linear(768,768)
        self.hidden_vis_pro = torch.nn.Linear(768,768)
        self.hidden_all = torch.nn.Linear(768*2,768*2)
        self.before_classifier = torch.nn.Linear(768*2,128)
        # Kernel (2, 1): pools over the stacked pair of vectors, leaving the
        # feature axis untouched.
        self.pooling = torch.nn.MaxPool2d((2,1), stride=None)
        self.classifier = torch.nn.Linear(128, 4)

    def forward(self, input_ids, attention_mask, token_type_ids, char_density,char_number,visual_feature,bert_cls,parsing1,parsing2,visual):
        """Compute class logits for a batch.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs.
            char_density, char_number, parsing1, parsing2: accepted for
                interface compatibility; not used in this forward pass.
            visual_feature: vector stacked (unprojected) with the projected
                `visual` vector before pooling; must have 768 features.
            bert_cls: precomputed embedding stacked with the encoder's
                first-token embedding before pooling; must have 768 features.
            visual: vector projected by `hidden_vis_pro`; must have 768
                features.

        Returns:
            Tensor of logits with 4 values per example.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # First-token ([CLS]) hidden state of every sequence
        pooler = hidden_state[:, 0]

        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)

        # Text branch: stack the two 768-d vectors along a new axis and
        # max-pool that axis away again.
        pooler = torch.cat((pooler.unsqueeze(1),bert_cls.unsqueeze(1)),1)
        pooler = self.pooling(pooler).squeeze(1)
        pooler = self.hidden_cls(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)

        # Visual branch: project `visual`, then pool it with `visual_feature`
        visual = self.hidden_vis_pro(visual)
        visual = torch.nn.Tanh()(visual)
        visual = self.dropout(visual)

        visual = torch.cat((visual.unsqueeze(1),visual_feature.unsqueeze(1)),1)
        visual = self.pooling(visual).squeeze(1)
        visual = self.hidden_vis(visual)
        visual = torch.nn.Tanh()(visual)
        visual = self.dropout(visual)

        # Fuse both branches into a 1536-d vector
        pooler = torch.cat((pooler,visual),1)
        pooler = self.hidden_all(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)

        pooler = self.before_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)

        output = self.classifier(pooler)
        return output

In [None]:
# Build the classifier and move its parameters to the selected device.
model = RobertaClass()
model.to(device)

## Training 

In [None]:
# Creating the loss function and optimizer
# Cross-entropy over the model's 4 output logits.
loss_function = torch.nn.CrossEntropyLoss()
# NOTE(review): the learning rate here is the literal 1e-05, not the
# LEARNING_RATE constant (2e-05) defined earlier — confirm which is intended.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=1e-05) # change learning rate

In [None]:
def calcuate_accuracy(preds, targets):
    """Return how many positions of `preds` equal the matching entry of `targets`.

    Note that this is a raw count of correct predictions, not a ratio.
    """
    matches = preds == targets
    total_matches = matches.sum()
    return total_matches.item()

In [None]:
def train(epoch):
    """Run one training pass over `training_loader` and print loss and accuracy.

    Uses the module-level `model`, `training_loader`, `loss_function`,
    `optimizer` and `device`.

    Args:
        epoch: epoch number, used only in the printed summary.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrapping the loader directly lets tqdm show the total number of batches
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        # The dataset stores targets as float; the loss needs class indices
        targets = data['targets'].to(device, dtype = torch.long)
        visual_feature = data['visual_feature'].to(device, dtype = torch.float)
        gcn_visual_feature = data['gcn_visual_feature'].to(device, dtype = torch.float)
        gcn_bert_base = data['gcn_bert_base'].to(device, dtype = torch.float)
        parsing1 = data['parsing1'].to(device, dtype = torch.float)
        parsing2 = data['parsing2'].to(device, dtype = torch.float)
        char_density = data['char_density'].to(device, dtype = torch.float)
        char_number = data['char_number'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids,char_density,char_number,gcn_visual_feature,gcn_bert_base,parsing1,parsing2,visual_feature)

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

## Validation 

In [None]:
def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` and print mean loss and accuracy.

    Uses the module-level `loss_function` and `device`.

    Args:
        model: module called with the same ten positional inputs as in the
            training loop.
        testing_loader: iterable of batches (dicts of tensors) with the keys
            read below.

    Returns:
        Tuple `(epoch_accu, output_list)`: accuracy in percent and the list
        of predicted class indices (one 0-d tensor per example).
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    output_list = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            visual_feature = data['visual_feature'].to(device, dtype = torch.float)
            gcn_visual_feature = data['gcn_visual_feature'].to(device, dtype = torch.float)
            gcn_bert_base = data['gcn_bert_base'].to(device, dtype = torch.float)
            parsing1 = data['parsing1'].to(device, dtype = torch.float)
            parsing2 = data['parsing2'].to(device, dtype = torch.float)
            char_density = data['char_density'].to(device, dtype = torch.float)
            char_number = data['char_number'].to(device, dtype = torch.float)

            # Same argument order as the training loop. The output is not
            # squeezed, so a final batch of size one keeps its batch axis
            # and torch.max(dim=1) stays valid.
            outputs = model(ids, mask, token_type_ids,char_density,char_number,gcn_visual_feature,gcn_bert_base,parsing1,parsing2,visual_feature)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            # extend() avoids rebuilding the whole list on every batch
            output_list.extend(big_idx)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu,output_list


In [None]:
# Evaluate the current model on the validation loader; `acc` is the accuracy
# in percent and `pre_list` the per-example predicted class indices.
acc,pre_list = valid(model, vali_loader)

## Testing

In [None]:
class SentimentData_test(Dataset):
    """Map-style dataset for the test split.

    Yields, per row, a dict holding the tokenizer outputs (`ids`, `mask`,
    `token_type_ids`), the label (`targets`) and one float tensor for each
    feature column of the dataframe.

    Args:
        dataframe: DataFrame with the columns `text`, `label`,
            `near_visual_feature`, `visual_feature`, `gcn_bert_predicted`,
            `level1_parse_emb`, `level2_parse_emb`, `gcn_near_char_density`,
            `gcn_near_char_number` and `density`.
        tokenizer: object exposing `encode_plus` (e.g. a BertTokenizer).
        max_len: token length every text is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.gcn_visual_feature = dataframe.near_visual_feature
        self.visual_feature = dataframe.visual_feature
        self.gcn_bert_base = dataframe.gcn_bert_predicted
        self.parsing1 = dataframe.level1_parse_emb
        self.parsing2 = dataframe.level2_parse_emb
        self.char_density = dataframe.gcn_near_char_density
        self.char_number = dataframe.gcn_near_char_number
        self.density = dataframe.density
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position `index`.

        Every column is read positionally with `.iloc`, because the frame
        handed to this class is a column subset of the test frame whose
        index is not guaranteed to be 0..n-1.
        """
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space
        text = " ".join(text.split())

        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True`; every sample comes back with exactly
        # `max_len` tokens.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for backward compatibility; consumers cast to long
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
            'density': torch.tensor(self.density.iloc[index], dtype=torch.float),
            'gcn_bert_base': torch.tensor(self.gcn_bert_base.iloc[index], dtype=torch.float),
            'char_density': torch.tensor(self.char_density.iloc[index], dtype=torch.float),
            'char_number': torch.tensor(self.char_number.iloc[index], dtype=torch.float),
            'visual_feature': torch.tensor(self.visual_feature.iloc[index], dtype=torch.float),
            'parsing1': torch.tensor(self.parsing1.iloc[index], dtype=torch.float),
            'parsing2': torch.tensor(self.parsing2.iloc[index], dtype=torch.float),
            'gcn_visual_feature': torch.tensor(self.gcn_visual_feature.iloc[index], dtype=torch.float),
        }

### load the test datasets

In [None]:
# Columns handed to the test dataset, in the order they appear in the frame.
_true_test_columns = [
    'text',
    'label',
    'near_visual_feature',
    'gcn_near_char_density',
    'gcn_near_char_number',
    'level1_parse_emb',
    'level2_parse_emb',
    'gcn_bert_predicted',
    'visual_feature',
    'gcn_near_token_density',
    'density',
]
new_df_true_test = new_df_test[_true_test_columns]

In [None]:
# Wrap the test frame as a dataset and batch it with the evaluation loader
# settings (`test_params`: unshuffled, so predictions keep the row order).
test = SentimentData_test(new_df_true_test,tokenizer, MAX_LEN)
testing_loader = DataLoader(test, **test_params)

In [None]:
def test_label_generator(model, testing_loader):
    """Predict a class index for every example yielded by `testing_loader`.

    Uses the module-level `device`.

    Args:
        model: module called with the same ten positional inputs as in the
            training loop.
        testing_loader: iterable of batches (dicts of tensors) with the keys
            read below.

    Returns:
        List of predicted class indices, one 0-d tensor per example, in the
        order the loader yields them.
    """
    model.eval()
    output_list = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            visual_feature = data['visual_feature'].to(device, dtype = torch.float)
            gcn_visual_feature = data['gcn_visual_feature'].to(device, dtype = torch.float)
            gcn_bert_base = data['gcn_bert_base'].to(device, dtype = torch.float)
            parsing1 = data['parsing1'].to(device, dtype = torch.float)
            parsing2 = data['parsing2'].to(device, dtype = torch.float)
            char_density = data['char_density'].to(device, dtype = torch.float)
            char_number = data['char_number'].to(device, dtype = torch.float)

            # The output is not squeezed: squeezing would drop the batch axis
            # of a single-example batch and break torch.max(dim=1) below.
            outputs = model(ids, mask, token_type_ids,char_density,char_number,gcn_visual_feature,gcn_bert_base,parsing1,parsing2,visual_feature)

            big_val, big_idx = torch.max(outputs.data, dim=1)
            # extend() avoids rebuilding the whole list on every batch
            output_list.extend(big_idx)

    return output_list


In [None]:
# Train for EPOCHS epochs and print a classification report on the test set
# after each one.
from sklearn.metrics import classification_report, confusion_matrix

EPOCHS = 4
# Ground-truth labels; they do not change between epochs, so build them once.
# testing_loader is unshuffled, so predictions line up with this row order.
p = new_df_test['label'].tolist()
for epoch in range(EPOCHS):
    train(epoch)
    output = test_label_generator(model, testing_loader)
    # Each prediction is a 0-d tensor; convert it to a plain Python number
    q = [pred.cpu().numpy().tolist() for pred in output]
    report = classification_report(p, q, digits=4)
    print(report)