In [None]:
!pip install Sentencepiece
!pip install transformers

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

from models import *

In [None]:
# from google.colab import drive
# drive._mount('/content/drive')

# import os
# os.chdir('drive/MyDrive/machine_learning')

In [None]:
# Read the binary train/test CSV files from the current working directory.
data_train = pd.read_csv('dataset_binary_train.csv')
data_test = pd.read_csv('dataset_binary_test.csv')

# Extract the 'data' and 'label' columns of each split as plain Python lists.
X_train = data_train['data'].tolist()
y_train = data_train['label'].tolist()
X_test = data_test['data'].tolist()
y_test = data_test['label'].tolist()

In [None]:
def test_acc(test_dataset, transformer_name, transformer, classifier_name=None, classifier=None, batch_size=64):
    """This function is used to test each trained model"""
    # create dataloader for tensor dataset
    val_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)
    
    # define device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # use cuda for transformer
    transformer = transformer.to(device)

    # evaluate model
    if classifier is not None:
        classifier.eval()
    transformer.eval()
    
    eval_acc = 0
    eval_loss = 0
    nb_eval_steps = 0
    criterion = F.cross_entropy
    
    for b, (x_id, x_mask, y) in enumerate(val_dataloader):
        x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)
        
        with torch.no_grad():
            if classifier is not None:
                word_embedding = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)['hidden_states'][-1]   
                logits = classifier(word_embedding)
                loss = criterion(logits, y)
            else:
                outputs = transformer(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                loss, logits = outputs['loss'], outputs['logits']
        
        eval_loss += loss
        eval_acc += (logits.max(1)[1] == y).float().mean().item()

    print("Validation loss: {}".format(eval_loss / len(val_dataloader)))
    print("Validation accuracy: {}".format(eval_acc / len(val_dataloader)))
    print("\n")

    print('The test accuracy of {} {} is {}'.format(transformer_name, classifier_name, eval_acc / len(val_dataloader)))

**Test fine-tuned BERT**

In [None]:
# Build the test dataset with the project's Transformer preprocessing helper.
transformer = Transformer('BERT')
test_dataset = transformer.preprocess_data(X_train=None, X_test=X_test, y_train=None, y_test=y_test)

# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only host too.
bert = torch.load('bert-unfreeze.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')
test_acc(test_dataset, transformer_name='BERT', transformer=bert)

**Test fine-tuned GPT2**

In [None]:
# Build the test dataset with the project's Transformer preprocessing helper.
transformer = Transformer('GPT2')
test_dataset = transformer.preprocess_data(X_train=None, X_test=X_test, y_train=None, y_test=y_test)

# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only host too.
gpt2 = torch.load('fc-gpt2.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')
test_acc(test_dataset, transformer_name='GPT2', transformer=gpt2)

**Test fine-tuned BIGBIRD**

In [None]:
# Build the test dataset with the project's Transformer preprocessing helper.
transformer = Transformer('BIGBIRD')
test_dataset = transformer.preprocess_data(X_train=None, X_test=X_test, y_train=None, y_test=y_test)

# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only host too.
bigbird = torch.load('fc-bigbird-epoch3.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')
test_acc(test_dataset, transformer_name='BIGBIRD', transformer=bigbird)

**Test fine-tuned BERT + BiLSTM**

In [None]:
# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location places the classifier on the same device test_acc uses for the
# transformer and the batches, and lets a GPU-saved checkpoint load on CPU.
lstm = torch.load('LSTM-bert-embedding-LSTM.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')

# output_hidden_states=True is required: test_acc feeds the last hidden state
# to the classifier.
# NOTE(review): this is the stock "bert-base-uncased" checkpoint, not a
# fine-tuned one; presumably the BiLSTM was trained on frozen BERT features --
# confirm this matches how the classifier was trained.
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = True)
transformer = Transformer('BERT')

test_dataset = transformer.preprocess_data(X_train=None, X_test=X_test, y_train=None, y_test=y_test)
test_acc(test_dataset, transformer_name='BERT', transformer=bert, classifier_name='BiLSTM', classifier=lstm)


**Test fine-tuned BERT + TextCNN**

In [None]:
# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location places the classifier on the same device test_acc uses for the
# transformer and the batches, and lets a GPU-saved checkpoint load on CPU.
textcnn = torch.load('cnn-embedding.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')

# output_hidden_states=True is required: test_acc feeds the last hidden state
# to the classifier.
# NOTE(review): this is the stock "bert-base-uncased" checkpoint, not a
# fine-tuned one; presumably the TextCNN was trained on frozen BERT features --
# confirm this matches how the classifier was trained.
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = True)
transformer = Transformer('BERT')

test_dataset = transformer.preprocess_data(X_train=None, X_test=X_test, y_train=None, y_test=y_test)
test_acc(test_dataset, transformer_name='BERT', transformer=bert, classifier_name='TextCNN', classifier=textcnn)

**Test fine-tuned BERT with large dataset**

In [None]:
# Read the large train/test CSV files (~5 million sentences per the original note).
data_large_train = pd.read_csv('dataset_binary_train_large.csv')
data_large_test = pd.read_csv('dataset_binary_test_large.csv')

# Extract the 'data' and 'label' columns of each split as plain Python lists.
X_large_train = data_large_train['data'].tolist()
y_large_train = data_large_train['label'].tolist()
X_large_test = data_large_test['data'].tolist()
y_large_test = data_large_test['label'].tolist()

# Report the size of each split.
n_large_train = len(X_large_train)
n_large_test = len(X_large_test)
print('Train dataset length: {}'.format(n_large_train))
print('Test dataset length: {}'.format(n_large_test))

In [None]:
transformer = Transformer('BERT')

# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only host too.
bert = torch.load('bert-large.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')

# Evaluate on the large test split loaded above.
test_dataset = transformer.preprocess_data(X_train=None, X_test=X_large_test, y_train=None, y_test=y_large_test)
test_acc(test_dataset, transformer_name='BERT', transformer=bert)

**Test fine-tuned BERT with multi-label data**

In [None]:
# Read the multi-label train/test CSV files.
data_multi_train = pd.read_csv('dataset_multi_num_train.csv')
data_multi_test = pd.read_csv('dataset_multi_num_test.csv')

# Extract the 'data' and 'label' columns of each split as plain Python lists.
X_multi_train = data_multi_train['data'].tolist()
y_multi_train = data_multi_train['label'].tolist()
X_multi_test = data_multi_test['data'].tolist()
y_multi_test = data_multi_test['label'].tolist()

# Report the size of each split.
n_multi_train = len(X_multi_train)
n_multi_test = len(X_multi_test)
print('Train dataset length: {}'.format(n_multi_train))
print('Test dataset length: {}'.format(n_multi_test))

In [None]:
transformer = Transformer('BERT')

# NOTE: torch.load unpickles a full Python object -- only load checkpoints
# from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only host too.
bert = torch.load('bert-multi.pkl', map_location='cuda' if torch.cuda.is_available() else 'cpu')

# Evaluate on the multi-label test split loaded above.
test_dataset = transformer.preprocess_data(X_train=None, X_test=X_multi_test, y_train=None, y_test=y_multi_test)
test_acc(test_dataset, transformer_name='BERT', transformer=bert)