In [1]:
!pip install Sentencepiece
!pip install transformers



In [None]:
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BigBirdTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BigBirdForSequenceClassification, GPT2Tokenizer, GPTNeoForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import nltk

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
import torch.optim as optim
import torch.nn.functional as F

from models import *

In [3]:
# from google.colab import drive
# drive._mount('/content/drive')

# import os
# os.chdir('drive/MyDrive/machine_learning')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# # load dataset
data_train = pd.read_csv('dataset_binary_ablation.csv')
data_test = pd.read_csv('dataset_binary_test.csv')

X_train, y_train = data_train.data.tolist(), data_train.label.tolist()
X_test, y_test = data_test.data.tolist(), data_test.label.tolist()

In [6]:
def preprocess_data_transformer(transformer_name):
    if transformer_name == 'BERT':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    elif transformer_name == 'GPT2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
        tokenizer.pad_token = tokenizer.eos_token
    elif transformer_name == 'BIGBIRD':
        tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base', do_lower_case=True)

    for mode in ['train', 'test']:

        sample_ids = []
        attention_masks = []
      
        samples = X_train if mode == 'train' else X_test
        labels = y_train if mode == 'train' else y_test
        length = len(samples)
      
        for i, sent in enumerate(samples):
            encoded_dict = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 100, truncation = True, \
                                              padding = 'max_length', return_attention_mask = True, return_tensors = 'pt')

            # Add the encoded sample and mask 
            sample_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
            print('\r----- Processing {}/{} {} samples'.format(i+1, length, mode), flush=True, end='')

        # Convert to pytorch tensors.
        sample_ids = torch.cat(sample_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        labels = torch.tensor(labels)
      
        if mode == 'train': train_dataset = TensorDataset(sample_ids, attention_masks, labels)
        else: test_dataset = TensorDataset(sample_ids, attention_masks, labels)

    return train_dataset, test_dataset  

In [None]:
transformer_names = ['BERT', 'GPT2', 'BIGBIRD']

for name in transformer_names:
    if name == 'BERT':
        bert_train_dataset, bert_test_dataset = preprocess_data_transformer(name)
    elif name == 'GPT2':
        gpt2_train_dataset, gpt2_test_dataset = preprocess_data_transformer(name)
    elif name == 'BIGBIRD':
        bigbird_train_dataset, bigbird_test_dataset = preprocess_data_transformer(name)     

----- Processing 217923/217923 test samples

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

----- Processing 217923/217923 test samples

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

----- Processing 217923/217923 test samples

In [None]:
# save preprocessed text
torch.save(bert_train_dataset, 'random_forest_bert_train.pt')
torch.save(bert_test_dataset, 'random_forest_bert_test.pt')

torch.save(gpt2_train_dataset, 'random_forest_gpt2_train.pt')
torch.save(gpt2_test_dataset, 'random_forest_gpt2_test.pt')

torch.save(bigbird_train_dataset, 'random_forest_bigbird_train.pt')
torch.save(bigbird_test_dataset, 'random_forest_bigbird_test.pt')

In [7]:
# load all models
bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2, output_attentions = False, output_hidden_states = True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model1 = torch.load('bert-unfreeze.pkl')
model2 = transformer_classifier(bert, torch.load('LSTM-bert-embedding.pkl'), device=device)
model3 = transformer_classifier(bert, torch.load('cnn-embedding.pkl'), device=device)
model4 = torch.load('fc-gpt2.pkl')
model5 = torch.load('fc-bigbird-epoch3.pkl')

models = {}
models['bert_unfreeze'] = model1.to(device)
models['bert_lstm'] = model2
models['bert_cnn'] = model3
models['gpt2'] = model4.to(device)
models['bigbird'] = model5.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
def predict(dataloader, model, name):
    """This function is for a pre-trined model to preict the new dataset"""
    features = []
    for b, (x_id, x_mask, y) in enumerate(dataloader):
        x_id, x_mask, y = x_id.to(device), x_mask.to(device), y.to(device)

        if 'lstm' in name or 'cnn' in name:
            pred = model(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
    
        else:
            with torch.no_grad():  
                output = model(x_id, token_type_ids=None, attention_mask=x_mask, labels=y)
                loss, logits = output['loss'], output['logits']
                pred = logits.max(1)[1]

        features.append(pred.cpu().detach().numpy())
  
    return np.concatenate(np.array(features), axis = 0)

In [None]:
# Each model predicts the new dataset
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
random_forest_features_train = []
random_forest_features_test = []

for name, model in models.items():
    print('create features by {}'.format(name))

    if 'bert' in name:
        train_dataloader = DataLoader(bert_train_dataset, sampler = SequentialSampler(bert_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(bert_test_dataset, sampler = SequentialSampler(bert_test_dataset), batch_size = batch_size)
    if 'gpt2' in name:
        train_dataloader = DataLoader(gpt2_train_dataset, sampler = SequentialSampler(gpt2_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(gpt2_test_dataset, sampler = SequentialSampler(gpt2_test_dataset), batch_size = batch_size)
    if 'bigbird' in name:
        train_dataloader = DataLoader(bigbird_train_dataset, sampler = SequentialSampler(bigbird_train_dataset), batch_size = batch_size)
        test_dataloader = DataLoader(bigbird_test_dataset, sampler = SequentialSampler(bigbird_test_dataset), batch_size = batch_size)


    train_feature = predict(train_dataloader, model, name)
    test_feature = predict(test_dataloader, model, name)
    random_forest_features_train.append(train_feature) 
    random_forest_features_test.append(test_feature) 

create features by bert_unfreeze
create features by bert_lstm
create features by bert_cnn
create features by gpt2
create features by bigbird




In [None]:
# save the features of random forests
np.save('random_forest_train.npy', np.array(random_forest_features_train).T)
np.save('random_forest_test.npy', np.array(random_forest_features_test).T)

In [None]:
# load the features of random forests
X_train_rf = np.load('random_forest_train.npy')
X_test_rf = np.load('random_forest_test.npy')

**Ensemble learning through Random Forest:**

In [None]:
# 10-fold cross validation for random forests
rf = RandomForestClassifier(criterion='gini', class_weight = 'balanced', oob_score = True, random_state = 0)
parameters = {'n_estimators':[10, 50, 100, 200, 500], 'max_depth':[2, 3, 4, 5]}

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state = 0) #set random_state to be 0
clf = GridSearchCV(rf,parameters,scoring='roc_auc',n_jobs=-1,cv=kf)
clf.fit(X_train_rf, y_train)

print(clf.best_params_)

{'max_depth': 3, 'n_estimators': 100}


In [None]:
# test performance of random forests with the best parameters
clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
clf.fit(X_train_rf, y_train)
predict = clf.predict(X_test_rf)
accuracy = (predict == y_test).mean()

print('The test accuracy of random forest is {:.4f}'.format(accuracy))

The test accuracy of random forest is 0.9051


**Ensemble learning through neural networks:**

In [None]:
model = nn.Sequential(nn.Linear(5, 2))


train_features = torch.tensor(X_train_rf).type(torch.float)
test_features = torch.tensor(X_test_rf).type(torch.float)
train_labels =  torch.tensor(np.array(y_train)).type(torch.long)
test_labels =  torch.tensor(np.array(y_test)).type(torch.long)

train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

batch_size = 64
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
val_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

n_batch = int(len(train_dataset)/batch_size)
max_epoch = 20

optimizer = torch.optim.SGD(model.parameters(), lr = 0.01) # the learning rate is suggested by the authors
#scheduler =  optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
criterion = F.cross_entropy

print('Training start!')
for e in range(max_epoch):
    # train model
    model.train()
    
    epoch_loss = 0
    train_acc = 0
    for b, (x, y) in enumerate(train_dataloader):
        x,y = x.to(device), y.to(device)
        optimizer.zero_grad()

        output = model(x)
        loss = criterion(output, y)
        epoch_loss += loss
        train_acc += (output.max(1)[1] == y).float().mean().item()

        loss.backward()
        optimizer.step()
        #scheduler.step()
    
        print("\rEpoch: {:d} batch: {:d} / {} loss: {:.4f} | {:.2%}".format(e + 1, b, n_batch, loss, b*1.0/n_batch), end='', flush=True)
    
    print("\n----- Epoch {} ------\nTraining loss: {}".format(e+1, epoch_loss / len(train_dataloader)))
    print("Training accuracy: {}".format(train_acc / len(train_dataloader)))

    # evaluate model
    model.eval()
  
    eval_acc = 0
    eval_loss = 0
    nb_eval_steps = 0
  
    for b, (x, y) in enumerate(val_dataloader):
        x,y = x.to(device), y.to(device)
      
        with torch.no_grad():
            output = model(x)
            loss = criterion(output, y)

        eval_loss += loss
        eval_acc += (output.max(1)[1] == y).float().mean().item()

    print("Validation loss: {}".format(eval_loss / len(val_dataloader)))
    print("Validation accuracy: {}".format(eval_acc / len(val_dataloader)))
    print("\n")


Training start!
Epoch: 1 batch: 3064 / 3064 loss: 0.2031 | 100.00%
----- Epoch 1 ------
Training loss: 0.31994524598121643
Training accuracy: 0.8987884525762686
Validation loss: 0.2790996730327606
Validation accuracy: 0.9028216138245845


Epoch: 2 batch: 3064 / 3064 loss: 0.1713 | 100.00%
----- Epoch 2 ------
Training loss: 0.27641671895980835
Training accuracy: 0.9040239745991071
Validation loss: 0.27593910694122314
Validation accuracy: 0.9033904629144259


Epoch: 3 batch: 3064 / 3064 loss: 0.2302 | 100.00%
----- Epoch 3 ------
Training loss: 0.27493050694465637
Training accuracy: 0.9051047250069374
Validation loss: 0.2753564715385437
Validation accuracy: 0.9044058279506753


Epoch: 4 batch: 3064 / 3064 loss: 0.3412 | 100.00%
----- Epoch 4 ------
Training loss: 0.2745201289653778
Training accuracy: 0.9051175425336567
Validation loss: 0.27515313029289246
Validation accuracy: 0.9045251027598356


Epoch: 5 batch: 3064 / 3064 loss: 0.1526 | 100.00%
----- Epoch 5 ------
Training loss: 0.27