In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 39.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import CrossEntropyLoss, ReLU,DataParallel
from torch.optim import AdamW
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler, WeightedRandomSampler
from transformers import get_linear_schedule_with_warmup

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import logging
from _datetime import datetime as dt0
import math
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import seaborn as sn

In [None]:
def gelu(x):
    """Exact (erf-based) GELU, as originally implemented in the Google BERT repo.

    For comparison, OpenAI GPT uses a tanh approximation that gives slightly
    different results:
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Reference: https://arxiv.org/abs/1606.08415
    """
    half_input = x * 0.5
    # 1 + erf(x / sqrt(2)) is twice the standard normal CDF evaluated at x.
    erf_term = 1.0 + torch.erf(x / math.sqrt(2.0))
    return half_input * erf_term

def gelu_new(x):
    """Tanh approximation of GELU (the variant in the current Google BERT repo,
    identical to OpenAI GPT's).

    Reference: https://arxiv.org/abs/1606.08415
    """
    cubic_term = x + 0.044715 * torch.pow(x, 3)
    tanh_arg = math.sqrt(2 / math.pi) * cubic_term
    return 0.5 * x * (1 + torch.tanh(tanh_arg))

def swish(x):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate

In [None]:
class MulticlassClassification(nn.Module):
    """Feed-forward classifier with two hidden blocks and a linear output layer.

    Each hidden block is Linear -> BatchNorm1d -> gelu; dropout is applied once,
    after the second block. The output layer returns raw scores (no softmax).

    Args:
        num_feature: Number of input features per sample.
        num_class: Number of output classes (width of the output layer).
        hidden_1: Width of the first hidden layer. Defaults to 64.
        hidden_2: Width of the second hidden layer. Defaults to 16.
        dropout: Dropout probability after the second hidden block. Defaults to 0.2.
    """

    def __init__(self, num_feature, num_class, hidden_1=64, hidden_2=16, dropout=0.2):
        super().__init__()

        # The Linear layers are created in the same order as before so that
        # seeded weight initialisation is unchanged for the default sizes.
        self.layer_1 = nn.Linear(num_feature, hidden_1)
        self.layer_2 = nn.Linear(hidden_1, hidden_2)
        self.layer_out = nn.Linear(hidden_2, num_class)

        # Not used by forward() (which applies gelu); kept as an attribute so
        # existing code that accesses `model.relu` keeps working.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_1)
        self.batchnorm2 = nn.BatchNorm1d(hidden_2)

    def forward(self, x):
        """Return the raw class scores for a batch of feature vectors.

        Args:
            x: Float tensor whose last dimension has size `num_feature`
               (assumed to be a 2-D (batch, num_feature) tensor, as BatchNorm1d
               is applied directly to the Linear outputs).

        Returns:
            Tensor of unnormalised scores with `num_class` entries per sample.
        """
        # First hidden block.
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = gelu(x)

        # Second hidden block, followed by dropout.
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = gelu(x)
        x = self.dropout(x)

        return self.layer_out(x)

In [None]:
def process_labels(label):
    """Encode a negation tag as an integer class id.

    "O" -> 0, "B-NEG" -> 1, "I-NEG" -> 2. Any other value is printed, followed
    by the line "Error", and is then encoded as 2.
    """
    for class_id, tag in enumerate(("O", "B-NEG", "I-NEG")):
        if label == tag:
            return class_id

    # Unrecognised tag: report it and fall back to class 2.
    print(label)
    print("Error")
    return 2

def load_train_val_data(train_path='trainset_final.csv', dev_path='devset_final.csv'):
    """Load the training and development features together with their labels.

    Both files are read as ';'-separated CSVs. The "Label" column is encoded
    with `process_labels`; "Label" and "id" are then removed from the features.
    Only columns whose name contains one of the substrings in `substring_list`
    are kept. The dev frame is aligned to the kept training columns (same
    names, same order); a kept column missing from the dev file is filled
    with 0.

    Args:
        train_path: Path of the training CSV. Defaults to 'trainset_final.csv'.
        dev_path: Path of the development CSV. Defaults to 'devset_final.csv'.

    Returns:
        Tuple (train_df, dev_df, train_labels, dev_labels): the two feature
        frames and the two Series of integer-encoded labels.
    """
    train_raw = pd.read_csv(train_path, sep=";")
    dev_raw = pd.read_csv(dev_path, sep=";")

    # Encode the labels without modifying the frames that were just loaded.
    train_labels = train_raw["Label"].apply(process_labels)
    dev_labels = dev_raw["Label"].apply(process_labels)

    train_features = train_raw.drop(columns=["Label", "id"])
    dev_features = dev_raw.drop(columns=["Label", "id"])

    substring_list = ["Lemma", "Grammar_phrase", "Has", "POS_", "next_1", "previous_1", "next_2", "previous_2", "Is_", "Named_Entity"]

    columns_to_keep = [
        col for col in train_features.columns
        if any(substring in col for substring in substring_list)
    ]

    train_df = train_features[columns_to_keep]
    # A single reindex selects the kept columns in training order and creates
    # any column the dev file lacks, filled with 0.
    dev_df = dev_features.reindex(columns=columns_to_keep, fill_value=0)

    print(len(train_df.columns), len(dev_df.columns))

    return train_df, dev_df, train_labels, dev_labels





In [None]:
class ClassifierDataset(Dataset):
    """Dataset that pairs each entry of `X_data` with the entry of `y_data`
    at the same index. Its length is the length of `X_data`.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        target = self.y_data[index]
        return features, target


In [None]:
def return_class_weights(array):
    """Count how many items fall into each of the three classes.

    Despite the name, the result holds counts, not weights: index 0 counts the
    items equal to 0, index 1 the items equal to 1, and index 2 every other
    item.

    Returns:
        NumPy array with the three counts.
    """
    tally = [0, 0, 0]

    for label in array:
        bucket = 0 if label == 0 else (1 if label == 1 else 2)
        tally[bucket] += 1

    return np.asarray(tally)

In [None]:
# Load the feature tables and convert features and labels to NumPy arrays.
train_df, dev_df, train_labels, dev_labels = load_train_val_data()
X_train = train_df.to_numpy()
y_train = train_labels.to_numpy().astype('int64')
X_dev = dev_df.to_numpy()
y_dev = dev_labels.to_numpy().astype('int64')

train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
val_dataset = ClassifierDataset(torch.from_numpy(X_dev).float(), torch.from_numpy(y_dev).long())

# For the class imbalance: give every training sample the inverse of its
# class count as weight.
# The training targets already exist as one long tensor inside the dataset, so
# they are reused directly instead of iterating over the dataset item by item.
target_list = train_dataset.y_data

class_count = return_class_weights(y_train)
class_weights = 1./torch.tensor(class_count, dtype=torch.float)
class_weights_all = class_weights[target_list]


# List the feature columns that are fed to the model.
for col in train_df.columns:
    print(col)

81 81
Is_popular_token
Is_begin_token
Is_end_token
Is_early_token
Has_possible_prefix
Has_subfix
Has_correlating_synonym
Has_correct_antonym
Has_antonyms
Grammar_phrase_CLAUSE
Grammar_phrase_CLAUSE_2
Grammar_phrase_NP_2
Grammar_phrase_PP
Grammar_phrase_PP_2
Grammar_phrase_VP
Grammar_phrase_VP_2
POS_CD
POS_DT
POS_EX
POS_FW
POS_IN
POS_JJ
POS_JJR
POS_JJS
POS_MD
POS_NN
POS_NNP
POS_NNPS
POS_NNS
POS_PDT
POS_PRP
POS_PRP$
POS_RB
POS_RBR
POS_RBS
POS_RP
POS_TO
POS_UH
POS_VB
POS_VBD
POS_VBG
POS_VBN
POS_VBP
POS_VBZ
POS_WDT
POS_WP
POS_WP$
POS_WRB
Named_Entity_GPE
Named_Entity_GSP
Named_Entity_LOCATION
Named_Entity_ORGANIZATION
Named_Entity_PERSON
POS_CC
POS_LS
POS_SYM
Lemma_embedding_1
Lemma_embedding_2
Lemma_embedding_3
Lemma_embedding_4
Lemma_embedding_5
next_1_0
next_1_1
next_1_2
next_1_3
next_1_4
next_2_0
next_2_1
next_2_2
next_2_3
next_2_4
previous_1_0
previous_1_1
previous_1_2
previous_1_3
previous_1_4
previous_2_0
previous_2_1
previous_2_2
previous_2_3
previous_2_4


In [None]:
# Sample training items with replacement according to their per-sample weight,
# drawing as many items per epoch as there are weights.
weighted_sampler = WeightedRandomSampler(weights=class_weights_all, num_samples=len(class_weights_all), replacement=True)

# Training batches come from the weighted sampler; validation batches are read
# with the DataLoader defaults.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, sampler=weighted_sampler)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
# Hidden layer sizing heuristic: https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw#:~:text=The%20number%20of%20hidden%20neurons,size%20of%20the%20input%20layer.
# NOTE(review): an earlier note here mentioned alpha = 10 and a hidden size of 50,
# but the model class in this notebook uses hidden layers of 64 and 16 units — confirm which is intended.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# One input per feature column, three output classes.
model = MulticlassClassification(len(train_df.columns), 3)

criterion = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(),
                  lr=0.0005,  # learning rate
                  eps=1e-8  # Adam epsilon
                )

epochs = 200

# NOTE(review): this value is not used by the data loaders, which were created
# with their own batch sizes (32 for training, 16 for validation).
batch_size =16

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps.
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler: linear warm-up over the first fifth of
# the steps, then linear decay. The step counts are passed as integers.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=total_steps // 5,
                                            num_training_steps=total_steps)


In [None]:
# Move the model and the loss module to the selected device
# (the first CUDA GPU when available, otherwise the CPU).
model.to(device)
criterion.to(device)

CrossEntropyLoss()

In [None]:
def multi_acc(y_pred, y_test):
    """Accuracy of one batch as a percentage rounded to the nearest integer.

    Args:
        y_pred: Tensor of per-class scores with the classes along dim 1
            (assumed shape: (batch, num_classes)).
        y_test: Tensor of true class indices, one per sample.

    Returns:
        0-dim float tensor holding the rounded percentage of samples whose
        highest-scoring class equals the true class.
    """
    # log-softmax is monotonic, so the arg-max of the raw scores already
    # identifies the predicted class; no normalisation is needed.
    y_pred_tags = torch.argmax(y_pred, dim=1)

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)

    return torch.round(acc * 100)

# Per-epoch history of accuracy and loss, one list per split.
accuracy_stats = {split: [] for split in ('train', 'val')}

loss_stats = {split: [] for split in ('train', 'val')}


In [None]:

for e in tqdm(range(1, epochs+1)):

    # Predictions and true labels collected over the epoch for the macro-F1 scores.
    train_predictions, true_train_values = [], []
    val_predictions, true_val_values = [], []

    # ---------------- TRAINING ----------------
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()

    for X_train_batch, y_train_batch in train_loader:
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)
        optimizer.zero_grad(set_to_none=True)

        y_train_pred = model(X_train_batch)

        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)

        train_loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

        train_predictions.extend(y_train_pred.argmax(1).detach().cpu().numpy().astype(int))
        true_train_values.extend(y_train_batch.detach().cpu().numpy().astype(int))

    # ---------------- VALIDATION ----------------
    val_epoch_loss = 0
    val_epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_val_pred = model(X_val_batch)

            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

            val_predictions.extend(y_val_pred.argmax(1).detach().cpu().numpy().astype(int))
            true_val_values.extend(y_val_batch.detach().cpu().numpy().astype(int))

    # ---------------- EPOCH SUMMARY ----------------
    # Loss and accuracy are averaged over the number of batches in each loader.
    mean_train_loss = train_epoch_loss / len(train_loader)
    mean_val_loss = val_epoch_loss / len(val_loader)
    mean_train_acc = train_epoch_acc / len(train_loader)
    mean_val_acc = val_epoch_acc / len(val_loader)

    loss_stats['train'].append(mean_train_loss)
    loss_stats['val'].append(mean_val_loss)
    accuracy_stats['train'].append(mean_train_acc)
    accuracy_stats['val'].append(mean_val_acc)

    print("---------------------------------------------")
    print(
        f'Epoch {e:03}: | Train Loss: {mean_train_loss:.5f} | Val Loss: {mean_val_loss:.5f}'
        f'    | Train Acc: {mean_train_acc:.3f}| Val Acc: {mean_val_acc:.3f}'
    )

    print("Training F1: ", f1_score(y_true=true_train_values, y_pred=train_predictions, average='macro'))
    print("Validation F1: ", f1_score(y_true=true_val_values, y_pred=val_predictions, average='macro'))

    print("---------------------------------------------")

  0%|          | 0/200 [00:00<?, ?it/s]

---------------------------------------------
Epoch 001: | Train Loss: 1.08238 | Val Loss: 0.82463    | Train Acc: 40.127| Val Acc: 84.958
Training F1:  0.3957856378566739
Validation F1:  0.3342766264500983
---------------------------------------------
---------------------------------------------
Epoch 002: | Train Loss: 0.77274 | Val Loss: 0.72369    | Train Acc: 72.367| Val Acc: 68.185
Training F1:  0.7223847789319239
Validation F1:  0.2933526391917484
---------------------------------------------
---------------------------------------------
Epoch 003: | Train Loss: 0.48596 | Val Loss: 0.72070    | Train Acc: 86.566| Val Acc: 62.664
Training F1:  0.8636879778955638
Validation F1:  0.2791578061368848
---------------------------------------------
---------------------------------------------
Epoch 004: | Train Loss: 0.30179 | Val Loss: 0.77637    | Train Acc: 91.177| Val Acc: 64.190
Training F1:  0.909385271360743
Validation F1:  0.28344166610803256
----------------------------------

KeyboardInterrupt: ignored

In [None]:
def load_test_data(train_df, test_path='testset_final.csv'):
    """Load the test features, aligned to the training columns, and the labels.

    The file is read as a ';'-separated CSV. The "Label" column is encoded with
    `process_labels`; "Label" and "id" are removed from the features. The
    remaining columns are then aligned to `train_df.columns`: same names in the
    same order, test-only columns dropped, and training columns that the test
    file lacks filled with 0.

    Keeping the training column order matters because the frame is later
    converted to a plain array, where a feature is identified only by its
    position.

    Args:
        train_df: Training feature frame whose columns define the layout.
        test_path: Path of the test CSV. Defaults to 'testset_final.csv'.

    Returns:
        Tuple (test_df, test_labels): the aligned feature frame and the Series
        of integer-encoded labels.
    """
    test_raw = pd.read_csv(test_path, sep=";")

    test_labels = test_raw["Label"].apply(process_labels)

    test_features = test_raw.drop(columns=["Label", "id"])

    # One reindex does the selection, the ordering and the zero-filling, and
    # returns a new frame instead of assigning into a slice.
    test_df = test_features.reindex(columns=train_df.columns, fill_value=0)

    print(len(test_df.columns), len(train_df.columns))

    return test_df, test_labels

# Load the test features (restricted to the training feature columns) and the encoded test labels.
test_df, test_labels = load_test_data(train_df)


In [None]:
# Convert the test split to arrays, then wrap it in a dataset and a loader.
X_test, y_test = test_df.to_numpy(), test_labels.to_numpy().astype('int64')

test_features_tensor = torch.from_numpy(X_test).float()
test_targets_tensor = torch.from_numpy(y_test).long()
test_dataset = ClassifierDataset(test_features_tensor, test_targets_tensor)

test_loader = DataLoader(dataset=test_dataset, batch_size=16)

# Collects the predicted class of every test sample.
test_predictions = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of the predictions to the ones already collected.
test_predictions = []

model.eval()
with torch.no_grad():
    # The labels in each batch are not needed for prediction, so only the
    # features are moved to the device.
    for X_test_batch, y_test_batch in test_loader:
        X_test_batch = X_test_batch.to(device)

        y_test_pred = model(X_test_batch)

        # Predicted class = index of the highest score for each sample.
        y_pred = np.asarray(y_test_pred.argmax(1).cpu().detach(), dtype=int)
        test_predictions.extend(y_pred)


In [None]:
# Per-class metrics on the test set, shown as a table with one row per class/average.
report_dict = classification_report(y_true=test_labels, y_pred=test_predictions, output_dict=True)
clsf_report = pd.DataFrame(report_dict).transpose()
clsf_report

In [None]:
def revert_labels(label):
    """Decode an integer class id back into its negation tag.

    0 -> "O", 1 -> "B-NEG", 2 -> "I-NEG". Any other value is printed, followed
    by the line "Error", and the integer 0 is returned.

    NOTE(review): the fallback returns the integer 0 rather than a tag string —
    confirm whether that is intended.
    """
    for class_id, tag in enumerate(("O", "B-NEG", "I-NEG")):
        if label == class_id:
            return tag

    # Unknown class id: report it and return the fallback value.
    print(label)
    print("Error")
    return 0

# Pair every true label with its prediction and turn both back into tag strings.
conf_df = pd.DataFrame(list(zip(test_labels, test_predictions)), columns=["Label", "Predictions"])
for column in ("Label", "Predictions"):
    conf_df[column] = conf_df[column].apply(revert_labels)

# Cross-tabulate actual against predicted tags and save the heatmap as a PDF.
confusion_matrix = pd.crosstab(
    conf_df['Label'],
    conf_df['Predictions'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
sn.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='g')
plt.suptitle('Neural Networks')
plt.savefig('Neural Networks.pdf')

In [None]:
# !pip install skorch

In [None]:
# from skorch import NeuralNetClassifier
# from sklearn.model_selection import GridSearchCV

# net = NeuralNetClassifier(model
#                          , max_epochs=100
#                          , lr=0.001
#                          , verbose=1
#                          , criterion = criterion)

# params = {
#     'lr': [0.001,0.005, 0.01, 0.05],
#     'max_epochs': list(range(10,100, 20))}

# gs = GridSearchCV(net, params, refit=False, scoring='f1_micro', verbose=1, cv=5)
# y_train = y_train.reshape(-1, 1).squeeze()
# gs.fit(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())

In [None]:
# print(gs.best_params_)
# lr = 0.01
# epochs = 90

In [None]:
# from sklearn.metrics import classification_report
# classifier_NN = gs.best_estimator_
# y_dev = y_dev.reshape(-1, 1).squeeze()
# predictions_NN = classifier_NN.predict(torch.from_numpy(X_dev).float())
# clsf_report = pd.DataFrame(classification_report(y_true=y_dev, y_pred=predictions_NN, output_dict = True)).transpose()
# clsf_report