In [None]:
from collections import defaultdict, Counter
from sklearn.metrics import confusion_matrix
import seaborn as sns
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
import sys
import logging
import os
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)


In [None]:
# Helper functions  - Evaluating the model

def save_confusion_matrix(y_true, y_pred, output_dir, file_name):
    """
    Render a row-normalised confusion matrix as a heatmap and save it as
    ``<output_dir>/<file_name>.png``.

    Arguments:
        y_true - gold label ids
        y_pred - predicted label ids
        output_dir - directory the PNG is written to
        file_name - file name without the ".png" extension

    NOTE(review): assumes label ids are the integers 0..6 in the same
    order as the tick labels below - confirm against the data.
    """
    labels = [
        'None', 'KpEverTgthr', 'GetStudRelate', 'Restat',
        'Revoic', 'PrsAcc', 'PrsRsn'
    ]
    # Pin the label set so the matrix is always len(labels) x len(labels),
    # even when a class is missing from both y_true and y_pred; otherwise
    # the tick labels would no longer line up with the matrix rows.
    conf_matrix = confusion_matrix(
        y_true, y_pred, labels=list(range(len(labels)))
    )
    row_totals = conf_matrix.sum(axis=1)[:, None]
    # Rows without any gold example stay at 0 instead of turning into NaN.
    normalised = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    )
    axes = sns.heatmap(
        normalised,
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels,
        annot=True,
    )
    figure = axes.get_figure()
    figure.savefig(os.path.join(output_dir, file_name + ".png"), dpi=600)
    # Clear the figure so a later heatmap is not drawn on top of this one.
    figure.clear()


def custom_micro_f1_score(true_labels, pred_labels, label_count=7):
    """
    Micro-averaged precision, recall and F1 computed while ignoring
    everything associated with the label None (0).

    Arguments:
        true_labels - gold label ids
        pred_labels - predicted label ids
        label_count - not used by this function

    Returns:
        (micro_precision, micro_recall, micro_f1)

    This code still has to undergo some testing
    """
    none_label = 0

    # How many gold / predicted items carry a real (non-None) label.
    ref_count = len([gold for gold in true_labels if gold != none_label])
    pred_count = len([guess for guess in pred_labels if guess != none_label])

    # Correct predictions, counted only where the gold label is not None.
    match_count = 0
    for gold, guess in zip(true_labels, pred_labels):
        if gold == guess and gold != none_label:
            match_count += 1

    if pred_count != 0:
        micro_precision = match_count / pred_count
    else:
        micro_precision = 0

    if ref_count != 0:
        micro_recall = match_count / ref_count
    else:
        micro_recall = 0

    # Harmonic mean; defined as 0 when either component is 0.
    if micro_precision == 0 or micro_recall == 0:
        micro_f1 = 0
    else:
        micro_f1 = 2/(1/micro_precision + 1/micro_recall)

    return micro_precision, micro_recall, micro_f1


def custom_macro_f1_score(true_labels, pred_labels, label_count=7):
    """
    This computes a custom macro f1 score ignoring
    in its entirety, the predictions associated
    with the label None (0)

    Arguments:
        true_labels - gold label ids
        pred_labels - predicted label ids
        label_count - total number of labels including None; the macro
            averages are divided by (label_count - 1)

    Returns:
        macro_f1 - harmonic mean of the macro precision and macro recall
        stats - per-label counters with the keys 'tp', 'tp+fn', 'tp+fp'
            and, where non-zero denominators allow, 'precision',
            'recall' and 'f1'

    This code still has to undergo some testing
    """

    none_label = 0

    stats = defaultdict(Counter)

    for true_label, pred_label in zip(true_labels, pred_labels):
        stats[true_label]['tp+fn'] += 1
        if true_label == pred_label:
            stats[true_label]['tp'] += 1
        stats[pred_label]['tp+fp'] += 1

    # Visit every label seen in either sequence. Missing Counter keys read
    # as 0, so labels without a computable value contribute 0 to the sums.
    for label_stats in stats.values():
        if label_stats['tp+fp'] != 0:
            label_stats['precision'] = \
                label_stats['tp']/label_stats['tp+fp']
        if label_stats['tp+fn'] != 0:
            label_stats['recall'] = label_stats['tp']/label_stats['tp+fn']
        if label_stats['precision'] != 0 and label_stats['recall'] != 0:
            label_stats['f1'] = \
                2.0/(1/label_stats['precision'] + 1/label_stats['recall'])

    macro_precision = sum(
        [stats[lbl]['precision'] for lbl in stats.keys() if lbl != none_label]
    )/(label_count - 1)

    macro_recall = sum(
        [stats[lbl]['recall'] for lbl in stats.keys() if lbl != none_label]
    )/(label_count - 1)

    macro_f1 = 2/(1/macro_precision + 1/macro_recall) \
        if macro_precision != 0 and macro_recall != 0 else 0

    return macro_f1, stats


class TalkBackDataset(torch.utils.data.Dataset):
    """
    Torch dataset that tokenizes (text_a, text_b) sentence pairs once, up
    front, and serves one encoded example plus its label per index.

    Arguments:
        data - Pandas dataframe with the columns text_a, text_b and labels
        tokenizer - object exposing ``batch_encode_plus``
        max_seq_length - every pair is padded / truncated to this length
    """

    def __init__(
        self,
        data,
        tokenizer,
        max_seq_length
    ):

        input_examples = [
            (self._normalise(text_a), self._normalise(text_b))
            for text_a, text_b in zip(data['text_a'], data['text_b'])
        ]
        self.examples = tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=input_examples,
            max_length=max_seq_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.labels = [
            torch.tensor(int(label), dtype=torch.long)
            for label in data['labels']
        ]

    @staticmethod
    def _normalise(text):
        """
        Lower-case one cell; any missing value becomes ''.

        pd.isna covers None and every NaN object, whereas an identity
        check against np.nan only matches that one singleton.
        """
        return '' if pd.isna(text) else str(text).lower()

    def __len__(self):
        return len(self.examples['input_ids'])

    def __getitem__(self, index):
        """
        Return a dict holding the index-th row of every tokenizer output
        plus the matching label under the key 'label'.
        """
        answer = {key: self.examples[key][index] for key in self.examples}
        answer['label'] = self.labels[index]

        return answer

In [None]:
# Free memory left over from earlier runs in this kernel before training.
# Ask PyTorch to release its cached GPU memory.
torch.cuda.empty_cache() 
# NOTE(review): gc, keras and wandb are imported here rather than in the
# import cell at the top of the notebook.
import gc
# Run a full Python garbage-collection pass.
gc.collect() 
from keras import backend as K
# Reset Keras' global state.
# NOTE(review): no Keras model is built in the visible cells - confirm
# this import is still needed.
K.clear_session()

import wandb
# Authenticate with Weights & Biases (may prompt for an API key).
wandb.login()

In [None]:
def fetch_train_valid_testsets(tokenizer, max_seq_length=256, seed=1010,
                               data_dir='../data'):
    '''
    Load the train / valid / test TSV files and wrap each one in a
    TalkBackDataset.

    Each TSV contains three columns:
        text_a (previous student sentence);
        text_b (teacher sentence);
        labels (category or TalkMove label)

    Arguments:
        tokenizer - tokenizer handed to TalkBackDataset
        max_seq_length - Maximum sequence length
        seed - Random seed (kept for interface compatibility; the splits
            are read from fixed files, so it is not used)
        data_dir - Folder holding train_teacher.tsv, valid_teacher.tsv and
            test_teacher.tsv

    Returns:
        train_df - Training set (TalkBackDataset)
        valid_df - Validation set (TalkBackDataset)
        test_df - Testing set (TalkBackDataset)
    '''
    def load_split(split_name):
        # Read one split, blank out missing cells, then tokenize it.
        frame = pd.read_csv(
            os.path.join(data_dir, split_name + '_teacher.tsv'), sep='\t'
        )
        frame = frame.replace(np.nan, '', regex=True)
        return TalkBackDataset(frame, tokenizer, max_seq_length)

    train_df = load_split('train')
    valid_df = load_split('valid')
    test_df = load_split('test')

    return train_df, valid_df, test_df


def compute_metrics(pred):
    '''
    Arguments:
        pred - prediction object exposing `label_ids` (gold label ids) and
            `predictions` (the logits, or a tuple whose first element is
            the logits)

    Returns:
        dict with the accuracy, Micro F1, Macro F1 and MCC scores
    '''
    true_labels = pred.label_ids
    logits = pred.predictions
    # A model created with output_hidden_states=True returns extra outputs,
    # so predictions arrives as a tuple with the logits in first position.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_labels = logits.argmax(-1)
    _, _, mif1 = custom_micro_f1_score(true_labels, pred_labels)
    maf1, _ = custom_macro_f1_score(true_labels, pred_labels)
    acc = accuracy_score(true_labels, pred_labels)
    matthew_corr = matthews_corrcoef(true_labels, pred_labels)

    return {
        'accuracy': acc,
        'micro_f1': mif1,
        'macro_f1': maf1,
        'matthew_corr_coeff': matthew_corr
    }


def test_compute_metrics(pred, model_store):
    '''
    Print the confusion matrix, the classification report and the MCC for
    a set of predictions, and return the custom micro / macro F1 scores.

    Arguments:
        pred - predicted label ids
        model_store - gold label ids
            NOTE(review): the parameter name suggests an output folder,
            but every visible call passes (pred_labels, true_labels);
            the name is kept so existing calls keep working.

    Returns:
        (micro_f1, macro_f1)
    '''
    # Work from the arguments rather than from module-level variables, so
    # the result depends only on what the caller passes in.
    pred_labels = pred
    true_labels = model_store

    _, _, mif1 = custom_micro_f1_score(true_labels, pred_labels)
    maf1, _ = custom_macro_f1_score(true_labels, pred_labels)

    print(confusion_matrix(true_labels, pred_labels))
    # To also store a heatmap, call
    # save_confusion_matrix(true_labels, pred_labels, <output_dir>, "unseen")
    print(classification_report(true_labels, pred_labels))
    print(matthews_corrcoef(true_labels, pred_labels))
    return mif1, maf1


# Folder where the model outputs are stored
# (a command line argument in the script version of this code)
folder_name = '../../../baseline_public/'

# Random seed (a command line argument in the script version of this code)
seed = 1022
# The trailing "" keeps the trailing separator the path has always had.
model_store = os.path.join(folder_name, "model_storage", "")

# Create the model storage folder if it does not exist yet; exist_ok avoids
# failing when the folder appears between a check and the creation.
os.makedirs(model_store, exist_ok=True)

# Log the process to help with debugging. No file handler is configured,
# so INFO-level records go to the default stream handler, not to a folder.
logging.basicConfig(level=logging.INFO)
# NOTE(review): this creates a logger named "transformers-<seed>", which
# is a different logger from the library's own "transformers" - confirm
# that this is intended.
transformers_logger = logging.getLogger("transformers-" + str(seed))
transformers_logger.setLevel(logging.INFO)

# Arguments for model training, including:
#     output_dir - Stores the checkpoints as well as the best model
#     learning_rate - Learning rate
#     num_train_epochs - Number of training epochs (tuning)
#     per_device_train_batch_size - Training batch size
#     per_device_eval_batch_size - Validation batch size
#     warmup_steps - Number of warmup steps
#         Warmup steps are a few updates with a low learning rate at the
#         beginning of training. After the warmup, the regular learning
#         rate (schedule) is used to train the model to convergence.
#         Intuitively this helps the network adapt slowly to the data.
#         Theoretically, the main reason for warmup is to let adaptive
#         optimisers (e.g. Adam, RMSProp, ...) compute correct statistics
#         of the gradients, so a warmup period makes little sense when
#         training with plain SGD.
#     overwrite_output_dir - Overwrite the output directory
#     fp16 - 16-bit floating point precision, to speed up the process
#     seed - Random seed
#     save_steps - Save a checkpoint to the output folder every N steps
#     evaluation_strategy - Evaluation is done at the end of each epoch
#     logging_dir - Folder to save the logs
#     eval_accumulation_steps - Prediction steps accumulated before the
#         outputs are moved off the device
training_args = TrainingArguments(
    output_dir=model_store,
    learning_rate=2e-5,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=1000,
    overwrite_output_dir=True,
    fp16=False,
    seed=seed,
    save_steps=30000,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    eval_accumulation_steps=1
)


# Checkpoint shared by the classifier and its tokenizer
checkpoint_name = 'google/electra-small-discriminator'

# Pretrained model to download and use: 7 output labels, with the hidden
# states returned next to the logits
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=7,
    output_hidden_states=True,
)

# Tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Load the three splits and report their sizes (train, valid, test)
train_df, valid_df, test_df = fetch_train_valid_testsets(tokenizer)
print(*map(len, (train_df, valid_df, test_df)))

# Set up the trainer; no metric callback is attached, so evaluation
# reports only what the Trainer computes on its own
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=valid_df,
    compute_metrics=None,
)

In [None]:
# Train the model on train_df, then evaluate it once on valid_df.
# evaluate() is the last expression of the cell, so the notebook displays
# the metrics it returns.
trainer.train()
trainer.evaluate()

In [None]:
# Persist the fine-tuned model with save_pretrained.
# NOTE(review): this path goes up two directory levels, while folder_name
# goes up three - confirm which location is intended.
# NOTE(review): the name says "electra_base" but the checkpoint loaded
# above is electra-small, and the ".pth" target is presumably created as
# a directory rather than a single file - confirm.
model.save_pretrained('../../baseline_public/'+'teacher_electra_base.pth')

In [None]:
# Predict on the test set in 20 chunks and pool the label ids.
val_split = np.array_split(test_df, 20)
pred_labels = []
true_labels = []
for chunk in val_split:
    val = trainer.predict(test_dataset=chunk)
    true_labels += list(val.label_ids)
    # predictions[0] holds the logits; argmax gives the predicted label id
    pred_labels += list(val.predictions[0].argmax(-1))
mif1, maf1 = test_compute_metrics(pred_labels, true_labels)
print(mif1, maf1)

In [None]:
import torch.nn as nn

# Reload a fine-tuned checkpoint from local files only (no download).
model = AutoModelForSequenceClassification.from_pretrained('../../hf_teacher/'+'teacher_roberta_base.pth', local_files_only=True)
# NOTE(review): this wrapped model is later handed to Trainer - confirm
# that Trainer accepts a model already wrapped in DataParallel.
model = nn.DataParallel(model)
# Use the GPU when one is visible; fall back to the CPU instead of raising
# on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# Second training configuration. Compared with the first one it uses a
# higher learning rate (3e-5), fewer epochs (3), larger batches (8) and
# fewer warmup steps (100); the remaining arguments are identical.
# Relies on `model_store` and `seed` from the setup cell.
training_args = TrainingArguments(
    output_dir=model_store,
    learning_rate=3e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    overwrite_output_dir=True,
    fp16=False,
    seed=seed,
    save_steps=30000,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    eval_accumulation_steps=1
)


# Select the tokenizer to use (replaces the ELECTRA tokenizer)
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Fetch the data again so it is tokenized with the new tokenizer
train_df, valid_df, test_df = fetch_train_valid_testsets(tokenizer)
# Test set cut into 3 chunks
val_split = np.array_split(test_df, 3)

# Set up the DNN model trainer.
# NOTE(review): `model` is whatever the name was last bound to in the
# kernel - presumably the checkpoint reloaded in the previous cell; confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=valid_df,
    compute_metrics=None
)

In [None]:
'''
Getting the embeddings of size - 768
Get the one-hot encoding vector - 7

'''
# h5py is used below but is not imported in the notebook's import cell.
import h5py

# NOTE(review): `embed_df` is not defined in any visible cell - it must
# exist in the kernel before this cell is run.
df_split = np.array_split(embed_df, 10)
for i, item in enumerate(df_split, start=1):
    temp = trainer.predict(test_dataset=item)
    # predictions[1] presumably holds the per-layer hidden states, with
    # index 12 being the last layer of a 12-layer encoder - TODO confirm
    embeddings = temp.predictions[1][12]
    # One HDF5 file per chunk: ../data/<i>_embeddings.h5
    with h5py.File('../data/'+str(i)+'_embeddings.h5', 'w') as hf:
        hf.create_dataset("raw_transcripts",  data=embeddings)
    print(embeddings.shape)

# Reference snippet for one-hot encoding, kept as a plain string:
'''
import numpy as np

target = np.array(['dog', 'dog', 'cat', 'cat', 'cat', 'dog', 'dog', 
    'cat', 'cat', 'hamster', 'hamster'])

def one_hot(array):
    unique, inverse = np.unique(array, return_inverse=True)
    onehot = np.eye(unique.shape[0])[inverse]
    return onehot

print(one_hot(target))
'''

In [None]:
# Train the new model

class Feedforward(torch.nn.Module):
    """
    Two-layer feed-forward classifier: Linear -> ReLU -> Linear, followed
    by a log-softmax over the class dimension.

    Arguments:
        input_size - number of input features
        hidden_size - width of the hidden layer
        output_size - number of classes (defaults to 7)
    """

    def __init__(self, input_size, hidden_size, output_size=7):
        super(Feedforward, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x):
        """
        Map a (batch, input_size) tensor to (batch, output_size)
        log-probabilities.
        """
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        # Call through the torch namespace: the alias `F` is not imported
        # anywhere in the notebook.
        output = torch.nn.functional.log_softmax(output, dim=1)
        return output

In [None]:
# Classifier with 810 input features and 1620 hidden units.
# NOTE(review): this rebinds `model`, which until now referred to the
# transformer model; the input width 810 is not derived in any visible
# cell - confirm it matches the feature vectors.
model = Feedforward(810, 1620)
# NOTE(review): Feedforward.forward returns log_softmax outputs (values
# <= 0), whereas BCELoss expects probabilities in [0, 1]. NLLLoss is the
# usual partner of log_softmax - confirm against the format of y_train.
criterion = torch.nn.BCELoss()
# Plain SGD with a fixed learning rate of 0.01
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [None]:
# Full-batch training: every epoch runs one forward/backward pass over
# the whole of x_train.
model.train()
num_epochs = 20
progress_message = 'Epoch {}: train loss: {}'
for epoch in range(num_epochs):
    # Start from clean gradients
    optimizer.zero_grad()
    # Forward pass and loss
    y_pred = model(x_train)
    loss = criterion(y_pred.squeeze(), y_train)

    print(progress_message.format(epoch, loss.item()))
    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

In [None]:
# Predict on the test set in 20 chunks and pool the label ids.
val_split = np.array_split(test_df, 20)
pred_labels = []
true_labels = []
for item in val_split:
    val = trainer.predict(test_dataset=item)
    true_labels.extend(val.label_ids)
    # predictions[0] holds the logits; argmax gives the predicted label id
    pred_labels.extend(val.predictions[0].argmax(-1))
# Bind the result to `mif1`, the name that is printed below, so the cell
# does not report a stale value from an earlier cell.
mif1, maf1 = test_compute_metrics(pred_labels, true_labels)
print(mif1, maf1)