In [None]:
'''
Citation:

[1] “Bert,” BERT. [Online]. Available: https://huggingface.co/docs/transformers/model_doc/bert.
[2] “PyTorch,” torch. [Online]. Available: https://pytorch.org/docs/stable/torch.html.
[3] “Pandas,” pandas. [Online]. Available: https://pandas.pydata.org/.
[4] “SKLearn,” scikit-learn. [Online]. Available: https://scikit-learn.org/stable/.
[5] “Roberta,” RoBERTa. [Online]. Available: https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/roberta#overview.
[6] V. PRASANNA KUMAR and T. Patro, “Bert model with 0.845 accuracy,” Kaggle, 23-Aug-2020. [Online]. Available: https://www.kaggle.com/code/vpkprasanna/bert-model-with-0-845-accuracy/notebook.

'''

In [None]:
# !pip install transformers

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import time

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Pick the compute device: use a CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if dev.type == "cuda":
    # Report the name of GPU 0.
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Load the train and test CSVs from the local `data/` folder; the first CSV column becomes the index.
# Google Drive (Colab) equivalents, kept for reference:
# train_df = pd.read_csv('/content/drive/MyDrive/Univ-Project/Train.csv',index_col=0)
# test_df = pd.read_csv('/content/drive/MyDrive/Univ-Project/Test.csv',index_col=0)
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/Train.csv', 'data/Test.csv')
)

In [None]:
# Maximum token length used when encoding texts for the model.
MAX_LEN = 500

# Model inputs are the raw CONTEXT strings; targets are every column from position 7 onward
# (presumably one binary indicator per label — verify against the CSV schema).
X = train_df['CONTEXT'].values
y = train_df.iloc[:, 7:].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
)

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Load the pretrained "roberta-base" tokenizer; RoBERT_Preprocess uses it to turn raw text into
# the input ids and attention masks fed to the model.
# NOTE(review): `do_lower_case` is a BERT-style option and RoBERTa's vocabulary is cased —
# presumably this flag has no effect here; confirm before relying on any lower-casing.

TOK = RobertaTokenizer.from_pretrained("roberta-base",do_lower_case=True)

# Tokenizing texts.
def RoBERT_Preprocess(data, tokenizer=None, max_len=None):
    """Encode an iterable of texts into fixed-length model inputs.

    Every text is encoded with special tokens, truncated to at most `max_len`
    tokens and padded up to `max_len`, so all rows have the same length and can
    be stacked into a single tensor.

    Args:
        data: Iterable of strings to encode.
        tokenizer: Object exposing `encode_plus`; defaults to the notebook-level `TOK`.
        max_len: Fixed sequence length; defaults to the notebook-level `MAX_LEN`.

    Returns:
        Tuple `(inp_ids, a_m)`: a tensor of token ids and a tensor of attention
        masks, one row per input text.
    """
    tokenizer = TOK if tokenizer is None else tokenizer
    max_len = MAX_LEN if max_len is None else max_len

    inp_ids = []
    a_m = []
    for sent in data:
        # `padding='max_length'` + `truncation=True` is the explicit replacement for the
        # deprecated `pad_to_max_length=True`, which relied on implicit truncation.
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        # Adding the outputs to lists
        inp_ids.append(encoded_sent.get('input_ids'))
        a_m.append(encoded_sent.get('attention_mask'))

    # Converting the lists to tensors
    inp_ids = torch.tensor(inp_ids)
    a_m = torch.tensor(a_m)

    return inp_ids, a_m

In [None]:
# Encode the training and validation texts into RoBERTa input ids and attention masks.
print('Tokenizing data...')
(train_inp, train_m), (val_inp, val_masks) = map(RoBERT_Preprocess, (X_train, X_val))

In [None]:
# Mini-batch size used by the loaders built in this notebook.
batch_size = 16

# Training loader: ids, masks and labels bundled per example, drawn in random order.
train_data = TensorDataset(train_inp, train_m, torch.tensor(y_train))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: same bundling, but iterated in the original order.
val_data = TensorDataset(val_inp, val_masks, torch.tensor(y_val))
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    dataset=val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

In [None]:
class Classifier_fn(nn.Module):
    """Pretrained "roberta-base" encoder followed by a small feed-forward classification head.

    The head maps the encoder representation of the first sequence position to one
    logit per label. `forward` returns raw logits; no sigmoid is applied inside the model.
    """

    def __init__(self, freeze_bert=False, num_labels=None):
        """Build the encoder and the classification head.

        Args:
            freeze_bert: When True, the encoder weights are excluded from gradient
                updates so only the head is trained.
            num_labels: Number of output logits. Defaults to the width of the
                notebook-level label matrix `y`, which was previously the only option.
        """
        super().__init__()
        if num_labels is None:
            # Backward-compatible default: one output per label column of the global `y`.
            num_labels = len(y[0])

        self.bert = RobertaModel.from_pretrained("roberta-base")

        # Read the encoder width from its config (768 for roberta-base) rather than hard-coding it.
        D_in, H, D_out = self.bert.config.hidden_size, 30, num_labels

        self.classifier = nn.Sequential(
                            nn.Linear(D_in, H),
                            nn.ReLU(),
                            nn.Linear(H, D_out))
        # Not used by `forward`; kept so existing code that accesses `.sigmoid` keeps working.
        self.sigmoid = nn.Sigmoid()
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, inp_ids, a_m):
        """Return one raw logit per label for each sequence in the batch.

        Args:
            inp_ids: Token-id tensor passed to the encoder as `input_ids`.
            a_m: Attention-mask tensor passed to the encoder as `attention_mask`.
        """
        outputs = self.bert(input_ids=inp_ids,
                            attention_mask=a_m)
        # First encoder output, sequence position 0 (presumably the leading `<s>` token).
        last_hidden_state_cls = outputs[0][:, 0, :]
        logit = self.classifier(last_hidden_state_cls)
        return logit

In [None]:
# Shared loss used by `train` and `val_test_eval`: binary cross-entropy computed directly on raw logits.
BCE_loss = nn.BCEWithLogitsLoss()

def train(optimizer,scheduler,model, train_dataloader, val_dataloader, epochs, evaluation,t_l,v_a,v_l,time_elap):
    """Run the training loop for `epochs` passes and print a progress table.

    Per batch: tensors are moved to `dev`, the loss from `BCE_loss` is back-propagated,
    gradients are clipped to a norm of 1.0, then the optimizer and scheduler each take a step.

    Args:
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        model: Module called as `model(input_ids, attention_mask)`.
        train_dataloader: Yields `(input_ids, attention_mask, labels)` batches.
        val_dataloader: Passed to `val_test_eval` after each epoch when evaluating.
        epochs: Number of passes over `train_dataloader`.
        evaluation: When equal to True, validate after every epoch and record history.
        t_l, v_a, v_l, time_elap: Lists that receive, per evaluated epoch, the average
            train loss, validation accuracy, validation loss and elapsed seconds.

    Returns:
        The same `model` object that was passed in.
    """
    header = f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^10} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
    rule = "-"*70

    for epoch_i in range(epochs):
        print(header)
        print(rule)

        # Timers for the whole epoch and for the current reporting window.
        epoch_start = time.time()
        window_start = time.time()

        # Loss accumulators: whole epoch vs. since the last progress line.
        epoch_loss = 0
        window_loss = 0
        window_batches = 0

        model.train()
        final_step = len(train_dataloader) - 1

        for step, batch in enumerate(train_dataloader):
            window_batches += 1
            b_inp_ids, b_attn_mask, b_labels = [tensor.to(dev) for tensor in batch]

            model.zero_grad()

            logits = model(b_inp_ids, b_attn_mask)
            loss = BCE_loss(logits, b_labels.float())
            step_loss = loss.item()
            window_loss += step_loss
            epoch_loss += step_loss

            # Backward pass, gradient clipping, then parameter and learning-rate updates.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Report every 50000 steps (never at step 0) and always on the last batch.
            periodic_report = step != 0 and step % 50000 == 0
            if periodic_report or step == final_step:
                time_elapsed = time.time() - window_start
                print(f"{epoch_i + 1:^7} | {step:^7} | {window_loss / window_batches:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Start a fresh reporting window.
                window_loss, window_batches = 0, 0
                window_start = time.time()

        # Mean of the per-batch losses over the epoch.
        avg_train_loss = epoch_loss / len(train_dataloader)

        print(rule)
        if evaluation == True:
            val_loss, val_accuracy = val_test_eval(model, val_dataloader)
            time_elapsed = time.time() - epoch_start
            time_elap.append(time_elapsed)

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

            # Record this epoch in the caller-supplied history lists.
            t_l.append(avg_train_loss)
            v_a.append(val_accuracy)
            v_l.append(val_loss)

            print(rule)
        print("\n")
    return model


def val_test_eval(model, val_dataloader):
    """Evaluate `model` on a dataloader and return its mean loss and mean accuracy.

    The model is switched to eval mode and gradients are disabled for the forward
    pass. Loss comes from `BCE_loss`; accuracy from `acc_with_thresh` with its
    default threshold. Both results are unweighted means of the per-batch values.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        val_dataloader: Yields `(input_ids, attention_mask, labels)` batches.

    Returns:
        Tuple `(val_loss, val_accuracy)`.
    """
    model.eval()
    val_accuracy = []
    val_loss = []
    for batch in val_dataloader:
        # Loading the batch to the selected device
        b_inp_ids, b_attn_mask, b_labels = tuple(t.to(dev) for t in batch)

        # Computing logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_inp_ids, b_attn_mask)
            loss = BCE_loss(logits, b_labels.float())
        val_loss.append(loss.item())

        # Take the label count from the tensors themselves instead of the notebook
        # global `y`, so the function works on any dataset the model can score.
        accuracy = acc_with_thresh(
            logits.view(-1, logits.shape[-1]),
            b_labels.view(-1, b_labels.shape[-1]),
        )

        val_accuracy.append(accuracy)
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

def acc_with_thresh(y_pred, y_true, thresh:float=0.5, sigmoid:bool=True):
    """Return the fraction of individual label decisions that match `y_true`.

    Args:
        y_pred: Tensor of scores; treated as logits when `sigmoid` is True.
        y_true: Tensor of target labels, compared after conversion with `.byte()`.
        thresh: A score strictly above this value counts as a positive prediction.
        sigmoid: Whether to apply a sigmoid to `y_pred` before thresholding.

    Returns:
        Mean element-wise agreement as a Python float.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    matches = (scores > thresh) == y_true.byte()
    return matches.float().mean().item()

In [None]:
def predict_fn(model, test_dataloader):
    """Score every batch of `test_dataloader` and return sigmoid probabilities.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        test_dataloader: Yields batches whose first two tensors are the input ids
            and attention masks; any further tensors are ignored.

    Returns:
        NumPy array of probabilities, with batches concatenated along axis 0.
    """
    model.eval()

    batch_logits = []
    for batch in test_dataloader:
        on_device = [tensor.to(dev) for tensor in batch]
        b_inp_ids, b_attn_mask = on_device[:2]

        # Forward pass only; no gradients are needed for inference.
        with torch.no_grad():
            batch_logits.append(model(b_inp_ids, b_attn_mask))

    # Stack all batches, squash logits to probabilities, and hand back a CPU NumPy array.
    stacked = torch.cat(batch_logits, dim=0)
    return stacked.sigmoid().cpu().numpy()

In [None]:
def model_init(train_dataloader,epochs=4):
    """Create a fresh classifier together with its optimizer and learning-rate scheduler.

    Args:
        train_dataloader: Only its length is used, to size the schedule.
        epochs: Number of epochs the schedule should span.

    Returns:
        Tuple `(classifier, optimizer, scheduler)`; the classifier is already on `dev`.
    """
    # Fully trainable classifier (encoder not frozen), moved to the selected device.
    classifier = Classifier_fn(freeze_bert=False)
    classifier.to(dev)

    # AdamW over every model parameter.
    adamw = AdamW(classifier.parameters(), lr=5e-5, eps=1e-8)

    # Linear schedule with no warm-up, spanning one scheduler step per batch per epoch.
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * epochs,
    )
    return classifier, adamw, lr_schedule

In [None]:
# Per-epoch history lists; `train` appends to them in place.
val_loss = []
train_loss = []
val_acc= []
time_elap = []
torch.cuda.empty_cache()

# Single source of truth for the epoch count: the scheduler length (set in `model_init`)
# and the training loop must agree, so both read this one value.
EPOCHS = 10

# Fine-tuning the RoBERTa classifier on the training split, validating after every epoch
Robert_classifier, optimizer, scheduler = model_init(train_dataloader,epochs=EPOCHS)
model = train(optimizer,scheduler,Robert_classifier, train_dataloader,val_dataloader,EPOCHS,True,train_loss,val_acc,val_loss,time_elap)

In [None]:
# Preprocessing for Roberta on the Testset
print('Tokenizing data...')
test_inputs, test_masks = RoBERT_Preprocess(test_df.CONTEXT.values)
# Convert the label matrix in one step, matching how the train/validation labels are built.
# Wrapping it in `list(...)` first would hand torch a list of separate row arrays, which is slow.
test_labels = torch.tensor(test_df[list(test_df.columns)[7:]].values)
test_dataset = TensorDataset(test_inputs, test_masks,test_labels)
# Sequential order keeps the predictions aligned with the rows of `test_df`.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset,sampler = test_sampler,batch_size=batch_size)

In [None]:
# Score the test set: one row of per-label sigmoid probabilities per test example.
probs = predict_fn(model=Robert_classifier, test_dataloader=test_dataloader)

In [None]:
# Ground-truth label matrix for the test set: every column from position 7 onward.
y_true = test_df.iloc[:, 7:].values
y_true

In [None]:
# Predicted probabilities as a table, with columns named after the training label columns.
output_df = pd.DataFrame(data=probs, columns=train_df.columns[7:].tolist())
output_df

In [None]:
# Binarise the probabilities: a label is predicted (1) when its probability is at least 0.3, else 0.
y_pred = (np.asarray(probs) >= 0.3).astype(int)
y_pred

In [None]:
# Evaluation
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Loss and accuracy on the test set come from `val_test_eval`, the evaluation helper
# defined in this notebook (no function named `evaluate` exists here).
# NOTE(review): this accuracy uses acc_with_thresh's default 0.5 threshold, whereas `y_pred`
# (and therefore precision/recall/F1 below) was binarised at 0.3 — confirm this is intended.
Test_loss, Test_accuracy = val_test_eval(model, test_dataloader)
# NOTE(review): the loss is rendered with a percent format like the other metrics,
# although it is not a ratio — confirm this is the desired presentation.
Test_loss = "{:.0%}".format(Test_loss)
Test_accuracy = "{:.0%}".format(Test_accuracy)

# Micro-averaged metrics: counts are pooled over every label before the score is computed.
Prec = "{:.0%}".format(metrics.precision_score(y_true, y_pred, average='micro'))
Recall = "{:.0%}".format( metrics.recall_score(y_true, y_pred, average='micro'))
F1_Score = "{:.0%}".format(metrics.f1_score(y_true, y_pred, average='micro'))

print(Prec)
print(Recall)
print(F1_Score)
print(Test_loss)
print(Test_accuracy)

In [None]:
# Summary table of the formatted test-set metrics for this model.
fin1 = pd.DataFrame(
    [['Precision',Prec],['Recall',Recall],['F1-Score',F1_Score],['Test-Accuracy',Test_accuracy],['Test-Loss',Test_loss]],
    columns=[' ','RoBERTa'],
)
# Saved next to the input CSVs in the local `data/` folder, because the Google Drive mount
# is commented out in this notebook. Colab/Drive equivalent, kept for reference:
# fin1.to_csv('/content/drive/MyDrive/Univ-Project/Roberta_Eval.csv')
fin1.to_csv('data/Roberta_Eval.csv')
fin1

In [None]:
# Per-epoch training history. Epoch numbers are derived from the recorded history so the
# table stays correct for any number of epochs. The frame is deliberately not named
# `display`, which would shadow IPython's built-in `display()` helper for the rest of the session.
history_df = pd.DataFrame({
    'Epochs': range(1, len(train_loss) + 1),
    'Train_Loss': train_loss,
    'Validation_Accuracy': val_acc,
    'Validation_Loss': val_loss,
    'Time-Elapsed': time_elap,
})
# Saved next to the input CSVs in the local `data/` folder, because the Google Drive mount
# is commented out in this notebook. Colab/Drive equivalent, kept for reference:
# history_df.to_csv('/content/drive/MyDrive/Univ-Project/RoBert_Train_Evaluation.csv')
history_df.to_csv('data/RoBert_Train_Evaluation.csv')
history_df

In [None]:
from sklearn.metrics import classification_report

# Per-label precision/recall/F1/support, returned as a dict so it can be tabulated.
final_report = classification_report(y_true, y_pred,target_names=list(train_df.columns)[7:],output_dict=True)
# One row per label (plus the aggregate rows), best F1 first.
report_df = pd.DataFrame(final_report).transpose()
report_df = report_df.sort_values(by=['f1-score'], ascending=False)
# Saved next to the input CSVs in the local `data/` folder, because the Google Drive mount
# is commented out in this notebook. Colab/Drive equivalent, kept for reference:
# report_df.to_csv('/content/drive/MyDrive/Univ-Project/RoBert_Report_Label.csv')
report_df.to_csv('data/RoBert_Report_Label.csv')
report_df