In [1]:
import os
import matplotlib.pyplot as plt
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
from bs4 import BeautifulSoup
import tqdm
import ktrain
from ktrain import text
import tensorflow as tf
import timeit
import transformers
import contractions
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW, TFAutoModelForSequenceClassification
import gc
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [2]:
# Read the preprocessed train / validation / test splits
# (cleaned text plus the emotion label columns).
train_GE = pd.read_csv("train_clean.csv")
val_GE = pd.read_csv("val_clean.csv")
test_GE = pd.read_csv("test_clean.csv")

# Shape validation: one (rows, columns) tuple per line, in train / val / test order
print(train_GE.shape, val_GE.shape, test_GE.shape, sep="\n")

(43410, 29)
(5426, 29)
(5427, 29)


In [3]:
# Loading emotion labels for GoEmotions taxonomy (one label per line).
# splitlines() plus the blank-line filter prevents a trailing newline or a
# stray empty line in the file from adding an empty-string "emotion", which
# would later fail as a column lookup.
with open("emotions.txt", "r") as file:
    GE_taxonomy = [label.strip() for label in file.read().splitlines() if label.strip()]

GE_taxonomy

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [4]:
# Label list without 'neutral'. The copy keeps GE_taxonomy itself unchanged;
# list.remove raises ValueError if 'neutral' is not present.
GE_taxonomy_no_neu = GE_taxonomy.copy()
GE_taxonomy_no_neu.remove('neutral')

In [5]:
GE_taxonomy_no_neu

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

#### Creating a GoEmotions variant that excludes the neutral emotion

In [6]:
# Build the "no neutral" variant of each split. DataFrame.drop returns a new
# frame, so train_GE / val_GE / test_GE keep their 'neutral' column.
df_train_GE_no_neu = train_GE.drop(columns=['neutral'])
df_val_GE_no_neu = val_GE.drop(columns=['neutral'])
df_test_GE_no_neu = test_GE.drop(columns=['neutral'])

Then, we need to remove all the samples that have been left without a label.

In [7]:
# Removing samples with only 0 in their labels.
# The row total is computed over the named emotion columns in one vectorised
# call, rather than with a per-row Python lambda over the positional slice
# x[1:]. This does not rely on the text column being first and is much faster.
df_train_GE_no_neu = df_train_GE_no_neu.loc[df_train_GE_no_neu[GE_taxonomy_no_neu].sum(axis=1) > 0]
df_val_GE_no_neu = df_val_GE_no_neu.loc[df_val_GE_no_neu[GE_taxonomy_no_neu].sum(axis=1) > 0]
df_test_GE_no_neu = df_test_GE_no_neu.loc[df_test_GE_no_neu[GE_taxonomy_no_neu].sum(axis=1) > 0]

# Shape validation
print(df_train_GE_no_neu.shape)
print(df_val_GE_no_neu.shape)
print(df_test_GE_no_neu.shape)

(30587, 28)
(3834, 28)
(3821, 28)


In [8]:
df_train_GE_no_neu.head(3)

Unnamed: 0,Clean_text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
2,why the fuck is bayless isoing,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,to make her feel threatened,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,dirty southern wankers,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Inputs are the cleaned text column; targets are the emotion label columns
# as a float matrix. Each split gets a full-taxonomy version and a
# "no neutral" version built from the filtered frames.

# --- train ---
X_train = train_GE["Clean_text"]
y_train = train_GE[GE_taxonomy].values.astype(float)
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu[GE_taxonomy_no_neu].values.astype(float)

# --- validation ---
X_val = val_GE["Clean_text"]
y_val = val_GE[GE_taxonomy].values.astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu[GE_taxonomy_no_neu].values.astype(float)

# --- test ---
X_test = test_GE["Clean_text"]
y_test = test_GE[GE_taxonomy].values.astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu[GE_taxonomy_no_neu].values.astype(float)

# Shape check for the full-taxonomy splits
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(43410,) (43410, 28) (5426,) (5426, 28) (5427,) (5427, 28)


In [10]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    """Binarise predicted probabilities with a fixed threshold.

    Parameters
    ----------
    y_pred_proba : array-like
        Predicted probabilities, typically shaped (n_samples, n_labels).
        Anything accepted by ``np.asarray`` works (lists included).
    threshold : float, default 0.8
        An entry becomes 1 only when it is strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as the input, containing 1 where
        the probability exceeds ``threshold`` and 0 elsewhere.
    """
    proba = np.asarray(y_pred_proba)
    # A single vectorised comparison replaces the element-by-element Python
    # loops; casting back keeps the input dtype (e.g. float32 stays float32).
    return (proba > threshold).astype(proba.dtype)

In [11]:
def get_proba(predictions):
    """Return, for every prediction row, the second element of each tuple.

    ``predictions`` is an iterable of rows, each row an iterable of
    indexable pairs; element ``[1]`` of every pair is collected.
    The result is a list of lists in the same order.
    """
    return [[pair[1] for pair in row] for row in predictions]


In [12]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    """Per-emotion and macro-averaged precision / recall / F1.

    Parameters
    ----------
    y_true : 2-D array
        Ground-truth indicator matrix; column ``i`` corresponds to ``emotions[i]``.
    y_pred_labels : 2-D array
        Predicted indicator matrix with the same layout as ``y_true``.
    emotions : sequence of str
        Label names, one per column (any sequence; it is copied into a list).

    Returns
    -------
    pandas.DataFrame
        One row per emotion plus a final ``'MACRO-AVERAGE'`` row, with columns
        ``Precision``, ``Recall`` and ``F1``, each rounded to 2 decimals.
    """
    # Copy into a list so tuples / arrays of names can be concatenated below.
    emotions = list(emotions)

    precision = []
    recall = []
    f1 = []

    # Per emotion evaluation: each column is scored as its own binary problem
    for i in range(len(emotions)):
        p, r, f1_score, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred_labels[:, i], average="binary"
        )
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))

    # Macro evaluation over all labels at once
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(
        y_true, y_pred_labels, average="macro"
    )
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))

    # Converting results to a dataframe indexed by label name
    df_results = pd.DataFrame(
        {"Precision": precision, "Recall": recall, 'F1': f1},
        index=emotions + ['MACRO-AVERAGE'],
    )

    return df_results

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [6]:
# Model 1 configuration: roberta-base, maxlen=56, batch_size=6.
# The preprocessing log below reports a 99th-percentile sequence length of 28.
# NOTE(review): batch_size=6 is outside the {16, 32} listed in the markdown further down.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy, batch_size=6)

### Perform Data Preprocessing:

In [7]:
# Turn raw texts plus label matrices into datasets for each split.
# Only the training split goes through preprocess_train; validation and test
# use preprocess_test.
# NOTE(review): roberta_test is not referenced again in the visible part of
# this notebook (test predictions are made from raw X_test) — confirm it is needed.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [7]:
roberta_model = roberta_transformer.get_classifier()

In [8]:
# Wrap the classifier with the preprocessed train / validation data.
# batch_size repeats the value passed to text.Transformer above (6); keep the two in sync.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=6)

### RoBERTa Model Details:

In [9]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Time the learning-rate range test (wall clock, reported in minutes).
# NOTE(review): the recorded output below is "NameError: name 'timeit' is not
# defined" and the cell counter is In [1], so it was apparently run before the
# import cell; re-run in order after the imports.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, try to release GPU memory, then print utilisation again.

    Uses GPUtil's ``showUtilization`` (imported as ``gpu_usage``) for the
    reports, ``torch.cuda.empty_cache()`` for the release step, and then
    selects / closes / re-selects device 0 through ``numba.cuda``.
    Returns nothing; everything happens through side effects.
    """
    print("Initial GPU Usage")
    gpu_usage()

    # NOTE(review): the classifier in this notebook is built through ktrain and
    # its summary reads "tf_roberta_for_sequence_classification"; emptying the
    # PyTorch cache presumably does not release TensorFlow-held memory — confirm.
    torch.cuda.empty_cache()

    # Select device 0, close it, then select it again.
    # NOTE(review): cuda.close() looks like it tears down the CUDA context for
    # this process — verify that the model is still usable afterwards.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

We will choose the maximum among these for our fine-tuning and evaluation purposes.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [10]:
# Fine-tune with the one-cycle policy and time the run (wall clock, minutes).
# lr=3e-5 is the largest learning rate in the list given in the markdown above.
# NOTE(review): the recorded output below says "Emotion Dataset" while this
# print says "GoEmotion Dataset", so the stored output appears to be from an
# earlier version of the cell.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=3e-5, epochs=4)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Total time in minutes for Fine-Tuning RoBERTa on Emotion Dataset: 
 40.941396965


### Checking RoBERTa performance metrics:

In [11]:
roberta_learner_ins.validate()

In [12]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [13]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:4880 | loss:0.6 | true:[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.01 0.02 0.   0.   0.   0.   0.   0.01 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.95])

----------
id:3527 | loss:0.58 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.01 0.02 0.   0.04 0.11 0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.01 0.   0.02 0.   0.   0.   0.01 0.81])

----------
id:1433 | loss:0.57 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.01 0.03 0.02 0.   0.   0.   0.   0.01 0.03 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.9 ])

----------
id:450 | loss:0.53 | true:[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.01 0.   0.01 0.02 0.   0.   0.   0.  

### Saving RoBERTa Model:

In [14]:
# Pair the trained model with its preprocessor, then list the class names the
# predictor reports (the output below matches GE_taxonomy order).
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 3e-5, batch size = 6, epochs = 4, maxlen=56

In [15]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-1')

In [16]:
# Reload the predictor saved in the previous cell and check its classes.
# NOTE(review): the variable suffix (_2) does not match the saved path suffix
# (-1); this is Model 1's predictor despite the name.
roberta_predictor_2 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-1')
roberta_predictor_2.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [17]:
test_predictions_2 = roberta_predictor_2.predict(X_test.to_list())

In [19]:
# from probabilities to labels using a given threshold
# NOTE(review): proba_to_labels is already defined in an earlier cell of this
# notebook; consider keeping a single definition.
def proba_to_labels(y_pred_proba, threshold=0.8):
    """Binarise predicted probabilities with a fixed threshold.

    Parameters
    ----------
    y_pred_proba : array-like
        Predicted probabilities, typically shaped (n_samples, n_labels).
        Anything accepted by ``np.asarray`` works (lists included).
    threshold : float, default 0.8
        An entry becomes 1 only when it is strictly greater than this value.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as the input, containing 1 where
        the probability exceeds ``threshold`` and 0 elsewhere.
    """
    proba = np.asarray(y_pred_proba)
    # A single vectorised comparison replaces the element-by-element Python
    # loops; casting back keeps the input dtype (e.g. float32 stays float32).
    return (proba > threshold).astype(proba.dtype)

In [20]:
# NOTE(review): get_proba is already defined in an earlier cell of this
# notebook; consider keeping a single definition.
def get_proba(predictions):
    """Return, for every prediction row, the second element of each tuple.

    ``predictions`` is an iterable of rows, each row an iterable of
    indexable pairs; element ``[1]`` of every pair is collected.
    The result is a list of lists in the same order.
    """
    return [[pair[1] for pair in row] for row in predictions]


In [21]:
# Model evaluation function 
# NOTE(review): model_eval is already defined in an earlier cell of this
# notebook; consider keeping a single definition.
def model_eval(y_true, y_pred_labels, emotions):
    """Per-emotion and macro-averaged precision / recall / F1.

    Parameters
    ----------
    y_true : 2-D array
        Ground-truth indicator matrix; column ``i`` corresponds to ``emotions[i]``.
    y_pred_labels : 2-D array
        Predicted indicator matrix with the same layout as ``y_true``.
    emotions : sequence of str
        Label names, one per column (any sequence; it is copied into a list).

    Returns
    -------
    pandas.DataFrame
        One row per emotion plus a final ``'MACRO-AVERAGE'`` row, with columns
        ``Precision``, ``Recall`` and ``F1``, each rounded to 2 decimals.
    """
    # Copy into a list so tuples / arrays of names can be concatenated below.
    emotions = list(emotions)

    precision = []
    recall = []
    f1 = []

    # Per emotion evaluation: each column is scored as its own binary problem
    for i in range(len(emotions)):
        p, r, f1_score, _ = precision_recall_fscore_support(
            y_true[:, i], y_pred_labels[:, i], average="binary"
        )
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))

    # Macro evaluation over all labels at once
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(
        y_true, y_pred_labels, average="macro"
    )
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))

    # Converting results to a dataframe indexed by label name
    df_results = pd.DataFrame(
        {"Precision": precision, "Recall": recall, 'F1': f1},
        index=emotions + ['MACRO-AVERAGE'],
    )

    return df_results

In [20]:
# Collect the per-class probabilities from the predictor output into one array
# (one row per test sample), then display it.
test_proba_2 = np.array(get_proba(test_predictions_2))
test_proba_2

array([[1.7903112e-02, 8.7302392e-03, 7.7303494e-03, ..., 1.8732747e-01,
        2.5483682e-03, 1.1553922e-02],
       [7.5364554e-01, 5.3642043e-03, 1.6193381e-02, ..., 5.3660641e-03,
        1.1223877e-02, 8.1870584e-03],
       [3.1769121e-01, 1.1598411e-02, 3.0920238e-03, ..., 7.6985470e-04,
        6.0549811e-03, 9.3656573e-03],
       ...,
       [3.0389531e-03, 2.1732687e-03, 7.2334553e-03, ..., 3.0056757e-03,
        1.2960446e-03, 9.6281958e-01],
       [7.3077941e-01, 5.6156782e-03, 9.4018033e-04, ..., 1.9416597e-03,
        4.5370250e-03, 5.3617670e-03],
       [4.3817065e-03, 6.7902119e-03, 9.7010576e-04, ..., 2.5577059e-03,
        1.8391563e-03, 9.2432725e-01]], dtype=float32)

In [21]:
# Generate labels
y_pred_labels_2 = proba_to_labels(test_proba_2)
y_pred_labels_2[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [24]:
# Model evaluation
model_eval(y_test, y_pred_labels_2, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.78,0.56,0.65
amusement,0.84,0.62,0.71
anger,0.67,0.15,0.24
annoyance,1.0,0.02,0.05
approval,0.79,0.17,0.28
caring,0.63,0.19,0.3
confusion,0.68,0.21,0.32
curiosity,0.77,0.06,0.11
desire,0.71,0.18,0.29
disappointment,0.88,0.05,0.09


---

## Model 2

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [5]:
# Model 2 configuration: roberta-base, maxlen=48, batch_size=10.
# The preprocessing log below reports a 99th-percentile sequence length of 28.
# NOTE(review): this rebinds roberta_transformer from the Model 1 section.
roberta_transformer = text.Transformer('roberta-base', maxlen=48, classes=GE_taxonomy, batch_size=10)

### Perform Data Preprocessing:

In [6]:
# Re-run preprocessing with the Model 2 transformer settings.
# Only the training split goes through preprocess_train; validation and test
# use preprocess_test.
# NOTE(review): roberta_test is not referenced again in the visible part of
# this notebook — confirm it is needed.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [7]:
roberta_model = roberta_transformer.get_classifier()

In [8]:
# Wrap the classifier with the preprocessed train / validation data.
# batch_size repeats the value passed to text.Transformer above (10); keep the two in sync.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=10)

### RoBERTa Model Details:

In [9]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Time the learning-rate range test (wall clock, reported in minutes).
# NOTE(review): the recorded output below is "NameError: name 'timeit' is not
# defined" and the cell counter is In [1], so it was apparently run before the
# import cell; re-run in order after the imports.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

# NOTE(review): free_gpu_cache is also defined in an earlier cell of this notebook.
def free_gpu_cache():
    """Print GPU utilisation, try to release GPU memory, then print utilisation again.

    Uses GPUtil's ``showUtilization`` (imported as ``gpu_usage``) for the
    reports, ``torch.cuda.empty_cache()`` for the release step, and then
    selects / closes / re-selects device 0 through ``numba.cuda``.
    Returns nothing; everything happens through side effects.
    """
    print("Initial GPU Usage")
    gpu_usage()

    # NOTE(review): the classifier summary in this notebook reads
    # "tf_roberta_for_sequence_classification"; emptying the PyTorch cache
    # presumably does not release TensorFlow-held memory — confirm.
    torch.cuda.empty_cache()

    # Select device 0, close it, then select it again.
    # NOTE(review): cuda.close() looks like it tears down the CUDA context for
    # this process — verify that the model is still usable afterwards.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

Note that the fine-tuning run for this model departs from the values above: it uses a learning rate of 5e-5 and a batch size of 10.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [10]:
# Fine-tune with the one-cycle policy and time the run (wall clock, minutes).
# NOTE(review): lr=5e-5 is above the {1e-5, 2e-5, 3e-5} list given in the
# markdown above.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=10)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 69.88304936


### Checking RoBERTa performance metrics:

In [11]:
roberta_learner_ins.validate()

In [12]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [13]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:3527 | loss:0.84 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.99])

----------
id:2859 | loss:0.83 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:367 | loss:0.76 | true:[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.99])

----------
id:1399 | loss:0.76 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0. 

### Saving RoBERTa Model:

In [14]:
# Pair the Model 2 learner's model with its preprocessor, then list the class
# names the predictor reports (the output below matches GE_taxonomy order).
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 5e-5, batch size = 10, epochs = 10, maxlen=48

In [15]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-2')

In [21]:
# Reload the predictor saved in the previous cell and check its classes.
# NOTE(review): the variable suffix (_1) does not match the saved path suffix
# (-2); this is Model 2's predictor despite the name.
roberta_predictor_1 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-2')
roberta_predictor_1.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [17]:
test_predictions_1 = roberta_predictor_1.predict(X_test.to_list())

In [22]:
# Collect the per-class probabilities from the predictor output into one array
# (one row per test sample), then display it.
test_proba_1 = np.array(get_proba(test_predictions_1))
test_proba_1

array([[3.3536970e-03, 9.2545198e-03, 4.2936229e-03, ..., 4.7977617e-01,
        1.4663014e-03, 2.7643377e-03],
       [9.4114935e-01, 4.5977072e-03, 3.1906173e-03, ..., 2.6671041e-03,
        6.3332159e-04, 4.5039970e-03],
       [6.5519378e-02, 4.1260761e-03, 1.1885009e-03, ..., 7.0284860e-04,
        1.0977413e-02, 3.0193960e-03],
       ...,
       [5.4345466e-04, 4.9529137e-04, 9.6089346e-04, ..., 8.4625959e-04,
        1.5569483e-04, 9.9685174e-01],
       [7.6016629e-01, 1.5074391e-03, 5.4397836e-04, ..., 1.0225406e-03,
        4.0254142e-04, 2.4462803e-03],
       [4.0434580e-04, 7.2131009e-04, 1.4746425e-04, ..., 1.0098093e-01,
        5.4532109e-04, 3.2762828e-01]], dtype=float32)

In [24]:
# Generate labels
y_pred_labels_1 = proba_to_labels(test_proba_1)
y_pred_labels_1[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [25]:
# Model evaluation
model_eval(y_test, y_pred_labels_1, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.58,0.63
amusement,0.81,0.73,0.77
anger,0.59,0.33,0.43
annoyance,0.44,0.21,0.28
approval,0.42,0.26,0.32
caring,0.54,0.27,0.36
confusion,0.49,0.28,0.36
curiosity,0.53,0.3,0.39
desire,0.63,0.35,0.45
disappointment,0.49,0.17,0.25


In [22]:
# Generate labels
y_pred_labels_3 = proba_to_labels(test_proba_3)
y_pred_labels_3[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [23]:
# Model evaluation
model_eval(y_test, y_pred_labels_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.67,0.67
amusement,0.76,0.8,0.78
anger,0.51,0.35,0.41
annoyance,0.37,0.27,0.31
approval,0.43,0.37,0.4
caring,0.51,0.37,0.43
confusion,0.4,0.4,0.4
curiosity,0.48,0.4,0.44
desire,0.58,0.35,0.44
disappointment,0.39,0.24,0.3


In [23]:
# Predict on the raw test and validation texts with the reloaded predictor.
# The test prediction repeats a call already made in an earlier cell.
test_predictions_1 = roberta_predictor_1.predict(X_test.to_list())
val_predictions_1 = roberta_predictor_1.predict(X_val.to_list())

In [24]:
# Probability matrices (one row per sample) for the test and validation splits.
# NOTE(review): the "_3" suffix is misleading — both arrays are derived from
# roberta_predictor_1's predictions.
test_proba_3 = np.array(get_proba(test_predictions_1))

val_proba_3 = np.array(get_proba(val_predictions_1))


In [25]:
# Generate labels
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [26]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.58,0.63
amusement,0.81,0.73,0.77
anger,0.59,0.33,0.43
annoyance,0.44,0.21,0.28
approval,0.42,0.26,0.32
caring,0.54,0.27,0.36
confusion,0.49,0.28,0.36
curiosity,0.53,0.3,0.39
desire,0.63,0.35,0.45
disappointment,0.49,0.17,0.25


In [27]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.75,0.66,0.7
amusement,0.81,0.73,0.77
anger,0.65,0.41,0.5
annoyance,0.42,0.17,0.25
approval,0.44,0.26,0.33
caring,0.53,0.31,0.39
confusion,0.53,0.32,0.4
curiosity,0.55,0.32,0.41
desire,0.73,0.42,0.53
disappointment,0.58,0.23,0.33


## Model 3 - Final

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [8]:
# Model 3 configuration: roberta-base, maxlen=56, batch_size=16.
# The preprocessing log below reports a 99th-percentile sequence length of 28.
# NOTE(review): this rebinds roberta_transformer from the earlier model sections.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy, batch_size=16)

### Perform Data Preprocessing:

In [9]:
# Re-run preprocessing with the Model 3 transformer settings.
# Only the training split goes through preprocess_train; validation and test
# use preprocess_test.
# NOTE(review): roberta_test is not referenced again in the visible part of
# this notebook — confirm it is needed.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [10]:
roberta_model = roberta_transformer.get_classifier()

In [11]:
# Wrap the classifier with the preprocessed train / validation data.
# batch_size repeats the value passed to text.Transformer above (16); keep the two in sync.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [12]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Time the learning-rate range test (wall clock, reported in minutes).
# NOTE(review): the recorded output below is "NameError: name 'timeit' is not
# defined" and the cell counter is In [1], so it was apparently run before the
# import cell; re-run in order after the imports.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache(device_id=0):
    """Report GPU utilisation, release cached GPU memory, then report again.

    Args:
        device_id: Index of the CUDA device whose numba context is selected,
            closed and re-selected. Defaults to 0, the value that used to be
            hard-coded, so existing ``free_gpu_cache()`` calls are unchanged.

    Side effects:
        Prints two utilisation tables via GPUtil, empties PyTorch's CUDA
        cache and closes the numba CUDA context before selecting the device
        again.

    NOTE(review): the classifiers in this notebook are TensorFlow models
    (see the ``tf_roberta_for_sequence_classification`` summaries), whereas
    ``torch.cuda.empty_cache()`` only concerns PyTorch's allocator.
    Presumably the numba context reset is what actually frees memory here -
    confirm, and confirm TensorFlow is still usable afterwards without a
    kernel restart.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # Select -> close -> select again: the context is closed and the same
    # device is then made current once more.
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

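We use these values as the reference range for our fine-tuning and evaluation purposes. Note that the fine-tuning run below actually uses a peak learning rate of 5e-5 with the one-cycle policy, which is slightly above this range.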

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [13]:
# Fine-tune with ktrain's one-cycle schedule (max learning rate 5e-5, 15 epochs)
# and wall-clock the run; `timeit` comes from the notebook's import cell.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=15)
roberta_fine_tune_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n",
    (roberta_fine_tune_stop_time - roberta_fine_tune_start_time) / 60,
)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 92.81196866833334


### Checking RoBERTa performance metrics:

In [14]:
roberta_learner_ins.validate()

In [15]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [16]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:3527 | loss:0.92 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:4880 | loss:0.89 | true:[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:601 | loss:0.88 | true:[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.01 0.   0.   0.01 0.   0.   0.   0.02 0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.99])

----------
id:367 | loss:0.86 | true:[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])



### Saving RoBERTa Model:

In [17]:
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 5e-5, batch size = 16, epochs = 15, maxlen=56

In [18]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-3')

In [10]:
roberta_predictor_3 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-3')
roberta_predictor_3.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [11]:
test_predictions_3 = roberta_predictor_3.predict(X_test.to_list())
val_predictions_3 = roberta_predictor_3.predict(X_val.to_list())

In [21]:
# Convert the Model 3 test predictions into a NumPy array and display it.
# get_proba is defined elsewhere in the notebook; presumably it extracts the
# per-class probabilities from the predictor output - confirm.
test_proba_3 = np.array(get_proba(test_predictions_3))
test_proba_3

array([[6.6364049e-03, 1.6514740e-03, 1.1147998e-03, ..., 3.5009545e-01,
        9.1472518e-04, 4.7874972e-03],
       [9.6141309e-01, 8.4600262e-03, 7.5814466e-04, ..., 5.9351407e-04,
        7.8705931e-04, 1.4756176e-03],
       [1.2052274e-01, 2.3990225e-03, 8.7369600e-04, ..., 3.9959120e-04,
        6.5930746e-03, 5.7433435e-04],
       ...,
       [4.6241930e-04, 5.0163601e-04, 5.6524662e-04, ..., 2.9905813e-04,
        8.4501931e-05, 9.9906605e-01],
       [9.6358943e-01, 2.3675426e-03, 4.0471909e-04, ..., 2.9843237e-04,
        1.1355062e-03, 9.4567408e-04],
       [2.3129837e-04, 2.1638017e-04, 1.0896058e-04, ..., 5.6223869e-03,
        2.7416201e-04, 9.9856961e-01]], dtype=float32)

In [22]:
# Generate labels
y_pred_labels_3 = proba_to_labels(test_proba_3)
y_pred_labels_3[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [23]:
# Model evaluation
model_eval(y_test, y_pred_labels_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.67,0.67
amusement,0.76,0.8,0.78
anger,0.51,0.35,0.41
annoyance,0.37,0.27,0.31
approval,0.43,0.37,0.4
caring,0.51,0.37,0.43
confusion,0.4,0.4,0.4
curiosity,0.48,0.4,0.44
desire,0.58,0.35,0.44
disappointment,0.39,0.24,0.3


In [17]:
# Convert the Model 3 predictions for both evaluation splits into NumPy arrays
# (get_proba is defined elsewhere in the notebook).
test_proba_3 = np.array(get_proba(test_predictions_3))
val_proba_3 = np.array(get_proba(val_predictions_3))


In [18]:
# Generate labels
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [19]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.67,0.67
amusement,0.76,0.8,0.78
anger,0.51,0.35,0.41
annoyance,0.37,0.27,0.31
approval,0.43,0.37,0.4
caring,0.51,0.37,0.43
confusion,0.4,0.4,0.4
curiosity,0.48,0.4,0.44
desire,0.58,0.35,0.44
disappointment,0.39,0.24,0.3


In [20]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.68,0.69
amusement,0.79,0.79,0.79
anger,0.56,0.43,0.49
annoyance,0.36,0.28,0.32
approval,0.43,0.29,0.35
caring,0.51,0.34,0.41
confusion,0.47,0.36,0.41
curiosity,0.52,0.43,0.47
desire,0.54,0.42,0.47
disappointment,0.26,0.15,0.19


## Model 4

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [5]:
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy, batch_size=16)

### Perform Data Preprocessing:

In [6]:
# Run the ktrain Transformer preprocessing on each split: the training split
# goes through preprocess_train, validation and test through preprocess_test.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
# The Learner built below receives only the train and validation splits.
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using k-train as a high-level abstraction package, we need to wrap our model in a k-train Learner object for further computation.

In [7]:
roberta_model = roberta_transformer.get_classifier()

In [8]:
# Bundle the classifier with the preprocessed train/validation splits into a
# ktrain Learner; batch_size=16 is the same value passed to text.Transformer.
roberta_learner_ins = ktrain.get_learner(
    model=roberta_model,
    train_data=roberta_train,
    val_data=roberta_val,
    batch_size=16,
)

### RoBERTa Model Details:

In [9]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Wall-clock the learning-rate range test (at most 3 epochs, with the plot shown).
# Run the notebook's import cell first: this cell needs `timeit` in scope.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes on estimating optimal learning rate: \n",
    (rate_finder_stop_time - rate_finder_start_time) / 60,
)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache(device_id=0):
    """Report GPU utilisation, release cached GPU memory, then report again.

    Args:
        device_id: Index of the CUDA device whose numba context is selected,
            closed and re-selected. Defaults to 0, the value that used to be
            hard-coded, so existing ``free_gpu_cache()`` calls are unchanged.

    Side effects:
        Prints two utilisation tables via GPUtil, empties PyTorch's CUDA
        cache and closes the numba CUDA context before selecting the device
        again.

    NOTE(review): the classifiers in this notebook are TensorFlow models
    (see the ``tf_roberta_for_sequence_classification`` summaries), whereas
    ``torch.cuda.empty_cache()`` only concerns PyTorch's allocator.
    Presumably the numba context reset is what actually frees memory here -
    confirm, and confirm TensorFlow is still usable afterwards without a
    kernel restart.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # Select -> close -> select again: the context is closed and the same
    # device is then made current once more.
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

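We use these values as the reference range for our fine-tuning and evaluation purposes. Note that the fine-tuning run below actually uses a peak learning rate of 5e-5 with the one-cycle policy, which is slightly above this range.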

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [10]:
# Fine-tune with ktrain's one-cycle schedule (max learning rate 5e-5, 20 epochs)
# and wall-clock the run; `timeit` comes from the notebook's import cell.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=20)
roberta_fine_tune_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n",
    (roberta_fine_tune_stop_time - roberta_fine_tune_start_time) / 60,
)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 123.69208704166665


### Checking RoBERTa performance metrics:

In [11]:
roberta_learner_ins.validate()

In [12]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [13]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:4880 | loss:0.97 | true:[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:367 | loss:0.96 | true:[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:979 | loss:0.94 | true:[1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.01 0.
 0.   0.   0.   0.14 0.   0.   0.   0.9  0.03 0.04 0.   0.   0.   0.03])

----------
id:3201 | loss:0.94 | true:[0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.96 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.8 ])



### Saving RoBERTa Model:

In [14]:
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 5e-5, batch size = 16, epochs = 20, maxlen=56

In [15]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-4')

In [16]:
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-4')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [17]:
test_predictions_4 = roberta_predictor_4.predict(X_test.to_list())

In [22]:
# Convert the Model 4 test predictions into a NumPy array and display it
# (get_proba is defined elsewhere in the notebook).
test_proba_4 = np.array(get_proba(test_predictions_4))
test_proba_4

array([[8.4290747e-03, 7.8636119e-03, 5.1054446e-04, ..., 9.5501281e-02,
        7.2957424e-04, 7.3105283e-03],
       [8.2433641e-01, 1.9720383e-03, 4.2170432e-04, ..., 5.4175570e-04,
        1.7669980e-04, 1.7695238e-04],
       [1.3946671e-02, 2.2126360e-04, 1.9208476e-03, ..., 1.7582820e-04,
        3.9282176e-03, 4.8753218e-04],
       ...,
       [1.1307964e-04, 9.4718311e-04, 9.8380806e-05, ..., 9.4783976e-04,
        1.2550938e-04, 9.9937904e-01],
       [9.8604071e-01, 8.5227825e-03, 4.0846289e-04, ..., 4.7407774e-04,
        2.6980783e-03, 1.1469327e-04],
       [7.4175063e-05, 2.0628533e-04, 4.7285434e-05, ..., 8.6860557e-04,
        1.7152395e-04, 9.9964035e-01]], dtype=float32)

In [23]:
# Generate labels
y_pred_labels_4 = proba_to_labels(test_proba_4)
y_pred_labels_4[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.], dtype=float32)

In [24]:
# Model evaluation
model_eval(y_test, y_pred_labels_4, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.69,0.67
amusement,0.76,0.82,0.79
anger,0.49,0.44,0.46
annoyance,0.35,0.3,0.32
approval,0.37,0.29,0.32
caring,0.45,0.39,0.42
confusion,0.42,0.4,0.41
curiosity,0.47,0.46,0.46
desire,0.57,0.35,0.43
disappointment,0.36,0.25,0.29


---

## Model 5

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [28]:
roberta_transformer = text.Transformer('roberta-base', maxlen=48, classes=GE_taxonomy, batch_size=16)

### Perform Data Preprocessing:

In [29]:
# Run the ktrain Transformer preprocessing on each split: the training split
# goes through preprocess_train, validation and test through preprocess_test.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
# The Learner built below receives only the train and validation splits.
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using k-train as a high-level abstraction package, we need to wrap our model in a k-train Learner object for further computation.

In [30]:
roberta_model = roberta_transformer.get_classifier()

In [31]:
# Bundle the classifier with the preprocessed train/validation splits into a
# ktrain Learner; batch_size=16 is the same value passed to text.Transformer.
roberta_learner_ins = ktrain.get_learner(
    model=roberta_model,
    train_data=roberta_train,
    val_data=roberta_val,
    batch_size=16,
)

### RoBERTa Model Details:

In [32]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Wall-clock the learning-rate range test (at most 3 epochs, with the plot shown).
# Run the notebook's import cell first: this cell needs `timeit` in scope.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes on estimating optimal learning rate: \n",
    (rate_finder_stop_time - rate_finder_start_time) / 60,
)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache(device_id=0):
    """Report GPU utilisation, release cached GPU memory, then report again.

    Args:
        device_id: Index of the CUDA device whose numba context is selected,
            closed and re-selected. Defaults to 0, the value that used to be
            hard-coded, so existing ``free_gpu_cache()`` calls are unchanged.

    Side effects:
        Prints two utilisation tables via GPUtil, empties PyTorch's CUDA
        cache and closes the numba CUDA context before selecting the device
        again.

    NOTE(review): the classifiers in this notebook are TensorFlow models
    (see the ``tf_roberta_for_sequence_classification`` summaries), whereas
    ``torch.cuda.empty_cache()`` only concerns PyTorch's allocator.
    Presumably the numba context reset is what actually frees memory here -
    confirm, and confirm TensorFlow is still usable afterwards without a
    kernel restart.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # Select -> close -> select again: the context is closed and the same
    # device is then made current once more.
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

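We use these values as the reference range for our fine-tuning and evaluation purposes. Note that the fine-tuning run below actually uses a peak learning rate of 5e-5 with the one-cycle policy, which is slightly above this range.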

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [33]:
# Fine-tune with ktrain's one-cycle schedule (max learning rate 5e-5, 12 epochs)
# and wall-clock the run; `timeit` comes from the notebook's import cell.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=12)
roberta_fine_tune_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n",
    (roberta_fine_tune_stop_time - roberta_fine_tune_start_time) / 60,
)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 66.71130312833333


### Checking RoBERTa performance metrics:

In [34]:
roberta_learner_ins.validate()

In [35]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [36]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:1433 | loss:0.91 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.99])

----------
id:4880 | loss:0.83 | true:[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:450 | loss:0.78 | true:[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])

----------
id:2300 | loss:0.78 | true:[0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1.])



### Saving RoBERTa Model:

In [37]:
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 5e-5, batch size = 16, epochs = 12, maxlen=48

In [38]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-5.1')

In [39]:
# Reload the predictor saved in the previous cell and list its class names.
# NOTE(review): the variable is called roberta_predictor_1 although it holds the
# Model 5 checkpoint ('...-goemotion-5.1'); the Model 6 section later rebinds
# the same name to a different checkpoint.
roberta_predictor_1 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-5.1')
roberta_predictor_1.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [17]:
test_predictions_1 = roberta_predictor_1.predict(X_test.to_list())

In [22]:
# Convert the test predictions of roberta_predictor_1 into a NumPy array and
# display it (get_proba is defined elsewhere in the notebook).
test_proba_1 = np.array(get_proba(test_predictions_1))
test_proba_1

array([[3.3536970e-03, 9.2545198e-03, 4.2936229e-03, ..., 4.7977617e-01,
        1.4663014e-03, 2.7643377e-03],
       [9.4114935e-01, 4.5977072e-03, 3.1906173e-03, ..., 2.6671041e-03,
        6.3332159e-04, 4.5039970e-03],
       [6.5519378e-02, 4.1260761e-03, 1.1885009e-03, ..., 7.0284860e-04,
        1.0977413e-02, 3.0193960e-03],
       ...,
       [5.4345466e-04, 4.9529137e-04, 9.6089346e-04, ..., 8.4625959e-04,
        1.5569483e-04, 9.9685174e-01],
       [7.6016629e-01, 1.5074391e-03, 5.4397836e-04, ..., 1.0225406e-03,
        4.0254142e-04, 2.4462803e-03],
       [4.0434580e-04, 7.2131009e-04, 1.4746425e-04, ..., 1.0098093e-01,
        5.4532109e-04, 3.2762828e-01]], dtype=float32)

In [24]:
# Generate labels
y_pred_labels_1 = proba_to_labels(test_proba_1)
y_pred_labels_1[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [25]:
# Model evaluation
model_eval(y_test, y_pred_labels_1, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.58,0.63
amusement,0.81,0.73,0.77
anger,0.59,0.33,0.43
annoyance,0.44,0.21,0.28
approval,0.42,0.26,0.32
caring,0.54,0.27,0.36
confusion,0.49,0.28,0.36
curiosity,0.53,0.3,0.39
desire,0.63,0.35,0.45
disappointment,0.49,0.17,0.25


In [22]:
# Generate labels from test_proba_3 and show the first row.
# NOTE(review): this cell sits in the Model 5 section but works on the *_3
# variables. A cell further below rebinds test_proba_3 to the predictions of
# roberta_predictor_1, so the result depends on execution order - confirm
# whether this cell is still needed here.
y_pred_labels_3 = proba_to_labels(test_proba_3)
y_pred_labels_3[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [23]:
# Model evaluation
model_eval(y_test, y_pred_labels_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.67,0.67
amusement,0.76,0.8,0.78
anger,0.51,0.35,0.41
annoyance,0.37,0.27,0.31
approval,0.43,0.37,0.4
caring,0.51,0.37,0.43
confusion,0.4,0.4,0.4
curiosity,0.48,0.4,0.44
desire,0.58,0.35,0.44
disappointment,0.39,0.24,0.3


In [23]:
test_predictions_1 = roberta_predictor_1.predict(X_test.to_list())
val_predictions_1 = roberta_predictor_1.predict(X_val.to_list())

In [24]:
# Convert the predictions for both evaluation splits into NumPy arrays
# (get_proba is defined elsewhere in the notebook).
# NOTE(review): the *_3 names are reused here for the outputs of
# roberta_predictor_1 (the '-5.1' checkpoint), which overwrites the Model 3
# arrays of the same name.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [25]:
# Generate labels for the test and validation probabilities via proba_to_labels
# (defined elsewhere in the notebook).
# NOTE(review): despite the _3 suffix, test_proba_3 / val_proba_3 were rebound
# in the previous cell to the outputs of roberta_predictor_1 (Model 5).
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [26]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.58,0.63
amusement,0.81,0.73,0.77
anger,0.59,0.33,0.43
annoyance,0.44,0.21,0.28
approval,0.42,0.26,0.32
caring,0.54,0.27,0.36
confusion,0.49,0.28,0.36
curiosity,0.53,0.3,0.39
desire,0.63,0.35,0.45
disappointment,0.49,0.17,0.25


In [27]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.75,0.66,0.7
amusement,0.81,0.73,0.77
anger,0.65,0.41,0.5
annoyance,0.42,0.17,0.25
approval,0.44,0.26,0.33
caring,0.53,0.31,0.39
confusion,0.53,0.32,0.4
curiosity,0.55,0.32,0.41
desire,0.73,0.42,0.53
disappointment,0.58,0.23,0.33


---

## Model 6

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [40]:
roberta_transformer = text.Transformer('roberta-base', maxlen=48, classes=GE_taxonomy, batch_size=16)

### Perform Data Preprocessing:

In [41]:
# Run the ktrain Transformer preprocessing on each split: the training split
# goes through preprocess_train, validation and test through preprocess_test.
roberta_train = roberta_transformer.preprocess_train(X_train.to_list(), y_train)
roberta_val = roberta_transformer.preprocess_test(X_val.to_list(), y_val)
# The Learner built below receives only the train and validation splits.
roberta_test = roberta_transformer.preprocess_test(X_test.to_list(), y_test)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 13
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using k-train as a high-level abstraction package, we need to wrap our model in a k-train Learner object for further computation.

In [42]:
roberta_model = roberta_transformer.get_classifier()

In [43]:
# Bundle the classifier with the preprocessed train/validation splits into a
# ktrain Learner; batch_size=16 is the same value passed to text.Transformer.
roberta_learner_ins = ktrain.get_learner(
    model=roberta_model,
    train_data=roberta_train,
    val_data=roberta_val,
    batch_size=16,
)

### RoBERTa Model Details:

In [44]:
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  612124    
Total params: 124,667,164
Trainable params: 124,667,164
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

In [1]:
# Wall-clock the learning-rate range test (at most 3 epochs, with the plot shown).
# Run the notebook's import cell first: this cell needs `timeit` in scope.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes on estimating optimal learning rate: \n",
    (rate_finder_stop_time - rate_finder_start_time) / 60,
)

NameError: name 'timeit' is not defined

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache(device_id=0):
    """Report GPU utilisation, release cached GPU memory, then report again.

    Args:
        device_id: Index of the CUDA device whose numba context is selected,
            closed and re-selected. Defaults to 0, the value that used to be
            hard-coded, so existing ``free_gpu_cache()`` calls are unchanged.

    Side effects:
        Prints two utilisation tables via GPUtil, empties PyTorch's CUDA
        cache and closes the numba CUDA context before selecting the device
        again.

    NOTE(review): the classifiers in this notebook are TensorFlow models
    (see the ``tf_roberta_for_sequence_classification`` summaries), whereas
    ``torch.cuda.empty_cache()`` only concerns PyTorch's allocator.
    Presumably the numba context reset is what actually frees memory here -
    confirm, and confirm TensorFlow is still usable afterwards without a
    kernel restart.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # Select -> close -> select again: the context is closed and the same
    # device is then made current once more.
    cuda.select_device(device_id)
    cuda.close()
    cuda.select_device(device_id)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

We will choose the maximum of these learning rates (3e-5) for our fine-tuning and evaluation purposes.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [45]:
# Fine-tune with ktrain's one-cycle schedule (max learning rate 3e-5, 5 epochs)
# and wall-clock the run; `timeit` comes from the notebook's import cell.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=3e-5, epochs=5)
roberta_fine_tune_stop_time = timeit.default_timer()

# The timer values are in seconds, so dividing by 60 reports minutes.
print(
    "\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n",
    (roberta_fine_tune_stop_time - roberta_fine_tune_start_time) / 60,
)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 27.764415828333345


### Checking RoBERTa performance metrics:

In [46]:
roberta_learner_ins.validate()

In [47]:
roberta_learner_ins.validate(class_names=GE_taxonomy)

In [48]:
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:4880 | loss:0.59 | true:[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.01 0.01 0.   0.   0.   0.   0.   0.01 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.96])

----------
id:1433 | loss:0.57 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.01 0.04 0.02 0.02 0.   0.   0.   0.01 0.02 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.01 0.   0.02 0.   0.   0.01 0.   0.88])

----------
id:3482 | loss:0.57 | true:[0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.01 0.01 0.01 0.02 0.02 0.   0.   0.   0.   0.   0.01 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.94])

----------
id:3527 | loss:0.55 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0.] | pred:[0.   0.   0.   0.02 0.01 0.   0.   0.01 0.

### Saving RoBERTa Model:

In [49]:
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

### LR - 3e-5, batch size = 16, epochs = 5, maxlen=48

In [50]:
roberta_predictor.save('roberta-emotion-predictor-goemotion-6.1')

In [51]:
# Reload the predictor saved in the previous cell and list its class names.
# NOTE(review): roberta_predictor_1 is rebound here to the Model 6 checkpoint
# ('...-goemotion-6.1'); the Model 5 section used the same name for its own
# predictor, so the name no longer identifies the model it holds.
roberta_predictor_1 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-6.1')
roberta_predictor_1.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [52]:
test_predictions_1 = roberta_predictor_1.predict(X_test.to_list())
val_predictions_1 = roberta_predictor_1.predict(X_val.to_list())

In [53]:
# Convert the predictions for both evaluation splits into NumPy arrays
# (get_proba is defined elsewhere in the notebook).
# NOTE(review): the *_3 names are reused here for the outputs of
# roberta_predictor_1 (the '-6.1' checkpoint), which overwrites whatever the
# same names held before.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [54]:
# Generate labels for the test and validation probabilities via proba_to_labels
# (defined elsewhere in the notebook).
# NOTE(review): despite the _3 suffix, test_proba_3 / val_proba_3 were rebound
# in the previous cell to the outputs of roberta_predictor_1 (Model 6).
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [55]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.76,0.56,0.64
amusement,0.84,0.62,0.72
anger,0.74,0.16,0.27
annoyance,0.86,0.04,0.07
approval,0.76,0.18,0.29
caring,0.67,0.21,0.33
confusion,0.62,0.2,0.3
curiosity,0.68,0.09,0.16
desire,0.7,0.17,0.27
disappointment,0.86,0.04,0.08


In [56]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_3, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.8,0.58,0.67
amusement,0.85,0.63,0.72
anger,0.85,0.24,0.37
annoyance,0.82,0.03,0.06
approval,0.69,0.14,0.23
caring,0.73,0.24,0.36
confusion,0.63,0.22,0.33
curiosity,0.78,0.12,0.2
desire,0.84,0.27,0.41
disappointment,1.0,0.01,0.01


### GoEmotions without the neutral emotion

## Model 1

def _texts_and_labels(frame, label_columns, text_column="Clean_text"):
    """Split one dataset frame into its text column and its label matrix.

    Args:
        frame: DataFrame holding ``text_column`` and every name in
            ``label_columns``.
        label_columns: Column names to use as labels, in output column order.
        text_column: Name of the text column; defaults to "Clean_text".

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is the ``text_column`` Series,
        ``labels`` is a float NumPy array with one column per label.
    """
    texts = frame[text_column]
    labels = frame.loc[:, label_columns].values.astype(float)
    return texts, labels


# Full taxonomy (GE_taxonomy, which includes 'neutral').
X_train, y_train = _texts_and_labels(train_GE, GE_taxonomy)
X_val, y_val = _texts_and_labels(val_GE, GE_taxonomy)
X_test, y_test = _texts_and_labels(test_GE, GE_taxonomy)

# Taxonomy without 'neutral' (GE_taxonomy_no_neu). The df_*_GE_no_neu frames
# are built elsewhere in the notebook.
X_train_no_neu, y_train_no_neu = _texts_and_labels(df_train_GE_no_neu, GE_taxonomy_no_neu)
X_val_no_neu, y_val_no_neu = _texts_and_labels(df_val_GE_no_neu, GE_taxonomy_no_neu)
X_test_no_neu, y_test_no_neu = _texts_and_labels(df_test_GE_no_neu, GE_taxonomy_no_neu)

# Shape check for the full-taxonomy splits.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [13]:
# ktrain Transformer wrapper for 'roberta-base': maxlen=56, the 27 no-neutral labels, batch size 16.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [14]:
# Preprocess each no-neutral split with the shared transformer wrapper;
# only the training split goes through preprocess_train.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.to_list(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.to_list(), y_val_no_neu)
# NOTE(review): roberta_test is not referenced by any later cell visible in this section -- confirm it is needed.
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.to_list(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [15]:
# Build the classifier model from the transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [16]:
# Wrap the model together with its training and validation data in a ktrain Learner.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [17]:
# Print the layer / parameter-count summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

# Time the learning-rate finder (at most 3 epochs, with its plot shown) and
# report the elapsed time in minutes.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilization, try to release GPU memory, then print utilization again.

    Empties PyTorch's CUDA cache and closes / re-selects device 0 through
    numba. Takes no arguments and returns None.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # NOTE(review): the model in this notebook is a TensorFlow model (see the
    # model summary), while this closes the CUDA context through numba --
    # confirm that training still works in the same kernel after this call.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

The fine-tuning run below uses a peak learning rate of 5e-5, which is above this recommended range, together with a batch size of 16.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [18]:
# Fine-tune with the one-cycle policy (peak LR 5e-5, 15 epochs) and report
# the elapsed time in minutes.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=15)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 65.69253288166666


### Checking RoBERTa performance metrics:

In [19]:
# Evaluate the fine-tuned learner; no data is passed, so ktrain presumably
# falls back to the val_data given to get_learner -- confirm.
roberta_learner_ins.validate()

In [20]:
# This learner was built with classes=GE_taxonomy_no_neu, so the class names
# must be that 27-label list; GE_taxonomy carries an extra 'neutral' entry.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [21]:
# List the highest-loss examples (id, loss, true vs. predicted label vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:1042 | loss:0.85 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.99 0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:2959 | loss:0.8 | true:[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:1527 | loss:0.74 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.   1.   0.03 0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:2512 | loss:0.74 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.01 0.   0.   0.   0.   0.09 0.03 0.   0.   0.   0.   0.   0.
 0

### Saving RoBERTa Model:

In [22]:
# Bundle the fine-tuned model with its preprocessor into a predictor, then
# display the class labels it reports.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 5e-5, batch size = 16, epochs = 15, maxlen=56

In [23]:
# Save the predictor under this path; the next cell reloads it from the same path.
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-1')

In [24]:
# Reload the saved predictor and display its class labels.
# NOTE(review): the variable is named roberta_predictor_4 although it loads the
# '...-no-neu-1' artifact -- confirm the name is intentional.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-1')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [25]:
# Run the reloaded predictor over the raw no-neutral test and validation texts.
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.to_list())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.to_list())

In [26]:
# Pass each split's predictions through get_proba (helper defined earlier in the
# notebook) and keep the result as a NumPy array.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [27]:
# Generate predicted label matrices from the probability arrays of both splits.
# proba_to_labels is defined earlier in the notebook -- presumably it thresholds
# the probabilities; confirm against its definition.
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [28]:
# Model evaluation on the test split: per-label Precision / Recall / F1 table
# over the 27 no-neutral labels.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.71,0.72
amusement,0.84,0.85,0.84
anger,0.59,0.43,0.5
annoyance,0.48,0.37,0.42
approval,0.51,0.46,0.48
caring,0.52,0.47,0.49
confusion,0.51,0.48,0.49
curiosity,0.67,0.59,0.63
desire,0.67,0.45,0.54
disappointment,0.43,0.32,0.37


In [29]:
# Model evaluation on the validation split: per-label Precision / Recall / F1
# table over the 27 no-neutral labels.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.74,0.73
amusement,0.85,0.79,0.82
anger,0.6,0.45,0.51
annoyance,0.46,0.38,0.41
approval,0.54,0.45,0.49
caring,0.59,0.5,0.54
confusion,0.53,0.42,0.47
curiosity,0.72,0.61,0.66
desire,0.58,0.45,0.51
disappointment,0.48,0.31,0.37


## Model 2

# Text inputs and float label matrices for every split.

# Full taxonomy (28 labels, includes 'neutral').
X_train = train_GE["Clean_text"]
y_train = train_GE.loc[:, GE_taxonomy].values.astype(float)
X_val = val_GE["Clean_text"]
y_val = val_GE.loc[:, GE_taxonomy].values.astype(float)
X_test = test_GE["Clean_text"]
y_test = test_GE.loc[:, GE_taxonomy].values.astype(float)

# 'neutral' removed (27 labels) -- these are the arrays this section trains and evaluates on.
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

# Report the shapes of both variants so a row/label mismatch in either one is visible.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
print(X_train_no_neu.shape, y_train_no_neu.shape, X_val_no_neu.shape, y_val_no_neu.shape, X_test_no_neu.shape, y_test_no_neu.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [13]:
# ktrain Transformer wrapper for 'roberta-base': maxlen=56, the 27 no-neutral labels, batch size 16.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [14]:
# Preprocess each no-neutral split with the shared transformer wrapper;
# only the training split goes through preprocess_train.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.to_list(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.to_list(), y_val_no_neu)
# NOTE(review): roberta_test is not referenced by any later cell visible in this section -- confirm it is needed.
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.to_list(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [15]:
# Build the classifier model from the transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [16]:
# Wrap the model together with its training and validation data in a ktrain Learner.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [17]:
# Print the layer / parameter-count summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

# Time the learning-rate finder (at most 3 epochs, with its plot shown) and
# report the elapsed time in minutes.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilization, try to release GPU memory, then print utilization again.

    Empties PyTorch's CUDA cache and closes / re-selects device 0 through
    numba. Takes no arguments and returns None.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # NOTE(review): the model in this notebook is a TensorFlow model (see the
    # model summary), while this closes the CUDA context through numba --
    # confirm that training still works in the same kernel after this call.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

The fine-tuning run below uses a peak learning rate of 5e-5, which is above this recommended range, together with a batch size of 16.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [30]:
# Fine-tune with the one-cycle policy (peak LR 5e-5, 20 epochs) and report
# the elapsed time in minutes.
# NOTE(review): the setup cells of this section show execution counts
# In [13]-In [17] (the same as Model 1) while this cell is In [30]. If they
# were not re-run, this call continues training the learner from the previous
# section instead of a fresh roberta-base model -- confirm.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=20)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 87.25766121833334


### Checking RoBERTa performance metrics:

In [31]:
# Evaluate the fine-tuned learner; no data is passed, so ktrain presumably
# falls back to the val_data given to get_learner -- confirm.
roberta_learner_ins.validate()

In [32]:
# This learner was built with classes=GE_taxonomy_no_neu, so the class names
# must be that 27-label list; GE_taxonomy carries an extra 'neutral' entry.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [33]:
# List the highest-loss examples (id, loss, true vs. predicted label vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:1042 | loss:1.33 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.])

----------
id:2959 | loss:1.25 | true:[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.])

----------
id:1661 | loss:1.06 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.])

----------
id:481 | loss:1.05 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.  ])



### Saving RoBERTa Model:

In [34]:
# Bundle the fine-tuned model with its preprocessor into a predictor, then
# display the class labels it reports.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 5e-5, batch size = 16, epochs = 20, maxlen=56

In [35]:
# Save the predictor under this path; the next cell reloads it from the same path.
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-2')

In [36]:
# Reload the saved predictor and display its class labels.
# NOTE(review): the variable is named roberta_predictor_4 although it loads the
# '...-no-neu-2' artifact -- confirm the name is intentional.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-2')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [37]:
# Run the reloaded predictor over the raw no-neutral test and validation texts.
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.to_list())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.to_list())

In [38]:
# Pass each split's predictions through get_proba (helper defined earlier in the
# notebook) and keep the result as a NumPy array.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [39]:
# Generate predicted label matrices from the probability arrays of both splits.
# proba_to_labels is defined earlier in the notebook -- presumably it thresholds
# the probabilities; confirm against its definition.
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [40]:
# Model evaluation on the test split: per-label Precision / Recall / F1 table
# over the 27 no-neutral labels.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.74,0.72
amusement,0.82,0.89,0.85
anger,0.59,0.49,0.54
annoyance,0.47,0.38,0.42
approval,0.54,0.45,0.49
caring,0.52,0.45,0.48
confusion,0.51,0.46,0.48
curiosity,0.69,0.63,0.66
desire,0.71,0.47,0.57
disappointment,0.42,0.29,0.34


In [41]:
# Model evaluation on the validation split: per-label Precision / Recall / F1
# table over the 27 no-neutral labels.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.74,0.72
amusement,0.83,0.83,0.83
anger,0.64,0.49,0.56
annoyance,0.42,0.35,0.38
approval,0.56,0.45,0.5
caring,0.61,0.49,0.55
confusion,0.59,0.42,0.49
curiosity,0.65,0.64,0.64
desire,0.61,0.56,0.59
disappointment,0.45,0.31,0.36


## Model 3

# Text inputs and float label matrices for every split.

# Full taxonomy (28 labels, includes 'neutral').
X_train = train_GE["Clean_text"]
y_train = train_GE.loc[:, GE_taxonomy].values.astype(float)
X_val = val_GE["Clean_text"]
y_val = val_GE.loc[:, GE_taxonomy].values.astype(float)
X_test = test_GE["Clean_text"]
y_test = test_GE.loc[:, GE_taxonomy].values.astype(float)

# 'neutral' removed (27 labels) -- these are the arrays this section trains and evaluates on.
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

# Report the shapes of both variants so a row/label mismatch in either one is visible.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
print(X_train_no_neu.shape, y_train_no_neu.shape, X_val_no_neu.shape, y_val_no_neu.shape, X_test_no_neu.shape, y_test_no_neu.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [42]:
# ktrain Transformer wrapper for 'roberta-base': maxlen=56, the 27 no-neutral labels, batch size 16.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [43]:
# Preprocess each no-neutral split with the shared transformer wrapper;
# only the training split goes through preprocess_train.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.to_list(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.to_list(), y_val_no_neu)
# NOTE(review): roberta_test is not referenced by any later cell visible in this section -- confirm it is needed.
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.to_list(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [44]:
# Build the classifier model from the transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [45]:
# Wrap the model together with its training and validation data in a ktrain Learner.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [46]:
# Print the layer / parameter-count summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

# Time the learning-rate finder (at most 3 epochs, with its plot shown) and
# report the elapsed time in minutes.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilization, try to release GPU memory, then print utilization again.

    Empties PyTorch's CUDA cache and closes / re-selects device 0 through
    numba. Takes no arguments and returns None.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # NOTE(review): the model in this notebook is a TensorFlow model (see the
    # model summary), while this closes the CUDA context through numba --
    # confirm that training still works in the same kernel after this call.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

The fine-tuning run below uses a peak learning rate of 5e-5, which is above this recommended range, together with a batch size of 16.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [47]:
# Fine-tune with the one-cycle policy (peak LR 5e-5, 10 epochs) and report
# the elapsed time in minutes.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=5e-5, epochs=10)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 43.73559640500001


### Checking RoBERTa performance metrics:

In [48]:
# Evaluate the fine-tuned learner; no data is passed, so ktrain presumably
# falls back to the val_data given to get_learner -- confirm.
roberta_learner_ins.validate()

In [49]:
# This learner was built with classes=GE_taxonomy_no_neu, so the class names
# must be that 27-label list; GE_taxonomy carries an extra 'neutral' entry.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [50]:
# List the highest-loss examples (id, loss, true vs. predicted label vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:814 | loss:0.68 | true:[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1.
 0. 0. 0.] | pred:[0.35 0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.01 0.   0.   0.
 0.   0.86 0.   0.06 0.   0.   0.   0.   0.   0.02 0.   0.   0.  ])

----------
id:395 | loss:0.68 | true:[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1.] | pred:[0.   0.   0.   0.   0.99 0.   0.01 0.1  0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:2713 | loss:0.63 | true:[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.01 0.01 0.   0.01 0.99 0.   0.   0.   0.   0.   0.01
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01])

----------
id:1527 | loss:0.62 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.01 0.98 0.   0.   0.   0.   0.01 0.   0.   0.   0.
 

### Saving RoBERTa Model:

In [51]:
# Bundle the fine-tuned model with its preprocessor into a predictor, then
# display the class labels it reports.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 5e-5, batch size = 16, epochs = 10, maxlen=56

In [52]:
# Save the predictor under this path; the next cell reloads it from the same path.
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-3')

In [53]:
# Reload the saved predictor and display its class labels.
# NOTE(review): the variable is named roberta_predictor_4 although it loads the
# '...-no-neu-3' artifact -- confirm the name is intentional.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-3')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [54]:
# Run the reloaded predictor over the raw no-neutral test and validation texts.
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.to_list())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.to_list())

In [55]:
# Pass each split's predictions through get_proba (helper defined earlier in the
# notebook) and keep the result as a NumPy array.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [56]:
# Generate predicted label matrices from the probability arrays of both splits.
# proba_to_labels is defined earlier in the notebook -- presumably it thresholds
# the probabilities; confirm against its definition.
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [57]:
# Model evaluation on the test split: per-label Precision / Recall / F1 table
# over the 27 no-neutral labels.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.75,0.7,0.72
amusement,0.85,0.78,0.81
anger,0.65,0.4,0.5
annoyance,0.52,0.28,0.37
approval,0.55,0.43,0.49
caring,0.62,0.37,0.47
confusion,0.59,0.42,0.49
curiosity,0.74,0.53,0.62
desire,0.74,0.37,0.5
disappointment,0.45,0.21,0.29


In [58]:
# Model evaluation on the validation split: per-label Precision / Recall / F1
# table over the 27 no-neutral labels.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.76,0.7,0.73
amusement,0.87,0.74,0.8
anger,0.66,0.41,0.5
annoyance,0.47,0.27,0.35
approval,0.61,0.37,0.46
caring,0.68,0.46,0.55
confusion,0.59,0.4,0.48
curiosity,0.74,0.49,0.59
desire,0.67,0.43,0.52
disappointment,0.54,0.28,0.37


## Model 4 - Final

# Text inputs and float label matrices for every split.

# Full taxonomy (28 labels, includes 'neutral').
X_train = train_GE["Clean_text"]
y_train = train_GE.loc[:, GE_taxonomy].values.astype(float)
X_val = val_GE["Clean_text"]
y_val = val_GE.loc[:, GE_taxonomy].values.astype(float)
X_test = test_GE["Clean_text"]
y_test = test_GE.loc[:, GE_taxonomy].values.astype(float)

# 'neutral' removed (27 labels) -- these are the arrays this section trains and evaluates on.
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

# Report the shapes of both variants so a row/label mismatch in either one is visible.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
print(X_train_no_neu.shape, y_train_no_neu.shape, X_val_no_neu.shape, y_val_no_neu.shape, X_test_no_neu.shape, y_test_no_neu.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [13]:
# ktrain Transformer wrapper for 'roberta-base': maxlen=56, the 27 no-neutral labels, batch size 16.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [14]:
# Preprocess each no-neutral split with the shared transformer wrapper;
# only the training split goes through preprocess_train.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.to_list(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.to_list(), y_val_no_neu)
# NOTE(review): roberta_test is not referenced by any later cell visible in this section -- confirm it is needed.
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.to_list(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [15]:
# Build the classifier model from the transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [16]:
# Wrap the model together with its training and validation data in a ktrain Learner.
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [17]:
# Print the layer / parameter-count summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step used just to show how the learning rate can be found for any transformer model.
For Transformer models as per the research papers, the optimal learning rates have already been estimated and established.

# Time the learning-rate finder (at most 3 epochs, with its plot shown) and
# report the elapsed time in minutes.
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilization, try to release GPU memory, then print utilization again.

    Empties PyTorch's CUDA cache and closes / re-selects device 0 through
    numba. Takes no arguments and returns None.
    """
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    # NOTE(review): the model in this notebook is a TensorFlow model (see the
    # model summary), while this closes the CUDA context through numba --
    # confirm that training still works in the same kernel after this call.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

We will choose the maximum among these for our fine-tuning and evaluation purposes.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [59]:
# Fine-tune with the one-cycle policy (peak LR 3e-5, 10 epochs) and report
# the elapsed time in minutes.
# NOTE(review): the setup cells of this section show execution counts
# In [13]-In [17] while this cell is In [59]. If they were not re-run, this
# call continues training the learner from the previous section instead of a
# fresh roberta-base model -- confirm.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=3e-5, epochs=10)
roberta_fine_tune_stop_time = timeit.default_timer()

print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 43.592460323333334


### Checking RoBERTa performance metrics:

In [60]:
# Evaluate the fine-tuned learner; no data is passed, so ktrain presumably
# falls back to the val_data given to get_learner -- confirm.
roberta_learner_ins.validate()

In [61]:
# This learner was built with classes=GE_taxonomy_no_neu, so the class names
# must be that 27-label list; GE_taxonomy carries an extra 'neutral' entry.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [62]:
# List the highest-loss examples (id, loss, true vs. predicted label vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:2959 | loss:0.94 | true:[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0.])

----------
id:1510 | loss:0.88 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0.] | pred:[1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01])

----------
id:2713 | loss:0.85 | true:[0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.])

----------
id:9 | loss:0.84 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.   1.   0.01 0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.  ])



### Saving RoBERTa Model:

In [63]:
# Bundle the fine-tuned model with its preprocessor into a predictor, then
# display the class labels it reports.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 3e-5, batch size = 16, epochs = 10, maxlen=56

In [64]:
# Save the predictor under this path; the next cell reloads it from the same path.
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-4')

In [65]:
# Reload the saved predictor and display its class labels.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-4')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [66]:
# Run the reloaded predictor over the raw no-neutral test and validation texts.
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.to_list())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.to_list())

In [67]:
# Pass each split's predictions through get_proba (helper defined earlier in the
# notebook) and keep the result as a NumPy array.
test_proba_3 = np.array(get_proba(test_predictions_1))
val_proba_3 = np.array(get_proba(val_predictions_1))


In [68]:
# Generate predicted label matrices from the probability arrays of both splits.
# proba_to_labels is defined earlier in the notebook -- presumably it thresholds
# the probabilities; confirm against its definition.
y_pred_labels_test_3 = proba_to_labels(test_proba_3)
y_pred_labels_val_3 = proba_to_labels(val_proba_3)


In [69]:
# Model evaluation on the test split: per-label Precision / Recall / F1 table
# over the 27 no-neutral labels.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.73,0.73
amusement,0.83,0.83,0.83
anger,0.62,0.48,0.54
annoyance,0.44,0.35,0.39
approval,0.52,0.5,0.51
caring,0.54,0.47,0.5
confusion,0.52,0.51,0.51
curiosity,0.68,0.65,0.66
desire,0.74,0.47,0.57
disappointment,0.37,0.25,0.29


In [70]:
# Model evaluation on the validation split: per-label Precision / Recall / F1
# table over the 27 no-neutral labels.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.72,0.73
amusement,0.83,0.79,0.81
anger,0.63,0.49,0.55
annoyance,0.42,0.39,0.41
approval,0.51,0.43,0.46
caring,0.58,0.47,0.52
confusion,0.56,0.48,0.52
curiosity,0.66,0.61,0.63
desire,0.62,0.51,0.56
disappointment,0.43,0.31,0.36


## Model 5

# Text inputs and float label matrices for every split.

# Full taxonomy (28 labels, includes 'neutral').
X_train = train_GE["Clean_text"]
y_train = train_GE.loc[:, GE_taxonomy].values.astype(float)
X_val = val_GE["Clean_text"]
y_val = val_GE.loc[:, GE_taxonomy].values.astype(float)
X_test = test_GE["Clean_text"]
y_test = test_GE.loc[:, GE_taxonomy].values.astype(float)

# 'neutral' removed (27 labels) -- these are the arrays this section trains and evaluates on.
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

# Report the shapes of both variants so a row/label mismatch in either one is visible.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)
print(X_train_no_neu.shape, y_train_no_neu.shape, X_val_no_neu.shape, y_val_no_neu.shape, X_test_no_neu.shape, y_test_no_neu.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [71]:
# ktrain Transformer wrapper for 'roberta-base': sequences are capped at 56 tokens
# and the label set is the taxonomy without 'neutral'.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [72]:
# Preprocess each split with the transformer's own preprocessing.
# Only the training split goes through preprocess_train; validation and test
# use preprocess_test.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.tolist(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.tolist(), y_val_no_neu)
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.tolist(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [73]:
# Obtain the classification model from the Transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [74]:
# Wrap the model and the preprocessed train/validation data in a ktrain Learner.
# batch_size matches the value given to text.Transformer (16).
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [75]:
# Print the layer/parameter summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step, included only to show how a learning rate can be estimated for any transformer model.
For Transformer models, the research papers have already established recommended learning rates, so we rely on those values instead.

# Time the learning-rate finder (optional step; capped at 3 epochs, plot shown).
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

# timeit.default_timer() is in seconds, so divide by 60 to report minutes.
print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, try to release GPU memory, then print utilisation again.

    Steps, in order: show utilisation via GPUtil, call
    ``torch.cuda.empty_cache()``, then select / close / re-select CUDA
    device 0 through numba, and show utilisation once more.
    Takes no arguments and returns nothing.
    """
    # NOTE(review): the model trained in this notebook is a TensorFlow model
    # (tf_roberta_for_sequence_classification); torch.cuda.empty_cache()
    # presumably does not release memory held by TensorFlow -- confirm.
    # NOTE(review): cuda.close() presumably tears down the CUDA context for this
    # process; verify it does not invalidate a live model/learner before
    # calling this mid-session.
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

We will choose the maximum among these for our fine-tuning and evaluation purposes.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [76]:
# Fine-tune with the one-cycle policy (max LR 3e-5, 15 epochs) and time the run.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=3e-5, epochs=15)
roberta_fine_tune_stop_time = timeit.default_timer()

# timeit.default_timer() is in seconds, so divide by 60 to report minutes.
print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 65.67875105833333


### Checking RoBERTa performance metrics:

In [77]:
# Evaluate on the validation data attached to the learner (no class names supplied).
roberta_learner_ins.validate()

In [78]:
# Evaluate on the learner's validation data, labelling classes by name.
# The transformer was built with classes=GE_taxonomy_no_neu (27 labels), so the
# names passed here must be that same list; the 28-entry GE_taxonomy still
# contains 'neutral' and would not line up with the model's outputs.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [79]:
# Show the validation examples with the highest loss (id, loss, true vs. predicted vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:814 | loss:0.92 | true:[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1.
 0. 0. 0.] | pred:[0.97 0.   0.   0.   0.05 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.29 0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:1661 | loss:0.75 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[1.   0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:475 | loss:0.72 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.01 1.   0.   0.   0.   0.01 0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:1527 | loss:0.72 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0.] | pred:[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

### Saving RoBERTa Model:

In [80]:
# Bundle the fine-tuned model with its preprocessor into a ktrain predictor,
# then display the class names it reports as a sanity check.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 3e-5, batch size = 16, epochs = 15, maxlen=56

In [81]:
# Save the Model 5 predictor to disk (relative path; suffix "-5" = Model 5).
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-5')

In [82]:
# Reload the predictor just saved and display its class names.
# NOTE(review): the variable is still called roberta_predictor_4 although it now
# holds the "-5" predictor; later cells read it under this name.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-5')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [83]:
# Score the held-out test texts first, then the validation texts, with the
# reloaded predictor (inputs are plain Python lists of strings).
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.tolist())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.tolist())

In [84]:
# Convert the raw predictor output into NumPy arrays via the get_proba helper.
test_proba_3 = np.array(get_proba(test_predictions_1))

val_proba_3 = np.array(get_proba(val_predictions_1))


In [85]:
# Turn the test and validation outputs into predicted labels (test first, then val)
y_pred_labels_test_3, y_pred_labels_val_3 = [
    proba_to_labels(proba) for proba in (test_proba_3, val_proba_3)
]


In [86]:
# Model 5 evaluation on the TEST split: per-emotion Precision / Recall / F1
# for the taxonomy without 'neutral'.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.74,0.71,0.72
amusement,0.83,0.81,0.82
anger,0.6,0.43,0.5
annoyance,0.45,0.34,0.39
approval,0.57,0.44,0.5
caring,0.62,0.48,0.54
confusion,0.51,0.48,0.5
curiosity,0.71,0.57,0.63
desire,0.72,0.41,0.52
disappointment,0.46,0.28,0.35


In [87]:
# Model 5 evaluation on the VALIDATION split: per-emotion Precision / Recall / F1
# for the taxonomy without 'neutral'.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.76,0.71,0.73
amusement,0.82,0.79,0.81
anger,0.66,0.47,0.55
annoyance,0.47,0.37,0.41
approval,0.59,0.41,0.49
caring,0.62,0.49,0.55
confusion,0.54,0.42,0.47
curiosity,0.69,0.6,0.64
desire,0.67,0.47,0.55
disappointment,0.47,0.28,0.35


## Model 6

# Build feature/target pairs for every split.
#   X_*        : the "Clean_text" column
#   y_*        : emotion indicator columns as a float array (full taxonomy)
#   *_no_neu   : same, taken from the frames without 'neutral' and using the
#                taxonomy without 'neutral'

# --- train ---
X_train = train_GE["Clean_text"]
y_train = train_GE[GE_taxonomy].to_numpy().astype(float)
X_train_no_neu = df_train_GE_no_neu["Clean_text"]
y_train_no_neu = df_train_GE_no_neu[GE_taxonomy_no_neu].to_numpy().astype(float)

# --- validation ---
X_val = val_GE["Clean_text"]
y_val = val_GE[GE_taxonomy].to_numpy().astype(float)
X_val_no_neu = df_val_GE_no_neu["Clean_text"]
y_val_no_neu = df_val_GE_no_neu[GE_taxonomy_no_neu].to_numpy().astype(float)

# --- test ---
X_test = test_GE["Clean_text"]
y_test = test_GE[GE_taxonomy].to_numpy().astype(float)
X_test_no_neu = df_test_GE_no_neu["Clean_text"]
y_test_no_neu = df_test_GE_no_neu[GE_taxonomy_no_neu].to_numpy().astype(float)

# Shape check (full-taxonomy splits only; the *_no_neu shapes are not printed)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

### Instantiating a RoBERTa Instance:

Create a RoBERTa instance with the model name, max token length, the labels to be used for each category and the batch size.

In [88]:
# ktrain Transformer wrapper for 'roberta-base': sequences are capped at 56 tokens
# and the label set is the taxonomy without 'neutral'.
roberta_transformer = text.Transformer('roberta-base', maxlen=56, classes=GE_taxonomy_no_neu, batch_size=16)

### Perform Data Preprocessing:

In [89]:
# Preprocess each split with the transformer's own preprocessing.
# Only the training split goes through preprocess_train; validation and test
# use preprocess_test.
roberta_train = roberta_transformer.preprocess_train(X_train_no_neu.tolist(), y_train_no_neu)
roberta_val = roberta_transformer.preprocess_test(X_val_no_neu.tolist(), y_val_no_neu)
roberta_test = roberta_transformer.preprocess_test(X_test_no_neu.tolist(), y_test_no_neu)

preprocessing train...
language: en
train sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


Is Multi-Label? True
preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 26
	99percentile : 28


preprocessing test...
language: en
test sequence lengths:
	mean : 14
	95percentile : 25
	99percentile : 28


### Compile RoBERTa in a K-Train Learner Object:

Since we are using ktrain as a high-level abstraction package, we need to wrap our model in a ktrain Learner object for further computation.

In [90]:
# Obtain the classification model from the Transformer wrapper configured above.
roberta_model = roberta_transformer.get_classifier()

In [91]:
# Wrap the model and the preprocessed train/validation data in a ktrain Learner.
# batch_size matches the value given to text.Transformer (16).
roberta_learner_ins = ktrain.get_learner(model=roberta_model,
                            train_data=roberta_train,
                            val_data=roberta_val,
                            batch_size=16)

### RoBERTa Model Details:

In [92]:
# Print the layer/parameter summary of the wrapped model.
roberta_learner_ins.model.summary()

Model: "tf_roberta_for_sequence_classification_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124055040 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  611355    
Total params: 124,666,395
Trainable params: 124,666,395
Non-trainable params: 0
_________________________________________________________________


### Find Optimal Learning Rate for RoBERTa:

This is an optional step, included only to show how a learning rate can be estimated for any transformer model.
For Transformer models, the research papers have already established recommended learning rates, so we rely on those values instead.

# Time the learning-rate finder (optional step; capped at 3 epochs, plot shown).
rate_finder_start_time = timeit.default_timer()
roberta_learner_ins.lr_find(show_plot=True, max_epochs=3)
rate_finder_stop_time = timeit.default_timer()

# timeit.default_timer() is in seconds, so divide by 60 to report minutes.
print("\nTotal time in minutes on estimating optimal learning rate: \n", (rate_finder_stop_time - rate_finder_start_time)/60)

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, try to release GPU memory, then print utilisation again.

    Steps, in order: show utilisation via GPUtil, call
    ``torch.cuda.empty_cache()``, then select / close / re-select CUDA
    device 0 through numba, and show utilisation once more.
    Takes no arguments and returns nothing.
    """
    # NOTE(review): the model trained in this notebook is a TensorFlow model
    # (tf_roberta_for_sequence_classification); torch.cuda.empty_cache()
    # presumably does not release memory held by TensorFlow -- confirm.
    # NOTE(review): cuda.close() presumably tears down the CUDA context for this
    # process; verify it does not invalidate a live model/learner before
    # calling this mid-session.
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()
free_gpu_cache()


### RoBERTa Optimal Learning Rates:

As per the evaluations made in the research paper "**RoBERTa: A Robustly Optimized BERT Pretraining Approach**", below are the best choices in terms of fine-tuning the model:

* Batch Sizes => {16, 32}
* Learning Rates => {1e−5, 2e−5, 3e−5}

We will choose the maximum among these for our fine-tuning and evaluation purposes.

### Fine Tuning RoBERTa on Emotion Dataset:

We take our emotion dataset along with the RoBERTa model, define the learning-rate & epochs to be used and start fine-tuning.

In [93]:
# Fine-tune with the one-cycle policy (max LR 3e-5, 20 epochs) and time the run.
roberta_fine_tune_start_time = timeit.default_timer()
roberta_learner_ins.fit_onecycle(lr=3e-5, epochs=20)
roberta_fine_tune_stop_time = timeit.default_timer()

# timeit.default_timer() is in seconds, so divide by 60 to report minutes.
print("\nTotal time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: \n", (roberta_fine_tune_stop_time - roberta_fine_tune_start_time)/60)



begin training using onecycle policy with max lr of 3e-05...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Total time in minutes for Fine-Tuning RoBERTa on GoEmotion Dataset: 
 87.41851243166668


### Checking RoBERTa performance metrics:

In [94]:
# Evaluate on the validation data attached to the learner (no class names supplied).
roberta_learner_ins.validate()

In [95]:
# Evaluate on the learner's validation data, labelling classes by name.
# The transformer was built with classes=GE_taxonomy_no_neu (27 labels), so the
# names passed here must be that same list; the 28-entry GE_taxonomy still
# contains 'neutral' and would not line up with the model's outputs.
roberta_learner_ins.validate(class_names=GE_taxonomy_no_neu)

In [96]:
# Show the validation examples with the highest loss (id, loss, true vs. predicted vectors).
roberta_learner_ins.view_top_losses(preproc=roberta_transformer)

----------
id:2959 | loss:0.8 | true:[1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.99 0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:2022 | loss:0.8 | true:[0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0.] | pred:[0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.05 0.   0.   0.   0.   0.   0.  ])

----------
id:1661 | loss:0.79 | true:[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0.] | pred:[1.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.  ])

----------
id:520 | loss:0.78 | true:[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0.] | pred:[0.   0.   0.   0.01 0.   0.   1.   0.01 0.   0.01 0.01 0.   0.   0.
 0.0

### Saving RoBERTa Model:

In [97]:
# Bundle the fine-tuned model with its preprocessor into a ktrain predictor,
# then display the class names it reports as a sanity check.
roberta_predictor = ktrain.get_predictor(roberta_learner_ins.model, preproc=roberta_transformer)
roberta_predictor.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

### LR - 3e-5, batch size = 16, epochs = 20, maxlen=56

In [99]:
# Save the Model 6 predictor to disk (relative path; suffix "-6" = Model 6).
roberta_predictor.save('roberta-emotion-predictor-goemotion-no-neu-6')

In [100]:
# Reload the predictor just saved and display its class names.
# NOTE(review): the variable is still called roberta_predictor_4 although it now
# holds the "-6" predictor; later cells read it under this name.
roberta_predictor_4 = ktrain.load_predictor('roberta-emotion-predictor-goemotion-no-neu-6')
roberta_predictor_4.get_classes()

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

In [101]:
# Score the held-out test texts first, then the validation texts, with the
# reloaded predictor (inputs are plain Python lists of strings).
test_predictions_1 = roberta_predictor_4.predict(X_test_no_neu.tolist())
val_predictions_1 = roberta_predictor_4.predict(X_val_no_neu.tolist())

In [102]:
# Convert the raw predictor output into NumPy arrays via the get_proba helper.
test_proba_3 = np.array(get_proba(test_predictions_1))

val_proba_3 = np.array(get_proba(val_predictions_1))


In [103]:
# Turn the test and validation outputs into predicted labels (test first, then val)
y_pred_labels_test_3, y_pred_labels_val_3 = [
    proba_to_labels(proba) for proba in (test_proba_3, val_proba_3)
]


In [104]:
# Model 6 evaluation on the TEST split: per-emotion Precision / Recall / F1
# for the taxonomy without 'neutral'.
model_eval(y_test_no_neu, y_pred_labels_test_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.71,0.73,0.72
amusement,0.82,0.87,0.85
anger,0.54,0.44,0.48
annoyance,0.47,0.35,0.4
approval,0.5,0.47,0.49
caring,0.56,0.47,0.51
confusion,0.51,0.52,0.51
curiosity,0.65,0.6,0.62
desire,0.7,0.51,0.59
disappointment,0.45,0.27,0.34


In [105]:
# Model 6 evaluation on the VALIDATION split: per-emotion Precision / Recall / F1
# for the taxonomy without 'neutral'.
model_eval(y_val_no_neu, y_pred_labels_val_3, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.73,0.73
amusement,0.81,0.8,0.81
anger,0.56,0.47,0.51
annoyance,0.45,0.36,0.4
approval,0.52,0.43,0.47
caring,0.59,0.5,0.54
confusion,0.55,0.48,0.51
curiosity,0.69,0.65,0.67
desire,0.6,0.48,0.53
disappointment,0.46,0.28,0.35
