Final Thesis Coding

!pip install emoji

!pip install contractions

!pip install transformers

!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

!pip install wordcloud

In [1]:
# Data manipulation libraries
import sys, os
import pandas as pd
import numpy as np
import json

import emoji
import contractions
import re

# Scikit-learn packages
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Packages to define a BERT model
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoModel, BertConfig, BertTokenizerFast, TFBertModel

# from tqdm.auto import tqdm
import tqdm
import torch
from torch.autograd import Variable
import string
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adadelta
import torch.nn.functional as F

# packages for visualization
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

from wordcloud import WordCloud
from IPython.display import Image
from IPython import display


%matplotlib inline
%config InlineBackend.figure_format='retina'

import itertools
import math

RANDOM_SEED = 42

In [2]:
import random
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONSOUJANYA'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Some cudnn methods can be random even after fixing the seed 
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

seed_everything()

## 1.2 - Loading datasets and lists of emotions

First, let's load our clean data.

In [3]:
# Importing train, validation and test datasets with preprocessed texts and labels
train_GE = pd.read_csv("train_clean.csv")
val_GE = pd.read_csv("val_clean.csv")
test_GE = pd.read_csv("test_clean.csv")

# Shape validation
print(train_GE.shape)
print(val_GE.shape)
print(test_GE.shape)


(43410, 29)
(5426, 29)
(5427, 29)


In [4]:
train_GE_Sampled = train_GE
val_GE_Sampled = val_GE
test_GE_Sampled = test_GE

# Shape validation
print(train_GE_Sampled.shape)
print(val_GE_Sampled.shape)
print(test_GE_Sampled.shape)

(43410, 29)
(5426, 29)
(5427, 29)


In [5]:
target_cols = [
 'admiration', 
    'amusement', 
    'anger', 
    'annoyance', 
    'approval', 
    'caring', 
    'confusion', 
    'curiosity', 
    'desire', 
    'disappointment', 
    'disapproval', 
    'disgust', 
    'embarrassment', 
    'excitement', 
    'fear', 
    'gratitude', 
    'grief', 
    'joy', 
    'love', 
    'nervousness', 
    'optimism', 
    'pride', 
    'realization', 
    'relief', 
    'remorse', 
    'sadness', 
    'surprise',
    'neutral'
]

In [6]:
train_GE_Sampled = train_GE_Sampled.rename(columns={'Clean_text': 'text'})
val_GE_Sampled = val_GE_Sampled.rename(columns={'Clean_text': 'text'})
test_GE_Sampled = test_GE_Sampled.rename(columns={'Clean_text': 'text'})


In [7]:
# Loading emotion labels for GoEmotions taxonomy
with open("emotions.txt", "r") as file:
    GE_taxonomy = file.read().split("\n")

print("Emotions on GoEmotions taxonomy are : \n{}".format(GE_taxonomy))

print()

# Loading emotion labels for Ekman taxonomy
with open("ekman_labels.txt", "r") as file:
    Ekman_taxonomy = file.read().split("\n")

print("Emotions on Ekman taxonomy are : \n{}".format(Ekman_taxonomy))

Emotions on GoEmotions taxonomy are : 
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

Emotions on Ekman taxonomy are : 
['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']


In [8]:
GE_taxonomy

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [9]:
GE_taxonomy_no_neu = GE_taxonomy.copy()
GE_taxonomy_no_neu.remove('neutral')

In [10]:
GE_taxonomy_no_neu

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise']

sampled_train_df = train_GE_Sampled.sample(frac=1)

len(train_GE_Sampled), len(sampled_train_df)

sampled_train_df.head()

print(sampled_train_df.shape)
print(test_GE.shape)

In [11]:
sampled_train_df = train_GE_Sampled[['text', *target_cols]]

In [12]:
sampled_train_df

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,my favourite food is anything i did not have t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,now if he does off himself everyone will think...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,why the fuck is bayless isoing,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,to make her feel threatened,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dirty southern wankers,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43405,added you mate well i have just got the bow an...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
43406,always thought that was funny but is it a refe...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
43407,what are you talking about ? anything bad that...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43408,more like a baptism with sexy results !,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Creating GoEmotion excluding neutral emotion

In [13]:
df_train_GE_no_neu = sampled_train_df.copy()
df_val_GE_no_neu = val_GE_Sampled.copy()
df_test_GE_no_neu = test_GE_Sampled.copy()

df_train_GE_no_neu = df_train_GE_no_neu.drop(columns=['neutral'])
df_val_GE_no_neu = df_val_GE_no_neu.drop(columns=['neutral'])
df_test_GE_no_neu = df_test_GE_no_neu.drop(columns=['neutral'])

Then, we need remove all the samples that have been left without a label.

In [14]:
# Removing samples with only 0 in their labels
df_train_GE_no_neu = df_train_GE_no_neu.loc[ df_train_GE_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
df_val_GE_no_neu = df_val_GE_no_neu.loc[ df_val_GE_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
df_test_GE_no_neu = df_test_GE_no_neu.loc[ df_test_GE_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]

# Shape validation
print(df_train_GE_no_neu.shape)
print(df_val_GE_no_neu.shape)
print(df_test_GE_no_neu.shape)

(30587, 28)
(3834, 28)
(3821, 28)


In [15]:
# Creating train, validation and test variables
X_train = train_GE_Sampled['text']
y_train = train_GE_Sampled.loc[:, GE_taxonomy].values.astype(float)
X_train_no_neu = df_train_GE_no_neu['text']
y_train_no_neu = df_train_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

X_val = val_GE_Sampled['text']
y_val = val_GE_Sampled.loc[:, GE_taxonomy].values.astype(float)
X_val_no_neu = df_val_GE_no_neu['text']
y_val_no_neu = df_val_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

X_test = test_GE_Sampled['text']
y_test = test_GE_Sampled.loc[:, GE_taxonomy].values.astype(float)
X_test_no_neu = df_test_GE_no_neu['text']
y_test_no_neu = df_test_GE_no_neu.loc[:, GE_taxonomy_no_neu].values.astype(float)

In [16]:
df_train_GE_no_neu.head(3)

Unnamed: 0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
2,why the fuck is bayless isoing,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,to make her feel threatened,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dirty southern wankers,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Creating Ekman lables

In [17]:
df_train = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)
df_val = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/dev.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)
df_test = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/test.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)


In [18]:
df_train

Unnamed: 0,Text,Class
0,My favourite food is anything I didn't have to...,27
1,"Now if he does off himself, everyone will thin...",27
2,WHY THE FUCK IS BAYLESS ISOING,2
3,To make her feel threatened,14
4,Dirty Southern Wankers,3
...,...,...
43405,Added you mate well I’ve just got the bow and ...,18
43406,Always thought that was funny but is it a refe...,6
43407,What are you talking about? Anything bad that ...,3
43408,"More like a baptism, with sexy results!",13


In [19]:
df_train['List of classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(lambda x: len(x))
df_val['List of classes'] = df_val['Class'].apply(lambda x: x.split(','))
df_val['Len of classes'] = df_val['List of classes'].apply(lambda x: len(x))
df_test['List of classes'] = df_test['Class'].apply(lambda x: x.split(','))
df_test['Len of classes'] = df_test['List of classes'].apply(lambda x: len(x))

In [20]:
df_train

Unnamed: 0,Text,Class,List of classes,Len of classes
0,My favourite food is anything I didn't have to...,27,[27],1
1,"Now if he does off himself, everyone will thin...",27,[27],1
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1
3,To make her feel threatened,14,[14],1
4,Dirty Southern Wankers,3,[3],1
...,...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,[18],1
43406,Always thought that was funny but is it a refe...,6,[6],1
43407,What are you talking about? Anything bad that ...,3,[3],1
43408,"More like a baptism, with sexy results!",13,[13],1


In [21]:
with open('ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [22]:
emotion_file = open("emotions.txt", "r")
emotion_list = emotion_file.read()
emotion_list = emotion_list.split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [23]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [24]:
df_train['Emotions'] = df_train['List of classes'].apply(idx2class)
df_val['Emotions'] = df_val['List of classes'].apply(idx2class)
df_test['Emotions'] = df_test['List of classes'].apply(idx2class)

In [25]:
ekman_mapping.update({'neutral':['neutral']})

In [26]:
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity'],
 'neutral': ['neutral']}

In [27]:
def EmotionMapping(emotion_list):
    map_list = []
    
    for i in emotion_list:
        if i in ekman_mapping['anger']:
            map_list.append('anger')
        if i in ekman_mapping['disgust']:
            map_list.append('disgust')
        if i in ekman_mapping['fear']:
            map_list.append('fear')
        if i in ekman_mapping['joy']:
            map_list.append('joy')
        if i in ekman_mapping['sadness']:
            map_list.append('sadness')
        if i in ekman_mapping['surprise']:
            map_list.append('surprise')
        if i in ekman_mapping['neutral']:
            map_list.append('neutral')
        
            
    return map_list

In [28]:
df_train['Mapped Emotions'] = df_train['Emotions'].apply(EmotionMapping)
df_val['Mapped Emotions'] = df_val['Emotions'].apply(EmotionMapping)
df_test['Mapped Emotions'] = df_test['Emotions'].apply(EmotionMapping)

In [29]:
df_train.head(3)

Unnamed: 0,Text,Class,List of classes,Len of classes,Emotions,Mapped Emotions
0,My favourite food is anything I didn't have to...,27,[27],1,[neutral],[neutral]
1,"Now if he does off himself, everyone will thin...",27,[27],1,[neutral],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1,[anger],[anger]


In [30]:
# OneHot encoding for multi-label classification
for emo in ekman_mapping:
    df_train[emo] = np.zeros((len(df_train),1))
    df_train[emo] = df_train['Mapped Emotions'].apply(lambda x: 1 if emo in x else 0)

for emo in ekman_mapping:
    df_val[emo] = np.zeros((len(df_val),1))
    df_val[emo] = df_val['Mapped Emotions'].apply(lambda x: 1 if emo in x else 0)    

for emo in ekman_mapping:
    df_test[emo] = np.zeros((len(df_test),1))
    df_test[emo] = df_test['Mapped Emotions'].apply(lambda x: 1 if emo in x else 0)



In [31]:
df_train.head(3)

Unnamed: 0,Text,Class,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
1,"Now if he does off himself, everyone will thin...",27,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1,[anger],[anger],1,0,0,0,0,0,0


In [32]:
df_train.drop(['Class','List of classes','Len of classes','Emotions','Mapped Emotions'], axis=1, inplace=True)
df_val.drop(['Class','List of classes','Len of classes','Emotions','Mapped Emotions'], axis=1, inplace=True)
df_test.drop(['Class','List of classes','Len of classes','Emotions','Mapped Emotions'], axis=1, inplace=True)


df_train_no_neu = df_train.copy()
df_val_no_neu = df_val.copy()
df_test_no_neu = df_test.copy()

df_train_no_neu = df_train_no_neu.drop(columns=['neutral'])
df_val_no_neu = df_val_no_neu.drop(columns=['neutral'])
df_test_no_neu = df_test_no_neu.drop(columns=['neutral'])

Then, we need remove all the samples that have been left without a label.

In [33]:
# Removing samples with only 0 in their labels
df_train_no_neu = df_train_no_neu.loc[ df_train_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
df_val_no_neu = df_val_no_neu.loc[ df_val_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
df_test_no_neu = df_test_no_neu.loc[ df_test_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]

# Shape validation
print(df_train_no_neu.shape)
print(df_train_no_neu.shape)
print(df_train_no_neu.shape)

(30587, 7)
(30587, 7)
(30587, 7)


In [34]:
df_train_no_neu.head(3)

Unnamed: 0,Text,anger,disgust,fear,joy,sadness,surprise
2,WHY THE FUCK IS BAYLESS ISOING,1,0,0,0,0,0
3,To make her feel threatened,0,0,1,0,0,0
4,Dirty Southern Wankers,1,0,0,0,0,0


In [35]:
# Building a preprocessing function to clean text
def preprocess_corpus(x):

  # Adding a space between words and punctation
  x = re.sub( r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
  x = re.sub( r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)
  
  # Demojize
  x = emoji.demojize(x)
  
  # Expand contraction
  x = contractions.fix(x)
  
  # Lower
  x = x.lower()

  #correct some acronyms/typos/abbreviations  
  x = re.sub(r"lmao", "laughing my ass off", x)  
  x = re.sub(r"amirite", "am i right", x)
  x = re.sub(r"\b(tho)\b", "though", x)
  x = re.sub(r"\b(ikr)\b", "i know right", x)
  x = re.sub(r"\b(ya|u)\b", "you", x)
  x = re.sub(r"\b(eu)\b", "europe", x)
  x = re.sub(r"\b(da)\b", "the", x)
  x = re.sub(r"\b(dat)\b", "that", x)
  x = re.sub(r"\b(dats)\b", "that is", x)
  x = re.sub(r"\b(cuz)\b", "because", x)
  x = re.sub(r"\b(fkn)\b", "fucking", x)
  x = re.sub(r"\b(tbh)\b", "to be honest", x)
  x = re.sub(r"\b(tbf)\b", "to be fair", x)
  x = re.sub(r"faux pas", "mistake", x)
  x = re.sub(r"\b(btw)\b", "by the way", x)
  x = re.sub(r"\b(bs)\b", "bullshit", x)
  x = re.sub(r"\b(kinda)\b", "kind of", x)
  x = re.sub(r"\b(bruh)\b", "bro", x)
  x = re.sub(r"\b(w/e)\b", "whatever", x)
  x = re.sub(r"\b(w/)\b", "with", x)
  x = re.sub(r"\b(w/o)\b", "without", x)
  x = re.sub(r"\b(doj)\b", "department of justice", x)
  
  #replace some words with multiple occurences of a letter, example "coooool" turns into --> cool
  x = re.sub(r"\b(j+e{2,}z+e*)\b", "jeez", x)
  x = re.sub(r"\b(co+l+)\b", "cool", x)
  x = re.sub(r"\b(g+o+a+l+)\b", "goal", x)
  x = re.sub(r"\b(s+h+i+t+)\b", "shit", x)
  x = re.sub(r"\b(o+m+g+)\b", "omg", x)
  x = re.sub(r"\b(w+t+f+)\b", "wtf", x)
  x = re.sub(r"\b(w+h+a+t+)\b", "what", x)
  x = re.sub(r"\b(y+e+y+|y+a+y+|y+e+a+h+)\b", "yeah", x)
  x = re.sub(r"\b(w+o+w+)\b", "wow", x)
  x = re.sub(r"\b(w+h+y+)\b", "why", x)
  x = re.sub(r"\b(s+o+)\b", "so", x)
  x = re.sub(r"\b(f)\b", "fuck", x)
  x = re.sub(r"\b(w+h+o+p+s+)\b", "whoops", x)
  x = re.sub(r"\b(ofc)\b", "of course", x)
  x = re.sub(r"\b(the us)\b", "usa", x)
  x = re.sub(r"\b(gf)\b", "girlfriend", x)
  x = re.sub(r"\b(hr)\b", "human ressources", x)
  x = re.sub(r"\b(mh)\b", "mental health", x)
  x = re.sub(r"\b(idk)\b", "i do not know", x)
  x = re.sub(r"\b(gotcha)\b", "i got you", x)
  x = re.sub(r"\b(y+e+p+)\b", "yes", x)
  x = re.sub(r"\b(a*ha+h[ha]*|a*ha +h[ha]*)\b", "haha", x)
  x = re.sub(r"\b(o?l+o+l+[ol]*)\b", "lol", x)
  x = re.sub(r"\b(o*ho+h[ho]*|o*ho +h[ho]*)\b", "ohoh", x)
  x = re.sub(r"\b(o+h+)\b", "oh", x)
  x = re.sub(r"\b(a+h+)\b", "ah", x)
  x = re.sub(r"\b(u+h+)\b", "uh", x)

  # Handling emojis
  x = re.sub(r"<3", " love ", x)
  x = re.sub(r"xd", " smiling_face_with_open_mouth_and_tightly_closed_eyes ", x)
  x = re.sub(r":\)", " smiling_face ", x)
  x = re.sub(r"^_^", " smiling_face ", x)
  x = re.sub(r"\*_\*", " star_struck ", x)
  x = re.sub(r":\(", " frowning_face ", x)
  x = re.sub(r":\^\(", " frowning_face ", x)
  x = re.sub(r";\(", " frowning_face ", x)
  x = re.sub(r":\/",  " confused_face", x)
  x = re.sub(r";\)",  " wink", x)
  x = re.sub(r">__<",  " unamused ", x)
  x = re.sub(r"\b([xo]+x*)\b", " xoxo ", x)
  x = re.sub(r"\b(n+a+h+)\b", "no", x)

  # Handling special cases of text
  x = re.sub(r"h a m b e r d e r s", "hamberders", x)
  x = re.sub(r"b e n", "ben", x)
  x = re.sub(r"s a t i r e", "satire", x)
  x = re.sub(r"y i k e s", "yikes", x)
  x = re.sub(r"s p o i l e r", "spoiler", x)
  x = re.sub(r"thankyou", "thank you", x)
  x = re.sub(r"a^r^o^o^o^o^o^o^o^n^d", "around", x)

  # Remove special characters and numbers replace by space + remove double space
  x = re.sub(r"\b([.]{3,})"," dots ", x)
  x = re.sub(r"[^A-Za-z!?_]+"," ", x)
  x = re.sub(r"\b([s])\b *","", x)
  x = re.sub(r" +"," ", x)
  x = x.strip()

  return x

In [36]:
# Defining the number of samples in train, validation and test dataset
size_train = df_train.shape[0]
size_val = df_val.shape[0]
size_test = df_test.shape[0]

# Defining the total number of samples
size_all = size_train + size_val + size_test

size_train

43410

In [37]:
# Shape of train, validation and test datasets
print("Train dataset has {} samples and represents {:.2f}% of overall data".format(size_train, size_train/size_all*100))
print("Validation dataset has {} samples and represents {:.2f}% of overall data".format(size_val, size_val/size_all*100))
print("Test dataset has {} samples and represents {:.2f}% of overall data".format(size_test, size_test/size_all*100))
print()
print("The total number of samples is : {}".format(size_all))

Train dataset has 43410 samples and represents 80.00% of overall data
Validation dataset has 5426 samples and represents 10.00% of overall data
Test dataset has 5427 samples and represents 10.00% of overall data

The total number of samples is : 54263


In [38]:
# Concatenating the 3 datasets for labels preprocessing
df_all = pd.concat([df_train, df_val, df_test], axis=0).reset_index(drop=True)

# Preview of data
print(df_all.head(5))

                                                Text  anger  disgust  fear  \
0  My favourite food is anything I didn't have to...      0        0     0   
1  Now if he does off himself, everyone will thin...      0        0     0   
2                     WHY THE FUCK IS BAYLESS ISOING      1        0     0   
3                        To make her feel threatened      0        0     1   
4                             Dirty Southern Wankers      1        0     0   

   joy  sadness  surprise  neutral  
0    0        0         0        1  
1    0        0         0        1  
2    0        0         0        0  
3    0        0         0        0  
4    0        0         0        0  


In [39]:
print(df_all.shape)

(54263, 8)


In [40]:
# Applying the preprocessing function on the dataset
df_all["Clean_text"] = df_all["Text"].apply(preprocess_corpus)

# Preview of data
print(df_all[['Text', 'Clean_text']].sample(5))

                                                    Text  \
34480                         Whoa this is really creepy   
15157  Ok, I will take this with a pinch of salt, but...   
28594  I love bloodborne, I started my second play th...   
20091  Oh my goodness, I'm so glad you were there too...   
14786  Yeah -- a woman who expects a guy to never eve...   

                                              Clean_text  
34480                         whoa this is really creepy  
15157  ok i will take this with a pinch of salt but t...  
28594  i love bloodborne i started my second play thr...  
20091  oh my goodness i am so glad you were there too...  
14786  yeah a woman who expects a guy to never even t...  


In [41]:
# Keeping only necessary columns
#df_all = df_all.drop(['Class','List of classes','Len of classes','Emotions'], axis=1)
df_all.head(3)

Unnamed: 0,Text,anger,disgust,fear,joy,sadness,surprise,neutral,Clean_text
0,My favourite food is anything I didn't have to...,0,0,0,0,0,0,1,my favourite food is anything i did not have t...
1,"Now if he does off himself, everyone will thin...",0,0,0,0,0,0,1,now if he does off himself everyone will think...
2,WHY THE FUCK IS BAYLESS ISOING,1,0,0,0,0,0,0,why the fuck is bayless isoing


In [42]:
# Dropping raw text column
df_all = df_all[ ['Clean_text','anger','disgust','fear','joy','sadness','surprise','neutral'] ]
df_all

Unnamed: 0,Clean_text,anger,disgust,fear,joy,sadness,surprise,neutral
0,my favourite food is anything i did not have t...,0,0,0,0,0,0,1
1,now if he does off himself everyone will think...,0,0,0,0,0,0,1
2,why the fuck is bayless isoing,1,0,0,0,0,0,0
3,to make her feel threatened,0,0,1,0,0,0,0
4,dirty southern wankers,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
54258,thanks i was diagnosed with bp after the hospi...,0,0,0,1,0,0,0
54259,well that makes sense,0,0,0,1,0,0,0
54260,daddy issues name,0,0,0,0,0,0,1
54261,so glad i discovered that subreddit a couple m...,0,0,0,1,0,0,0


In [43]:
emotion_dict={
"anger": 0,
"disgust": 1,
"fear": 2,
"joy": 3,
"sadness": 4,
"surprise": 5,
"neutral":6
}

# Dropping Mapped Emotions column

In [44]:
# Building a function that will divide in train, validation and test sets
def get_train_val_test(df):
    train = df.iloc[:size_train, :]
    val = df.iloc[size_train:size_train+size_val, :]
    test = df.iloc[size_train+size_val:size_train+size_val+size_test, :]
    return train, val, test

In [45]:
# Dividing back in train, validation and test datasets (GoEmotions)
train_ekman, val_ekman, test_ekman = get_train_val_test(df_all)
print(train_ekman.shape)
print(val_ekman.shape)
print(test_ekman.shape)

(43410, 8)
(5426, 8)
(5427, 8)


In [46]:
train_ekman.head(3)

Unnamed: 0,Clean_text,anger,disgust,fear,joy,sadness,surprise,neutral
0,my favourite food is anything i did not have t...,0,0,0,0,0,0,1
1,now if he does off himself everyone will think...,0,0,0,0,0,0,1
2,why the fuck is bayless isoing,1,0,0,0,0,0,0


In [47]:
train_ekman_no_neu = train_ekman.copy()
val_ekman_no_neu = val_ekman.copy()
test_ekman_no_neu = test_ekman.copy()

train_ekman_no_neu.reset_index(inplace = True)
val_ekman_no_neu.reset_index(inplace = True)
test_ekman_no_neu.reset_index(inplace = True)

train_ekman_no_neu = train_ekman_no_neu.drop('neutral', axis=1)
val_ekman_no_neu = val_ekman_no_neu.drop('neutral', axis=1)
test_ekman_no_neu = test_ekman_no_neu.drop('neutral', axis=1)

train_ekman_no_neu = train_ekman_no_neu.drop(columns=['index'])
val_ekman_no_neu = val_ekman_no_neu.drop(columns=['index'])
test_ekman_no_neu = test_ekman_no_neu.drop(columns=['index'])

# Removing samples with only 0 in their labels
train_ekman_no_neu = train_ekman_no_neu.loc[ train_ekman_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
val_ekman_no_neu = val_ekman_no_neu.loc[ val_ekman_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]
test_ekman_no_neu = test_ekman_no_neu.loc[ test_ekman_no_neu.apply(lambda x: sum(x[1:]), axis=1)>0 ]

# Shape validation
print(train_ekman_no_neu.shape)
print(val_ekman_no_neu.shape)
print(test_ekman_no_neu.shape)

(30587, 7)
(3834, 7)
(3821, 7)


In [48]:
train_ekman_no_neu.head(3)

Unnamed: 0,Clean_text,anger,disgust,fear,joy,sadness,surprise
2,why the fuck is bayless isoing,1,0,0,0,0,0
3,to make her feel threatened,0,0,1,0,0,0
4,dirty southern wankers,1,0,0,0,0,0


In [49]:
class_label_names_no_neu = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
class_label_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

In [50]:
emotion_label_dict={
0:"anger",
1:"disgust",
2:"fear",
3:"joy",
4:"sadness",
5:"surprise",
6:"neutral"
}

# Loading emotion labels for GoEmotions taxonomy
with open("emotions.txt", "r") as file:
    GE_taxonomy = file.read().split("\n")


EKMAN_taxonomy = ['anger',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise',
 'neutral']


EKMAN_taxonomy_no_neu = ['anger',
 'disgust',
 'fear',
 'joy',
 'sadness',
 'surprise']

In [51]:
X_train_ekman = train_ekman[:]["Clean_text"]
y_train_ekman = train_ekman.loc[:, EKMAN_taxonomy].values.astype(float)
X_train_ekman_no_neu = train_ekman_no_neu[:]["Clean_text"]
y_train_ekman_no_neu = train_ekman_no_neu.loc[:, EKMAN_taxonomy_no_neu].values.astype(float)
X_val_ekman = val_ekman[:]["Clean_text"]
y_val_ekman = val_ekman.loc[:, EKMAN_taxonomy].values.astype(float)
X_val_ekman_no_neu = val_ekman_no_neu[:]["Clean_text"]
y_val_ekman_no_neu = val_ekman_no_neu.loc[:, EKMAN_taxonomy_no_neu].values.astype(float)
X_test_ekman = test_ekman[:]["Clean_text"]
y_test_ekman = test_ekman.loc[:, EKMAN_taxonomy].values.astype(float)
X_test_ekman_no_neu = test_ekman_no_neu[:]["Clean_text"]
y_test_ekman_no_neu = test_ekman_no_neu.loc[:, EKMAN_taxonomy_no_neu].values.astype(float)
print(X_train_ekman.shape, y_train_ekman.shape,y_train_ekman_no_neu.shape, 
      X_val_ekman.shape, y_val_ekman.shape,y_val_ekman_no_neu.shape,
      X_test_ekman.shape, y_test_ekman.shape, y_test_ekman_no_neu.shape)

(43410,) (43410, 7) (30587, 6) (5426,) (5426, 7) (3834, 6) (5427,) (5427, 7) (3821, 6)


# 2 - Modeling : BERT (Bidirectional Encoder Representations from Transformers)

Now we can go ahead and start defining our BERT-based model.

## 2.1 - Configuration of the base model

First of all, let's define a `max_length` variable. This variable sets a fixed length of sequences to be fed to our model. Therefore, sequences will be either truncated if larger than this value, or completed using padding if smaller. To avoid truncating, we fix this value according to the largest sample of our data.

In [52]:
# Computing max length of samples
full_text = pd.concat([train_GE_Sampled['text'], val_GE_Sampled['text'], test_GE_Sampled['text']])
max_length = full_text.apply(lambda x: len(x.split())).max()
max_length

48

We are going to use BERT's base model which contains almost 110 M trainable parameters. 

Also, in order to match the tokenization and vocabulary used during the training, we are going to use a BERT tokenizer.

# Importing BERT pre-trained model and tokenizer
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

!pip install GPUtil

!pip install numba

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()

### Model 1 - Go Emotion - 28 labels including  Neutral emotion

In [52]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [53]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[target_cols]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [54]:
train_dataset = CustomDataset(
  sampled_train_df,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  val_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  test_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

In [55]:
test_GE_Sampled.columns

Index(['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [58]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 28)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [59]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

In [60]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [61]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 1, Loss:  0.7084058523178101
Epoch: 2, Loss:  0.08420690894126892
Epoch: 3, Loss:  0.07675381004810333
Epoch: 4, Loss:  0.04872593656182289


In [62]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [63]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

 for epoch 5 test - lr: 3e-05
#for epoch in range(EPOCHS):
y_pred_proba, targets = validation(1) # epoch
outputs = np.array(y_pred_proba) >= 0.3
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

#### Results for epoch=4, lr=3e-05

In [65]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE, targets_train = validation(train_loader,model) # epoch
y_pred_train_GE = np.array(y_pred_proba_train_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7679106196728864
F1 Score (Micro) = 0.8597723774722802
F1 Score (Macro) = 0.7554309800573983


In [70]:
cr = classification_report(targets_train,y_pred_train_GE,target_names= GE_taxonomy)
print(cr)

                precision    recall  f1-score   support

    admiration       0.94      0.90      0.92      4130
     amusement       0.89      0.94      0.92      2328
         anger       0.86      0.83      0.85      1567
     annoyance       0.85      0.65      0.74      2470
      approval       0.93      0.69      0.79      2939
        caring       0.90      0.78      0.83      1087
     confusion       0.86      0.66      0.75      1368
     curiosity       0.89      0.80      0.84      2191
        desire       0.88      0.75      0.81       641
disappointment       0.89      0.56      0.69      1269
   disapproval       0.94      0.74      0.82      2022
       disgust       0.91      0.67      0.77       793
 embarrassment       0.96      0.63      0.76       303
    excitement       0.83      0.74      0.78       853
          fear       0.93      0.84      0.88       596
     gratitude       0.97      0.93      0.95      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
# Model evaluation
#model_eval(targets_train, y_pred_train_GE, GE_taxonomy)

In [76]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE, targets_test = validation(test_loader,model) # epoch
y_pred_test_GE = np.array(y_pred_proba_test_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.46434494195688225
F1 Score (Micro) = 0.5687401986408782
F1 Score (Macro) = 0.4598725937206782


In [77]:
# Model evaluation
model_eval(y_test, y_pred_test_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.65,0.67
amusement,0.77,0.88,0.82
anger,0.57,0.39,0.46
annoyance,0.38,0.28,0.32
approval,0.42,0.29,0.34
caring,0.49,0.36,0.42
confusion,0.44,0.36,0.4
curiosity,0.54,0.42,0.48
desire,0.55,0.35,0.43
disappointment,0.41,0.16,0.23


In [78]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE, targets_val = validation(validation_loader,model) # epoch
y_pred_val_GE = np.array(y_pred_proba_val_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.45705860670844084
F1 Score (Micro) = 0.5666897028334485
F1 Score (Macro) = 0.4581808360109007


In [79]:
# Model evaluation
model_eval(y_val, y_pred_val_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.7,0.71
amusement,0.76,0.83,0.79
anger,0.55,0.43,0.48
annoyance,0.39,0.33,0.36
approval,0.4,0.24,0.3
caring,0.5,0.33,0.4
confusion,0.48,0.36,0.41
curiosity,0.52,0.39,0.45
desire,0.49,0.48,0.49
disappointment,0.42,0.15,0.22


In [80]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [81]:
# Generate labels for train, test and val
y_pred_train_labels = proba_to_labels(y_pred_proba_train_GE)

y_pred_test_labels = proba_to_labels(y_pred_proba_test_GE)

y_pred_val_labels = proba_to_labels(y_pred_proba_val_GE)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [82]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [84]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_opt, threshold_test_GE_opt, macro_f1_test_GE_opt = proba_to_labels_opt(y_test, y_pred_proba_test_GE)
print("The model's threshold is {}".format(threshold_test_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.32999999999999985
The model's best macro-f1 is 0.49506046922925356


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_opt, threshold_val_GE_opt, macro_f1_val_GE_opt = proba_to_labels_opt(y_val, y_pred_proba_val_GE)
print("The model's threshold is {}".format(threshold_val_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.23999999999999994
The model's best macro-f1 is 0.48601307151663803


In [87]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.71,0.68
amusement,0.74,0.91,0.82
anger,0.5,0.45,0.48
annoyance,0.34,0.4,0.37
approval,0.36,0.38,0.37
caring,0.42,0.43,0.42
confusion,0.35,0.48,0.41
curiosity,0.5,0.59,0.54
desire,0.46,0.41,0.43
disappointment,0.36,0.26,0.31


In [88]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.79,0.7
amusement,0.7,0.89,0.78
anger,0.46,0.55,0.5
annoyance,0.3,0.5,0.38
approval,0.33,0.4,0.36
caring,0.39,0.49,0.44
confusion,0.32,0.5,0.39
curiosity,0.47,0.68,0.55
desire,0.38,0.58,0.46
disappointment,0.3,0.29,0.29


In [94]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_opt, axis=1)==0)

126

In [95]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_opt, axis=1)==0)

37

In [89]:
# Handling empty predictions for test
y_pred_labels_test_GE_opt_h = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_opt_h[i])]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.54,0.73,0.62
amusement,0.74,0.91,0.82
anger,0.5,0.45,0.48
annoyance,0.34,0.4,0.37
approval,0.36,0.38,0.37
caring,0.42,0.43,0.42
confusion,0.35,0.48,0.41
curiosity,0.5,0.59,0.54
desire,0.46,0.41,0.43
disappointment,0.36,0.26,0.31


In [90]:
# Handling empty predictions for val
y_pred_labels_val_GE_opt_h = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_opt_h[i])]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.8,0.68
amusement,0.7,0.89,0.78
anger,0.46,0.55,0.5
annoyance,0.3,0.5,0.38
approval,0.33,0.4,0.36
caring,0.39,0.49,0.44
confusion,0.32,0.5,0.39
curiosity,0.47,0.68,0.55
desire,0.38,0.58,0.46
disappointment,0.3,0.29,0.29


In [91]:
# Handling empty predictions
y_pred_labels_test_GE_opt_n = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_test_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.71,0.68
amusement,0.74,0.91,0.82
anger,0.5,0.45,0.48
annoyance,0.34,0.4,0.37
approval,0.36,0.38,0.37
caring,0.42,0.43,0.42
confusion,0.35,0.48,0.41
curiosity,0.5,0.59,0.54
desire,0.46,0.41,0.43
disappointment,0.36,0.26,0.31


In [93]:
# Handling empty predictions
y_pred_labels_val_GE_opt_n = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_val_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.79,0.7
amusement,0.7,0.89,0.78
anger,0.46,0.55,0.5
annoyance,0.3,0.5,0.38
approval,0.33,0.4,0.36
caring,0.39,0.49,0.44
confusion,0.32,0.5,0.39
curiosity,0.47,0.68,0.55
desire,0.38,0.58,0.46
disappointment,0.3,0.29,0.29


### Saving the Model

In [97]:
PATH = "BERT_GoEmotion_1.pt"
torch.save(model.state_dict(), PATH)


#### Loading the model

In [98]:
PATH = "BERT_GoEmotion_1.pt"
model_GE_1 = BERTClass()
model_GE_1.load_state_dict(torch.load(PATH))
model_GE_1.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

### Model 2 - Go Emotion - 28 labels including  Neutral emotion

In [52]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 8
TEST_BATCH_SIZE =8
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [53]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[target_cols]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [54]:
train_dataset = CustomDataset(
  sampled_train_df,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  val_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  test_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

In [55]:
test_GE_Sampled.columns

Index(['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [58]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 28)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [59]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

In [60]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [61]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 1, Loss:  0.7128723859786987
Epoch: 1, Loss:  0.10321096330881119
Epoch: 2, Loss:  0.0785074457526207
Epoch: 2, Loss:  0.09882600605487823
Epoch: 3, Loss:  0.07843275368213654
Epoch: 3, Loss:  0.09239786118268967
Epoch: 4, Loss:  0.043027009814977646
Epoch: 4, Loss:  0.04643067345023155


In [62]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [63]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

 for epoch 5 test - lr: 3e-05
#for epoch in range(EPOCHS):
y_pred_proba, targets = validation(1) # epoch
outputs = np.array(y_pred_proba) >= 0.3
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

#### Results for epoch=4, lr=3e-05

In [64]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE, targets_train = validation(train_loader,model) # epoch
y_pred_train_GE = np.array(y_pred_proba_train_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7791983413959918
F1 Score (Micro) = 0.8674778921991416
F1 Score (Macro) = 0.7710649399006966


In [65]:
cr = classification_report(targets_train,y_pred_train_GE,target_names= GE_taxonomy)
print(cr)

                precision    recall  f1-score   support

    admiration       0.95      0.89      0.92      4130
     amusement       0.92      0.91      0.92      2328
         anger       0.80      0.88      0.84      1567
     annoyance       0.81      0.75      0.78      2470
      approval       0.93      0.68      0.79      2939
        caring       0.92      0.77      0.84      1087
     confusion       0.92      0.69      0.78      1368
     curiosity       0.87      0.85      0.86      2191
        desire       0.86      0.77      0.81       641
disappointment       0.87      0.66      0.75      1269
   disapproval       0.94      0.75      0.84      2022
       disgust       0.95      0.67      0.78       793
 embarrassment       0.96      0.63      0.76       303
    excitement       0.85      0.70      0.77       853
          fear       0.96      0.85      0.90       596
     gratitude       0.97      0.94      0.95      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###### Model evaluation
model_eval(y_train, y_pred_train_GE, GE_taxonomy)

In [66]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE, targets_test = validation(test_loader,model) # epoch
y_pred_test_GE = np.array(y_pred_proba_test_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.45992260917634054
F1 Score (Micro) = 0.5833872010342598
F1 Score (Macro) = 0.48577258367220083


In [67]:
# Model evaluation
model_eval(y_test, y_pred_test_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.67,0.68
amusement,0.76,0.87,0.81
anger,0.49,0.51,0.5
annoyance,0.31,0.37,0.33
approval,0.45,0.33,0.38
caring,0.44,0.36,0.4
confusion,0.41,0.41,0.41
curiosity,0.48,0.56,0.52
desire,0.58,0.46,0.51
disappointment,0.38,0.29,0.33


In [68]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE, targets = validation(validation_loader,model) # epoch
y_pred_val_GE = np.array(y_pred_proba_val_GE) >= 0.5
accuracy = metrics.accuracy_score(targets, y_pred_val_GE)
f1_score_micro = metrics.f1_score(targets, y_pred_val_GE, average='micro')
f1_score_macro = metrics.f1_score(targets, y_pred_val_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4574272023590122
F1 Score (Micro) = 0.5656271418779987
F1 Score (Macro) = 0.46466856086553865


In [69]:
# Model evaluation
model_eval(y_val, y_pred_val_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.68,0.7
amusement,0.79,0.8,0.79
anger,0.47,0.48,0.48
annoyance,0.34,0.39,0.37
approval,0.46,0.23,0.31
caring,0.55,0.31,0.39
confusion,0.51,0.33,0.4
curiosity,0.49,0.44,0.46
desire,0.55,0.51,0.53
disappointment,0.38,0.18,0.25


In [70]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [71]:
# Generate labels for train, test and val
y_pred_train_labels = proba_to_labels(y_pred_proba_train_GE)

y_pred_test_labels = proba_to_labels(y_pred_proba_test_GE)

y_pred_val_labels = proba_to_labels(y_pred_proba_val_GE)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [72]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [73]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_opt, threshold_test_GE_opt, macro_f1_test_GE_opt = proba_to_labels_opt(y_test, y_pred_proba_test_GE)
print("The model's threshold is {}".format(threshold_test_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.23999999999999994
The model's best macro-f1 is 0.4956210149387412


  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_opt, threshold_val_GE_opt, macro_f1_val_GE_opt = proba_to_labels_opt(y_val, y_pred_proba_val_GE)
print("The model's threshold is {}".format(threshold_val_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.2599999999999999
The model's best macro-f1 is 0.4901004643419561


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.73,0.68
amusement,0.74,0.92,0.82
anger,0.41,0.62,0.49
annoyance,0.26,0.51,0.34
approval,0.38,0.41,0.39
caring,0.36,0.44,0.39
confusion,0.31,0.55,0.4
curiosity,0.46,0.74,0.56
desire,0.48,0.51,0.49
disappointment,0.28,0.35,0.31


In [76]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.78,0.71
amusement,0.74,0.86,0.79
anger,0.38,0.63,0.47
annoyance,0.27,0.57,0.37
approval,0.35,0.32,0.34
caring,0.43,0.45,0.44
confusion,0.35,0.47,0.41
curiosity,0.43,0.66,0.52
desire,0.44,0.57,0.49
disappointment,0.3,0.35,0.32


In [77]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_opt, axis=1)==0)

20

In [78]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_opt, axis=1)==0)

27

In [79]:
# Handling empty predictions for test
y_pred_labels_test_GE_opt_h = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_opt_h[i])]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.73,0.68
amusement,0.74,0.92,0.82
anger,0.41,0.62,0.49
annoyance,0.26,0.51,0.34
approval,0.38,0.41,0.39
caring,0.36,0.44,0.39
confusion,0.31,0.55,0.4
curiosity,0.46,0.74,0.56
desire,0.48,0.51,0.49
disappointment,0.28,0.35,0.31


In [80]:
# Handling empty predictions for val
y_pred_labels_val_GE_opt_h = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_opt_h[i])]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.79,0.7
amusement,0.74,0.86,0.79
anger,0.38,0.63,0.47
annoyance,0.27,0.57,0.37
approval,0.35,0.32,0.34
caring,0.43,0.45,0.44
confusion,0.35,0.47,0.41
curiosity,0.43,0.66,0.52
desire,0.44,0.57,0.49
disappointment,0.3,0.35,0.32


In [81]:
# Handling empty predictions
y_pred_labels_test_GE_opt_n = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_test_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.73,0.68
amusement,0.74,0.92,0.82
anger,0.41,0.62,0.49
annoyance,0.26,0.51,0.34
approval,0.38,0.41,0.39
caring,0.36,0.44,0.39
confusion,0.31,0.55,0.4
curiosity,0.46,0.74,0.56
desire,0.48,0.51,0.49
disappointment,0.28,0.35,0.31


In [82]:
# Handling empty predictions
y_pred_labels_val_GE_opt_n = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_val_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.78,0.71
amusement,0.74,0.86,0.79
anger,0.38,0.63,0.47
annoyance,0.27,0.57,0.37
approval,0.35,0.32,0.34
caring,0.43,0.45,0.44
confusion,0.35,0.47,0.41
curiosity,0.43,0.66,0.52
desire,0.44,0.57,0.49
disappointment,0.3,0.35,0.32


### Saving the Model

In [83]:
PATH = "BERT_GoEmotion_2.pt"
torch.save(model.state_dict(), PATH)


#### Loading the model

In [84]:
PATH = "BERT_GoEmotion_2.pt"
model_GE_2 = BERTClass()
model_GE_2.load_state_dict(torch.load(PATH))
model_GE_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

### Model 3 - Go Emotion - 28 labels including  Neutral emotion



### Final

In [52]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 32
TEST_BATCH_SIZE =32
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [53]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[target_cols]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [54]:
train_dataset = CustomDataset(
  sampled_train_df,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  val_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  test_GE_Sampled,
  tokenizer,
  max_len=MAX_LEN
)

In [55]:
test_GE_Sampled.columns

Index(['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [58]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 28)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [59]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

In [60]:
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [61]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 1, Loss:  0.7022333741188049
Epoch: 2, Loss:  0.09739448130130768
Epoch: 3, Loss:  0.10452917963266373
Epoch: 4, Loss:  0.045089077204465866


In [62]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [63]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

 for epoch 5 test - lr: 3e-05
#for epoch in range(EPOCHS):
y_pred_proba, targets = validation(1) # epoch
outputs = np.array(y_pred_proba) >= 0.3
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

#### Results for epoch=4, lr=3e-05

In [64]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE, targets_train = validation(train_loader,model) # epoch
y_pred_train_GE = np.array(y_pred_proba_train_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7290486063119097
F1 Score (Micro) = 0.8292971181951524
F1 Score (Macro) = 0.6990493214980976


In [65]:
cr = classification_report(targets_train,y_pred_train_GE,target_names= GE_taxonomy)
print(cr)

                precision    recall  f1-score   support

    admiration       0.90      0.92      0.91      4130
     amusement       0.88      0.93      0.91      2328
         anger       0.82      0.78      0.80      1567
     annoyance       0.78      0.62      0.69      2470
      approval       0.90      0.63      0.74      2939
        caring       0.89      0.71      0.79      1087
     confusion       0.87      0.57      0.69      1368
     curiosity       0.83      0.79      0.81      2191
        desire       0.79      0.76      0.77       641
disappointment       0.85      0.54      0.66      1269
   disapproval       0.82      0.76      0.79      2022
       disgust       0.87      0.65      0.74       793
 embarrassment       0.88      0.62      0.73       303
    excitement       0.80      0.66      0.73       853
          fear       0.83      0.91      0.87       596
     gratitude       0.98      0.92      0.95      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###### Model evaluation
model_eval(y_train, y_pred_train_GE, GE_taxonomy)

In [66]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE, targets_test = validation(test_loader,model) # epoch
y_pred_test_GE = np.array(y_pred_proba_test_GE) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4617652478348996
F1 Score (Micro) = 0.569127046668418
F1 Score (Macro) = 0.45893365946320247


In [67]:
# Model evaluation
model_eval(y_test, y_pred_test_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.7,0.67
amusement,0.79,0.86,0.82
anger,0.56,0.4,0.47
annoyance,0.34,0.29,0.31
approval,0.47,0.31,0.37
caring,0.53,0.3,0.38
confusion,0.5,0.31,0.39
curiosity,0.55,0.5,0.52
desire,0.58,0.46,0.51
disappointment,0.52,0.2,0.29


In [68]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE, targets = validation(validation_loader,model) # epoch
y_pred_val_GE = np.array(y_pred_proba_val_GE) >= 0.5
accuracy = metrics.accuracy_score(targets, y_pred_val_GE)
f1_score_micro = metrics.f1_score(targets, y_pred_val_GE, average='micro')
f1_score_macro = metrics.f1_score(targets, y_pred_val_GE, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.463509030593439
F1 Score (Micro) = 0.5768057854840115
F1 Score (Macro) = 0.47348143166946505


In [69]:
# Model evaluation
model_eval(y_val, y_pred_val_GE, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.77,0.73
amusement,0.77,0.83,0.8
anger,0.55,0.41,0.47
annoyance,0.38,0.36,0.37
approval,0.57,0.28,0.38
caring,0.6,0.34,0.44
confusion,0.6,0.35,0.44
curiosity,0.49,0.4,0.44
desire,0.57,0.53,0.55
disappointment,0.42,0.17,0.24


In [70]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [71]:
# Generate labels for train, test and val
y_pred_train_labels = proba_to_labels(y_pred_proba_train_GE)

y_pred_test_labels = proba_to_labels(y_pred_proba_test_GE)

y_pred_val_labels = proba_to_labels(y_pred_proba_val_GE)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [72]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [73]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_opt, threshold_test_GE_opt, macro_f1_test_GE_opt = proba_to_labels_opt(y_test, y_pred_proba_test_GE)
print("The model's threshold is {}".format(threshold_test_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.23999999999999994
The model's best macro-f1 is 0.49817440684799286


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_opt, threshold_val_GE_opt, macro_f1_val_GE_opt = proba_to_labels_opt(y_val, y_pred_proba_val_GE)
print("The model's threshold is {}".format(threshold_val_GE_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.24999999999999992
The model's best macro-f1 is 0.49759084076693305


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
# Model evaluation
model_eval(y_test, y_pred_labels_test_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.58,0.8,0.67
amusement,0.74,0.92,0.82
anger,0.44,0.54,0.49
annoyance,0.28,0.51,0.36
approval,0.33,0.45,0.38
caring,0.41,0.44,0.42
confusion,0.35,0.58,0.44
curiosity,0.47,0.77,0.59
desire,0.49,0.53,0.51
disappointment,0.33,0.33,0.33


In [76]:
# Model evaluation
model_eval(y_val, y_pred_labels_val_GE_opt, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.83,0.7
amusement,0.73,0.87,0.79
anger,0.42,0.55,0.48
annoyance,0.28,0.58,0.38
approval,0.38,0.4,0.39
caring,0.49,0.51,0.5
confusion,0.4,0.51,0.45
curiosity,0.45,0.71,0.55
desire,0.41,0.61,0.49
disappointment,0.29,0.31,0.3


In [77]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_opt, axis=1)==0)

21

In [78]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_opt, axis=1)==0)

46

In [79]:
# Handling empty predictions for test
y_pred_labels_test_GE_opt_h = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_opt_h[i])]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.57,0.81,0.66
amusement,0.74,0.92,0.82
anger,0.44,0.54,0.49
annoyance,0.28,0.51,0.36
approval,0.33,0.45,0.38
caring,0.41,0.44,0.42
confusion,0.35,0.58,0.44
curiosity,0.47,0.77,0.59
desire,0.49,0.53,0.51
disappointment,0.33,0.33,0.33


In [80]:
# Handling empty predictions for val
y_pred_labels_val_GE_opt_h = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_opt_h[i])]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_h, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.57,0.83,0.67
amusement,0.73,0.87,0.79
anger,0.42,0.55,0.48
annoyance,0.28,0.58,0.38
approval,0.38,0.4,0.39
caring,0.49,0.51,0.5
confusion,0.4,0.51,0.45
curiosity,0.45,0.71,0.55
desire,0.41,0.61,0.49
disappointment,0.29,0.31,0.3


In [81]:
# Handling empty predictions
y_pred_labels_test_GE_opt_n = np.copy(y_pred_labels_test_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_test_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_test, y_pred_labels_test_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.58,0.8,0.67
amusement,0.74,0.92,0.82
anger,0.44,0.54,0.49
annoyance,0.28,0.51,0.36
approval,0.33,0.45,0.38
caring,0.41,0.44,0.42
confusion,0.35,0.58,0.44
curiosity,0.47,0.77,0.59
desire,0.49,0.53,0.51
disappointment,0.33,0.33,0.33


In [82]:
# Handling empty predictions
y_pred_labels_val_GE_opt_n = np.copy(y_pred_labels_val_GE_opt)

# if no predictions ==> neutral
for pred in y_pred_labels_val_GE_opt_n:
    if pred.sum()==0:
        pred[-1]=1

# Evaluation
model_eval(y_val, y_pred_labels_val_GE_opt_n, GE_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.83,0.7
amusement,0.73,0.87,0.79
anger,0.42,0.55,0.48
annoyance,0.28,0.58,0.38
approval,0.38,0.4,0.39
caring,0.49,0.51,0.5
confusion,0.4,0.51,0.45
curiosity,0.45,0.71,0.55
desire,0.41,0.61,0.49
disappointment,0.29,0.31,0.3


### Saving the Model

In [83]:
PATH = "BERT_GoEmotion_3.pt"
torch.save(model.state_dict(), PATH)


#### Loading the model

In [84]:
PATH = "BERT_GoEmotion_3.pt"
model_GE_3 = BERTClass()
model_GE_3.load_state_dict(torch.load(PATH))
model_GE_3.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

df_train_GE_no_neu
df_val_GE_no_neu
df_test_GE_no_neu

### Goemotion taxonomy with 27 labels excluding neutral emotion

###  Model 1

In [106]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [111]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [112]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [113]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [114]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [115]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [116]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [117]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [118]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.6878913640975952
Epoch: 2, Loss:  0.09186509251594543
Epoch: 3, Loss:  0.08754180371761322
Epoch: 4, Loss:  0.04824312403798103


In [120]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [121]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [125]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.747376336352045
F1 Score (Micro) = 0.8612890258609138
F1 Score (Macro) = 0.7860408400750243


In [126]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.89      0.95      0.92      4130
     amusement       0.90      0.96      0.93      2328
         anger       0.73      0.93      0.82      1567
     annoyance       0.86      0.77      0.81      2470
      approval       0.95      0.81      0.88      2939
        caring       0.81      0.89      0.85      1087
     confusion       0.84      0.80      0.82      1368
     curiosity       0.79      0.96      0.87      2191
        desire       0.86      0.85      0.86       641
disappointment       0.81      0.73      0.76      1269
   disapproval       0.90      0.87      0.89      2022
       disgust       0.83      0.79      0.81       793
 embarrassment       0.89      0.72      0.80       303
    excitement       0.75      0.86      0.80       853
          fear       0.77      0.95      0.85       596
     gratitude       0.97      0.95      0.96      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [150]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.49018581523161475
F1 Score (Micro) = 0.6167870249792826
F1 Score (Macro) = 0.5160462480296394


In [151]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.73,0.72
amusement,0.82,0.88,0.85
anger,0.53,0.51,0.52
annoyance,0.52,0.3,0.38
approval,0.61,0.41,0.49
caring,0.54,0.47,0.5
confusion,0.55,0.45,0.5
curiosity,0.68,0.73,0.7
desire,0.76,0.41,0.53
disappointment,0.46,0.25,0.32


In [152]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48852373500260826
F1 Score (Micro) = 0.6112550035319049
F1 Score (Macro) = 0.5096072512246117


In [153]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.77,0.75
amusement,0.83,0.83,0.83
anger,0.58,0.52,0.55
annoyance,0.45,0.26,0.33
approval,0.66,0.34,0.45
caring,0.58,0.49,0.53
confusion,0.46,0.36,0.4
curiosity,0.63,0.69,0.66
desire,0.69,0.55,0.61
disappointment,0.51,0.28,0.36


In [135]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [136]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [137]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [139]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.22999999999999995
The model's best macro-f1 is 0.5439786177818173


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [141]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.21999999999999995
The model's best macro-f1 is 0.5429963869122217


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [142]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.64,0.82,0.72
amusement,0.79,0.93,0.85
anger,0.46,0.63,0.54
annoyance,0.43,0.48,0.46
approval,0.49,0.5,0.5
caring,0.42,0.58,0.49
confusion,0.41,0.63,0.49
curiosity,0.61,0.83,0.71
desire,0.67,0.59,0.63
disappointment,0.32,0.38,0.35


In [143]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.84,0.73
amusement,0.78,0.87,0.82
anger,0.45,0.62,0.52
annoyance,0.41,0.54,0.47
approval,0.56,0.45,0.5
caring,0.49,0.65,0.56
confusion,0.39,0.55,0.45
curiosity,0.56,0.81,0.67
desire,0.52,0.62,0.56
disappointment,0.39,0.46,0.42


In [146]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [148]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.82,0.71
amusement,0.79,0.93,0.85
anger,0.46,0.63,0.54
annoyance,0.43,0.48,0.46
approval,0.49,0.5,0.5
caring,0.42,0.58,0.49
confusion,0.41,0.63,0.49
curiosity,0.61,0.83,0.71
desire,0.67,0.59,0.63
disappointment,0.32,0.38,0.35


In [149]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.84,0.71
amusement,0.78,0.87,0.82
anger,0.45,0.62,0.52
annoyance,0.41,0.54,0.47
approval,0.56,0.45,0.5
caring,0.49,0.65,0.56
confusion,0.39,0.55,0.45
curiosity,0.56,0.81,0.67
desire,0.52,0.62,0.56
disappointment,0.39,0.46,0.42


### Saving the Model

In [154]:
PATH = "BERT_GoEmotion_no_neu_1.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [158]:
PATH = "BERT_GoEmotion_no_neu_1.pt"
model_GE_no_neu_1 = BERTClass()
model_GE_no_neu_1.load_state_dict(torch.load(PATH))
model_GE_no_neu_1.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 2

In [159]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 4
LEARNING_RATE = 5e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [160]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [161]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [162]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [163]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [164]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [165]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [166]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [167]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7179248929023743
Epoch: 2, Loss:  0.08816198259592056
Epoch: 3, Loss:  0.08026814460754395
Epoch: 4, Loss:  0.08449606597423553


In [168]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [169]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [170]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7719619446169942
F1 Score (Micro) = 0.8663728122442533
F1 Score (Macro) = 0.8114794217016993


In [171]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.97      0.87      0.92      4130
     amusement       0.92      0.95      0.93      2328
         anger       0.79      0.91      0.84      1567
     annoyance       0.93      0.63      0.75      2470
      approval       0.94      0.82      0.87      2939
        caring       0.95      0.79      0.86      1087
     confusion       0.93      0.80      0.86      1368
     curiosity       0.92      0.87      0.90      2191
        desire       0.90      0.81      0.85       641
disappointment       0.96      0.59      0.73      1269
   disapproval       0.94      0.84      0.88      2022
       disgust       0.94      0.70      0.80       793
 embarrassment       0.91      0.74      0.82       303
    excitement       0.88      0.71      0.79       853
          fear       0.90      0.94      0.92       596
     gratitude       0.95      0.95      0.95      2662
         grief       0.91      0.13      0.23  

  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [174]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48233446741690655
F1 Score (Micro) = 0.6053785096552542
F1 Score (Macro) = 0.5317509629388136


In [175]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.75,0.65,0.7
amusement,0.81,0.89,0.84
anger,0.52,0.56,0.54
annoyance,0.47,0.3,0.37
approval,0.51,0.52,0.51
caring,0.62,0.39,0.48
confusion,0.47,0.47,0.47
curiosity,0.69,0.61,0.65
desire,0.71,0.47,0.57
disappointment,0.56,0.21,0.31


In [176]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.47730829420970267
F1 Score (Micro) = 0.6040835484628021
F1 Score (Macro) = 0.5418467647369914


In [177]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.77,0.69,0.73
amusement,0.79,0.83,0.81
anger,0.53,0.57,0.55
annoyance,0.43,0.29,0.35
approval,0.52,0.46,0.49
caring,0.73,0.44,0.55
confusion,0.55,0.39,0.46
curiosity,0.63,0.58,0.61
desire,0.61,0.56,0.59
disappointment,0.46,0.14,0.22


In [178]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [179]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [180]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [181]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.19999999999999996
The model's best macro-f1 is 0.5459531356294162


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [182]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.3699999999999999
The model's best macro-f1 is 0.5576942220998831


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [183]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.75,0.72
amusement,0.77,0.94,0.85
anger,0.42,0.66,0.51
annoyance,0.36,0.57,0.44
approval,0.38,0.66,0.48
caring,0.44,0.54,0.48
confusion,0.38,0.65,0.48
curiosity,0.59,0.82,0.69
desire,0.52,0.55,0.53
disappointment,0.42,0.38,0.4


In [184]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.74,0.73,0.74
amusement,0.77,0.86,0.82
anger,0.5,0.63,0.55
annoyance,0.4,0.4,0.4
approval,0.48,0.52,0.5
caring,0.64,0.48,0.55
confusion,0.49,0.44,0.46
curiosity,0.61,0.7,0.65
desire,0.58,0.56,0.57
disappointment,0.41,0.19,0.26


In [185]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [186]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.68,0.75,0.71
amusement,0.77,0.94,0.85
anger,0.42,0.66,0.51
annoyance,0.36,0.57,0.44
approval,0.38,0.66,0.48
caring,0.44,0.54,0.48
confusion,0.38,0.65,0.48
curiosity,0.59,0.82,0.69
desire,0.52,0.55,0.53
disappointment,0.42,0.38,0.4


In [187]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.61,0.75,0.67
amusement,0.77,0.86,0.82
anger,0.5,0.63,0.55
annoyance,0.4,0.4,0.4
approval,0.48,0.52,0.5
caring,0.64,0.48,0.55
confusion,0.49,0.44,0.46
curiosity,0.61,0.7,0.65
desire,0.58,0.56,0.57
disappointment,0.41,0.19,0.26


### Saving the Model

In [188]:
PATH = "BERT_GoEmotion_no_neu_2.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [189]:
PATH = "BERT_GoEmotion_no_neu_2.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 3

In [190]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 5
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [191]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [192]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [193]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [194]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [195]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [196]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [197]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [198]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7774040102958679
Epoch: 2, Loss:  0.0907348021864891
Epoch: 3, Loss:  0.055995747447013855
Epoch: 4, Loss:  0.046649884432554245
Epoch: 5, Loss:  0.04857327789068222


In [199]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [200]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [201]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8045901853728709
F1 Score (Micro) = 0.8922344810004674
F1 Score (Macro) = 0.8200883561437196


In [202]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.96      0.95      0.95      4130
     amusement       0.97      0.90      0.94      2328
         anger       0.76      0.94      0.84      1567
     annoyance       0.98      0.67      0.79      2470
      approval       0.96      0.85      0.90      2939
        caring       0.94      0.89      0.92      1087
     confusion       0.96      0.82      0.88      1368
     curiosity       0.96      0.85      0.90      2191
        desire       0.93      0.80      0.86       641
disappointment       0.94      0.70      0.80      1269
   disapproval       0.94      0.88      0.91      2022
       disgust       0.95      0.77      0.85       793
 embarrassment       0.93      0.82      0.87       303
    excitement       0.90      0.78      0.84       853
          fear       0.97      0.93      0.95       596
     gratitude       0.99      0.95      0.97      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [203]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48233446741690655
F1 Score (Micro) = 0.6093695777906304
F1 Score (Macro) = 0.5159872939629104


In [204]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.7,0.73,0.71
amusement,0.85,0.84,0.84
anger,0.49,0.57,0.53
annoyance,0.56,0.31,0.4
approval,0.55,0.51,0.53
caring,0.55,0.47,0.51
confusion,0.49,0.48,0.48
curiosity,0.71,0.61,0.65
desire,0.7,0.45,0.54
disappointment,0.39,0.26,0.31


In [205]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48174230568596765
F1 Score (Micro) = 0.6042170756999655
F1 Score (Macro) = 0.5134002391308624


In [206]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.77,0.75
amusement,0.82,0.79,0.8
anger,0.47,0.58,0.52
annoyance,0.45,0.26,0.33
approval,0.52,0.41,0.46
caring,0.61,0.54,0.57
confusion,0.58,0.41,0.48
curiosity,0.71,0.61,0.66
desire,0.65,0.53,0.59
disappointment,0.46,0.3,0.36


In [207]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [208]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [209]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [210]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.23999999999999994
The model's best macro-f1 is 0.5413627705783219


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [211]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.2899999999999999
The model's best macro-f1 is 0.5392615875690336


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [212]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.64,0.8,0.71
amusement,0.77,0.9,0.83
anger,0.44,0.65,0.53
annoyance,0.42,0.41,0.42
approval,0.45,0.6,0.51
caring,0.44,0.57,0.49
confusion,0.43,0.62,0.51
curiosity,0.67,0.71,0.69
desire,0.65,0.53,0.58
disappointment,0.31,0.39,0.35


In [213]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.67,0.82,0.74
amusement,0.79,0.83,0.81
anger,0.43,0.63,0.51
annoyance,0.4,0.38,0.39
approval,0.46,0.5,0.48
caring,0.55,0.63,0.59
confusion,0.5,0.5,0.5
curiosity,0.67,0.7,0.69
desire,0.6,0.6,0.6
disappointment,0.38,0.39,0.38


In [214]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [215]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.81,0.71
amusement,0.77,0.9,0.83
anger,0.44,0.65,0.53
annoyance,0.42,0.41,0.42
approval,0.45,0.6,0.51
caring,0.44,0.57,0.49
confusion,0.43,0.62,0.51
curiosity,0.67,0.71,0.69
desire,0.65,0.53,0.58
disappointment,0.31,0.39,0.35


In [216]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.82,0.71
amusement,0.79,0.83,0.81
anger,0.43,0.63,0.51
annoyance,0.4,0.38,0.39
approval,0.46,0.5,0.48
caring,0.55,0.63,0.59
confusion,0.5,0.5,0.5
curiosity,0.67,0.7,0.69
desire,0.6,0.6,0.6
disappointment,0.38,0.39,0.38


### Saving the Model

In [217]:
PATH = "BERT_GoEmotion_no_neu_3.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [218]:
PATH = "BERT_GoEmotion_no_neu_3.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 4 - Final


In [248]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 8
VALIDATION_BATCH_SIZE = 8
TEST_BATCH_SIZE =8
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [249]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [250]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [251]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [252]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [253]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [254]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [255]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [256]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7106543779373169
Epoch: 2, Loss:  0.08805622905492783
Epoch: 3, Loss:  0.06167597696185112
Epoch: 4, Loss:  0.03909778222441673


In [257]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [258]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [259]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7873606434105993
F1 Score (Micro) = 0.8786367060361859
F1 Score (Macro) = 0.8183698233818754


In [260]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.93      0.93      0.93      4130
     amusement       0.95      0.94      0.95      2328
         anger       0.90      0.88      0.89      1567
     annoyance       0.87      0.82      0.84      2470
      approval       0.97      0.77      0.86      2939
        caring       0.94      0.85      0.89      1087
     confusion       0.96      0.72      0.82      1368
     curiosity       0.82      0.95      0.88      2191
        desire       0.92      0.81      0.86       641
disappointment       0.91      0.68      0.78      1269
   disapproval       0.94      0.85      0.89      2022
       disgust       0.93      0.73      0.82       793
 embarrassment       0.96      0.70      0.81       303
    excitement       0.92      0.71      0.80       853
          fear       0.94      0.89      0.92       596
     gratitude       0.94      0.97      0.95      2662
         grief       0.95      0.27      0.42  

  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [261]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48626014132426065
F1 Score (Micro) = 0.6128771240319039
F1 Score (Macro) = 0.5262535752660216


In [262]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.77,0.72
amusement,0.82,0.88,0.85
anger,0.57,0.48,0.52
annoyance,0.43,0.47,0.45
approval,0.57,0.44,0.5
caring,0.51,0.4,0.45
confusion,0.54,0.42,0.48
curiosity,0.63,0.8,0.71
desire,0.69,0.51,0.58
disappointment,0.44,0.27,0.34


In [263]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4874804381846635
F1 Score (Micro) = 0.613950276243094
F1 Score (Macro) = 0.5317742847250037


In [264]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.8,0.74
amusement,0.82,0.84,0.83
anger,0.63,0.53,0.58
annoyance,0.4,0.45,0.42
approval,0.57,0.39,0.46
caring,0.64,0.45,0.53
confusion,0.58,0.4,0.47
curiosity,0.6,0.73,0.66
desire,0.62,0.57,0.59
disappointment,0.48,0.28,0.35


In [265]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [266]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [267]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [268]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.21999999999999995
The model's best macro-f1 is 0.5502041865948621


In [269]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.20999999999999996
The model's best macro-f1 is 0.5518552626337221


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [270]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.59,0.84,0.7
amusement,0.78,0.93,0.85
anger,0.46,0.62,0.53
annoyance,0.33,0.62,0.43
approval,0.47,0.58,0.52
caring,0.42,0.53,0.47
confusion,0.44,0.64,0.52
curiosity,0.54,0.87,0.67
desire,0.57,0.58,0.57
disappointment,0.3,0.42,0.35


In [271]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.87,0.71
amusement,0.78,0.9,0.84
anger,0.48,0.68,0.56
annoyance,0.33,0.67,0.45
approval,0.47,0.51,0.49
caring,0.53,0.58,0.55
confusion,0.44,0.59,0.5
curiosity,0.51,0.86,0.64
desire,0.53,0.68,0.59
disappointment,0.32,0.42,0.37


In [272]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [273]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.59,0.84,0.69
amusement,0.78,0.93,0.85
anger,0.46,0.62,0.53
annoyance,0.33,0.62,0.43
approval,0.47,0.58,0.52
caring,0.42,0.53,0.47
confusion,0.44,0.64,0.52
curiosity,0.54,0.87,0.67
desire,0.57,0.58,0.57
disappointment,0.3,0.42,0.35


In [274]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.59,0.88,0.7
amusement,0.78,0.9,0.84
anger,0.48,0.68,0.56
annoyance,0.33,0.67,0.45
approval,0.47,0.51,0.49
caring,0.53,0.58,0.55
confusion,0.44,0.59,0.5
curiosity,0.51,0.86,0.64
desire,0.53,0.68,0.59
disappointment,0.32,0.42,0.37


### Saving the Model

In [275]:
PATH = "BERT_GoEmotion_no_neu_4.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [276]:
PATH = "BERT_GoEmotion_no_neu_4.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 5

In [57]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [58]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [59]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [60]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [61]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [62]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [63]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [64]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [65]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7002807259559631
Epoch: 2, Loss:  0.06643776595592499
Epoch: 3, Loss:  0.08390701562166214
Epoch: 4, Loss:  0.06236373260617256


In [66]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [67]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [68]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.775296694674208
F1 Score (Micro) = 0.8709829664907539
F1 Score (Macro) = 0.7912152313709265


In [69]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.94      0.94      0.94      4130
     amusement       0.94      0.94      0.94      2328
         anger       0.88      0.87      0.88      1567
     annoyance       0.89      0.75      0.82      2470
      approval       0.96      0.79      0.87      2939
        caring       0.91      0.83      0.87      1087
     confusion       0.97      0.63      0.76      1368
     curiosity       0.85      0.92      0.88      2191
        desire       0.94      0.78      0.85       641
disappointment       0.91      0.64      0.75      1269
   disapproval       0.94      0.83      0.88      2022
       disgust       0.90      0.74      0.81       793
 embarrassment       0.92      0.71      0.80       303
    excitement       0.91      0.72      0.80       853
          fear       0.90      0.92      0.91       596
     gratitude       0.99      0.94      0.96      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [70]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.484689871761319
F1 Score (Micro) = 0.6183597390493942
F1 Score (Macro) = 0.532892420946265


In [71]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.71,0.73,0.72
amusement,0.82,0.88,0.85
anger,0.6,0.49,0.54
annoyance,0.48,0.45,0.47
approval,0.55,0.46,0.5
caring,0.54,0.5,0.52
confusion,0.63,0.39,0.48
curiosity,0.65,0.77,0.71
desire,0.7,0.45,0.54
disappointment,0.49,0.27,0.35


In [72]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48043818466353677
F1 Score (Micro) = 0.6138682745825603
F1 Score (Macro) = 0.5232052863302036


In [73]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.77,0.75
amusement,0.8,0.86,0.83
anger,0.65,0.53,0.58
annoyance,0.42,0.42,0.42
approval,0.58,0.4,0.48
caring,0.65,0.53,0.58
confusion,0.65,0.35,0.45
curiosity,0.6,0.73,0.66
desire,0.67,0.47,0.55
disappointment,0.51,0.26,0.35


In [74]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [75]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [76]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [77]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.2699999999999999
The model's best macro-f1 is 0.5601015799545528


In [78]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.20999999999999996
The model's best macro-f1 is 0.5491762262491049


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.81,0.72
amusement,0.78,0.93,0.85
anger,0.52,0.62,0.56
annoyance,0.39,0.59,0.47
approval,0.47,0.57,0.52
caring,0.45,0.59,0.51
confusion,0.48,0.54,0.5
curiosity,0.58,0.83,0.68
desire,0.68,0.53,0.59
disappointment,0.39,0.38,0.38


In [80]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.85,0.72
amusement,0.75,0.91,0.82
anger,0.47,0.64,0.54
annoyance,0.34,0.66,0.45
approval,0.45,0.53,0.48
caring,0.5,0.64,0.56
confusion,0.46,0.51,0.49
curiosity,0.54,0.85,0.66
desire,0.55,0.61,0.58
disappointment,0.37,0.46,0.41


In [81]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [82]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.82,0.71
amusement,0.78,0.93,0.85
anger,0.52,0.62,0.56
annoyance,0.39,0.59,0.47
approval,0.47,0.57,0.52
caring,0.45,0.59,0.51
confusion,0.48,0.54,0.5
curiosity,0.58,0.83,0.68
desire,0.68,0.53,0.59
disappointment,0.39,0.38,0.38


In [83]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.85,0.7
amusement,0.75,0.91,0.82
anger,0.47,0.64,0.54
annoyance,0.34,0.66,0.45
approval,0.45,0.53,0.48
caring,0.5,0.64,0.56
confusion,0.46,0.51,0.49
curiosity,0.54,0.85,0.66
desire,0.55,0.61,0.58
disappointment,0.37,0.46,0.41


### Saving the Model

In [84]:
PATH = "BERT_GoEmotion_no_neu_3.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [85]:
PATH = "BERT_GoEmotion_no_neu_3.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 6

In [52]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 5
LEARNING_RATE = 5e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [61]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [62]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [63]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [64]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [65]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [66]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [67]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [68]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7002807259559631
Epoch: 2, Loss:  0.05974995344877243
Epoch: 3, Loss:  0.05862605944275856
Epoch: 4, Loss:  0.0643826574087143
Epoch: 5, Loss:  0.02066124975681305


In [69]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [70]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [71]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8206427567267139
F1 Score (Micro) = 0.9027188612099645
F1 Score (Macro) = 0.8644079078488945


In [72]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.96      0.95      0.96      4130
     amusement       0.97      0.90      0.94      2328
         anger       0.92      0.88      0.90      1567
     annoyance       0.96      0.79      0.87      2470
      approval       0.98      0.82      0.89      2939
        caring       0.90      0.90      0.90      1087
     confusion       0.95      0.88      0.91      1368
     curiosity       0.98      0.81      0.89      2191
        desire       0.96      0.86      0.91       641
disappointment       0.95      0.74      0.83      1269
   disapproval       0.96      0.87      0.91      2022
       disgust       0.93      0.87      0.90       793
 embarrassment       0.94      0.83      0.88       303
    excitement       0.98      0.59      0.74       853
          fear       0.97      0.90      0.93       596
     gratitude       0.98      0.96      0.97      2662
         grief       0.74      0.45      0.56  

  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [73]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.47291285003925676
F1 Score (Micro) = 0.5922556566363424
F1 Score (Macro) = 0.5409198580200681


In [74]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.67,0.71,0.69
amusement,0.84,0.82,0.83
anger,0.59,0.45,0.51
annoyance,0.48,0.41,0.44
approval,0.52,0.43,0.47
caring,0.45,0.47,0.46
confusion,0.47,0.55,0.51
curiosity,0.73,0.5,0.59
desire,0.73,0.48,0.58
disappointment,0.35,0.25,0.29


In [75]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.46609285341679707
F1 Score (Micro) = 0.5904872389791184
F1 Score (Macro) = 0.5168834020095286


In [76]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.75,0.72
amusement,0.85,0.8,0.82
anger,0.64,0.47,0.54
annoyance,0.39,0.34,0.36
approval,0.54,0.36,0.43
caring,0.53,0.5,0.52
confusion,0.45,0.44,0.45
curiosity,0.71,0.48,0.57
desire,0.64,0.55,0.59
disappointment,0.43,0.31,0.36


In [77]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [78]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [79]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [80]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.3999999999999998
The model's best macro-f1 is 0.545392421365015


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.2799999999999999
The model's best macro-f1 is 0.534896051595123


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.73,0.69
amusement,0.83,0.86,0.84
anger,0.57,0.48,0.52
annoyance,0.47,0.46,0.47
approval,0.49,0.46,0.48
caring,0.42,0.52,0.47
confusion,0.43,0.58,0.5
curiosity,0.71,0.57,0.63
desire,0.69,0.49,0.58
disappointment,0.34,0.28,0.31


In [83]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.8,0.72
amusement,0.82,0.85,0.84
anger,0.56,0.51,0.53
annoyance,0.36,0.5,0.42
approval,0.49,0.47,0.48
caring,0.47,0.61,0.53
confusion,0.4,0.55,0.46
curiosity,0.65,0.6,0.63
desire,0.59,0.58,0.59
disappointment,0.38,0.4,0.39


In [84]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [89]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_no_neu_opt, axis=1)==0)

112

In [90]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_no_neu_opt, axis=1)==0)

51

In [85]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.56,0.75,0.64
amusement,0.83,0.86,0.84
anger,0.57,0.48,0.52
annoyance,0.47,0.46,0.47
approval,0.49,0.46,0.48
caring,0.42,0.52,0.47
confusion,0.43,0.58,0.5
curiosity,0.71,0.57,0.63
desire,0.69,0.49,0.58
disappointment,0.34,0.28,0.31


In [86]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.81,0.69
amusement,0.82,0.85,0.84
anger,0.56,0.51,0.53
annoyance,0.36,0.5,0.42
approval,0.49,0.47,0.48
caring,0.47,0.61,0.53
confusion,0.4,0.55,0.46
curiosity,0.65,0.6,0.63
desire,0.59,0.58,0.59
disappointment,0.38,0.4,0.39


### Saving the Model

In [87]:
PATH = "BERT_GoEmotion_no_neu_6.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [88]:
PATH = "BERT_GoEmotion_no_neu_6.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 7 - Final

In [53]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 16
VALIDATION_BATCH_SIZE = 16
TEST_BATCH_SIZE =16
EPOCHS = 5
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [54]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [55]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [58]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [59]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [60]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [61]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7002807259559631
Epoch: 2, Loss:  0.06643776595592499
Epoch: 3, Loss:  0.08390701562166214
Epoch: 4, Loss:  0.06236373260617256
Epoch: 5, Loss:  0.020875992253422737


In [62]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [63]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [64]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.822604374407428
F1 Score (Micro) = 0.9017441446572223
F1 Score (Macro) = 0.8518932949151469


In [65]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.94      0.96      0.95      4130
     amusement       0.97      0.92      0.95      2328
         anger       0.93      0.88      0.90      1567
     annoyance       0.95      0.81      0.87      2470
      approval       0.97      0.84      0.90      2939
        caring       0.90      0.89      0.89      1087
     confusion       0.89      0.88      0.88      1368
     curiosity       0.98      0.81      0.89      2191
        desire       0.94      0.84      0.89       641
disappointment       0.96      0.76      0.85      1269
   disapproval       0.97      0.85      0.91      2022
       disgust       0.94      0.84      0.89       793
 embarrassment       0.94      0.83      0.88       303
    excitement       0.97      0.68      0.80       853
          fear       0.94      0.93      0.93       596
     gratitude       0.98      0.95      0.96      2662
         grief       0.85      0.22      0.35  

  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [66]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.47710023554043446
F1 Score (Micro) = 0.5979285464913301
F1 Score (Macro) = 0.526539494201798


In [67]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.76,0.71
amusement,0.82,0.81,0.81
anger,0.64,0.46,0.54
annoyance,0.44,0.42,0.43
approval,0.51,0.47,0.49
caring,0.46,0.56,0.51
confusion,0.46,0.62,0.53
curiosity,0.74,0.5,0.6
desire,0.67,0.49,0.57
disappointment,0.42,0.25,0.32


In [68]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.46557120500782473
F1 Score (Micro) = 0.5974654377880184
F1 Score (Macro) = 0.5195086825926943


In [69]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.68,0.8,0.73
amusement,0.84,0.81,0.82
anger,0.59,0.44,0.5
annoyance,0.43,0.42,0.42
approval,0.53,0.41,0.46
caring,0.51,0.54,0.53
confusion,0.46,0.54,0.5
curiosity,0.65,0.46,0.54
desire,0.67,0.57,0.62
disappointment,0.39,0.25,0.31


In [70]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [71]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [72]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [73]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The model's threshold is 0.2799999999999999
The model's best macro-f1 is 0.5611712760201688


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.22999999999999995
The model's best macro-f1 is 0.5480574759489554


In [75]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.62,0.83,0.71
amusement,0.78,0.89,0.83
anger,0.59,0.57,0.58
annoyance,0.39,0.52,0.45
approval,0.45,0.56,0.5
caring,0.38,0.59,0.46
confusion,0.4,0.72,0.51
curiosity,0.65,0.64,0.65
desire,0.66,0.53,0.59
disappointment,0.35,0.34,0.35


In [76]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.6,0.85,0.71
amusement,0.78,0.86,0.82
anger,0.51,0.52,0.51
annoyance,0.36,0.56,0.44
approval,0.45,0.54,0.49
caring,0.42,0.63,0.51
confusion,0.39,0.61,0.47
curiosity,0.59,0.62,0.61
desire,0.59,0.61,0.6
disappointment,0.39,0.42,0.4


In [77]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [78]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_no_neu_opt, axis=1)==0)

34

In [79]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_no_neu_opt, axis=1)==0)

25

In [80]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.59,0.83,0.69
amusement,0.78,0.89,0.83
anger,0.59,0.57,0.58
annoyance,0.39,0.52,0.45
approval,0.45,0.56,0.5
caring,0.38,0.59,0.46
confusion,0.4,0.72,0.51
curiosity,0.65,0.64,0.65
desire,0.66,0.53,0.59
disappointment,0.35,0.34,0.35


In [81]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

Unnamed: 0,Precision,Recall,F1
admiration,0.58,0.86,0.69
amusement,0.78,0.86,0.82
anger,0.51,0.52,0.51
annoyance,0.36,0.56,0.44
approval,0.45,0.54,0.49
caring,0.42,0.63,0.51
confusion,0.39,0.61,0.47
curiosity,0.59,0.62,0.61
desire,0.59,0.61,0.6
disappointment,0.39,0.42,0.4


### Saving the Model

In [82]:
PATH = "BERT_GoEmotion_no_neu_7.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [83]:
PATH = "BERT_GoEmotion_no_neu_7.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 8

In [53]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 32
TEST_BATCH_SIZE =32
EPOCHS = 4
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [54]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [55]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [56]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [58]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [59]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [60]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [61]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.6986286044120789
Epoch: 2, Loss:  0.09339626133441925
Epoch: 3, Loss:  0.08087456971406937
Epoch: 4, Loss:  0.06715968251228333


In [62]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [63]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [64]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7315199267662732
F1 Score (Micro) = 0.8364384401032928
F1 Score (Macro) = 0.7146670816128124


In [65]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.93      0.92      0.93      4130
     amusement       0.94      0.91      0.92      2328
         anger       0.76      0.89      0.82      1567
     annoyance       0.88      0.61      0.72      2470
      approval       0.91      0.81      0.85      2939
        caring       0.92      0.73      0.81      1087
     confusion       0.94      0.64      0.76      1368
     curiosity       0.86      0.86      0.86      2191
        desire       0.92      0.71      0.80       641
disappointment       0.89      0.57      0.69      1269
   disapproval       0.95      0.76      0.84      2022
       disgust       0.81      0.70      0.75       793
 embarrassment       0.88      0.63      0.73       303
    excitement       0.88      0.63      0.74       853
          fear       0.85      0.91      0.88       596
     gratitude       0.98      0.94      0.96      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [66]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.48207275582308295
F1 Score (Micro) = 0.6084250837721398
F1 Score (Macro) = 0.4911609165910111


In [67]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.71,0.73,0.72
amusement,0.86,0.85,0.85
anger,0.52,0.57,0.54
annoyance,0.46,0.35,0.4
approval,0.48,0.51,0.49
caring,0.68,0.41,0.51
confusion,0.59,0.44,0.5
curiosity,0.71,0.69,0.7
desire,0.75,0.4,0.52
disappointment,0.46,0.23,0.3


In [68]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4749608763693271
F1 Score (Micro) = 0.6031595201330324
F1 Score (Macro) = 0.49549219962876606


In [69]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.73,0.75,0.74
amusement,0.82,0.81,0.81
anger,0.56,0.58,0.57
annoyance,0.42,0.34,0.38
approval,0.54,0.47,0.5
caring,0.71,0.39,0.5
confusion,0.59,0.39,0.47
curiosity,0.65,0.63,0.64
desire,0.68,0.52,0.59
disappointment,0.51,0.25,0.34


In [70]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [71]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [72]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [73]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.2599999999999999
The model's best macro-f1 is 0.5442656335498887


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.24999999999999992
The model's best macro-f1 is 0.5425394440746226


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.67,0.84,0.74
amusement,0.8,0.93,0.86
anger,0.45,0.66,0.53
annoyance,0.39,0.56,0.46
approval,0.41,0.62,0.5
caring,0.56,0.48,0.52
confusion,0.44,0.61,0.51
curiosity,0.63,0.79,0.7
desire,0.7,0.52,0.6
disappointment,0.34,0.35,0.35


In [76]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.64,0.84,0.72
amusement,0.78,0.89,0.83
anger,0.44,0.68,0.54
annoyance,0.37,0.57,0.45
approval,0.44,0.57,0.5
caring,0.61,0.51,0.56
confusion,0.44,0.53,0.48
curiosity,0.61,0.78,0.68
desire,0.6,0.62,0.61
disappointment,0.36,0.42,0.39


In [77]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [78]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_no_neu_opt, axis=1)==0)

32

In [79]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_no_neu_opt, axis=1)==0)

30

In [80]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.63,0.84,0.72
amusement,0.8,0.93,0.86
anger,0.45,0.66,0.53
annoyance,0.39,0.56,0.46
approval,0.41,0.62,0.5
caring,0.56,0.48,0.52
confusion,0.44,0.61,0.51
curiosity,0.63,0.79,0.7
desire,0.7,0.52,0.6
disappointment,0.34,0.35,0.35


In [81]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.61,0.84,0.7
amusement,0.78,0.89,0.83
anger,0.44,0.68,0.54
annoyance,0.37,0.57,0.45
approval,0.44,0.57,0.5
caring,0.61,0.51,0.56
confusion,0.44,0.53,0.48
curiosity,0.61,0.78,0.68
desire,0.6,0.62,0.61
disappointment,0.36,0.42,0.39


### Saving the Model

In [82]:
PATH = "BERT_GoEmotion_no_neu_8.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [83]:
PATH = "BERT_GoEmotion_no_neu_8.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

###  Model 9

In [84]:
MAX_LEN = max_length
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 32
TEST_BATCH_SIZE =32
EPOCHS = 6
LEARNING_RATE = 3e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

### Dataset & Data Loaders

In [85]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe[GE_taxonomy_no_neu]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            verbose = True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [86]:
train_dataset = CustomDataset(
  df_train_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

validation_dataset = CustomDataset(
  df_val_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  df_test_GE_no_neu,
  tokenizer,
  max_len=MAX_LEN
)

In [87]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }
validation_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False
                }

train_loader = DataLoader(train_dataset, **train_params)
validation_loader = DataLoader(validation_dataset, **validation_params)
test_loader = DataLoader(test_dataset, **test_params)

In [88]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

### Model

In [89]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
#         self.l2 = torch.nn.Dropout(0.15)
        self.l3 = torch.nn.Linear(768, 27)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output)
        output = self.l3(output)
        return output

model_GE_no_neu = BERTClass()
model_GE_no_neu.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [90]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params =  model_GE_no_neu.parameters(), lr=LEARNING_RATE)

In [91]:
def train(epoch,model):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')
        
        # Update parameters and the learning rate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

!pip install GPUtil

### Start Training

In [92]:
for epoch in range(EPOCHS):
    train(epoch, model_GE_no_neu)

Epoch: 1, Loss:  0.7213438749313354
Epoch: 2, Loss:  0.08937067538499832
Epoch: 3, Loss:  0.09465490281581879
Epoch: 4, Loss:  0.06225880980491638
Epoch: 5, Loss:  0.035748519003391266
Epoch: 6, Loss:  0.020578546449542046


In [93]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [94]:
def validation(loader,model):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [95]:
# train results
#for epoch in range(EPOCHS):
y_pred_proba_train_GE_no_neu, targets_train = validation(train_loader,model_GE_no_neu) # epoch
y_pred_train_GE_no_neu = np.array(y_pred_proba_train_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_train, y_pred_train_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_train, y_pred_train_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.8287834701016772
F1 Score (Micro) = 0.9073951076127355
F1 Score (Macro) = 0.8238991968792708


In [96]:
cr = classification_report(targets_train,y_pred_train_GE_no_neu,target_names= GE_taxonomy_no_neu)
print(cr)

                precision    recall  f1-score   support

    admiration       0.98      0.93      0.95      4130
     amusement       0.98      0.92      0.95      2328
         anger       0.83      0.95      0.88      1567
     annoyance       0.90      0.82      0.86      2470
      approval       0.98      0.83      0.90      2939
        caring       0.97      0.84      0.90      1087
     confusion       0.99      0.78      0.87      1368
     curiosity       0.94      0.93      0.93      2191
        desire       0.94      0.87      0.90       641
disappointment       0.96      0.72      0.82      1269
   disapproval       0.98      0.86      0.92      2022
       disgust       0.98      0.76      0.86       793
 embarrassment       0.97      0.77      0.86       303
    excitement       0.88      0.85      0.87       853
          fear       0.96      0.92      0.94       596
     gratitude       0.98      0.96      0.97      2662
         grief       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Model evaluation
model_eval(targets_train, y_pred_train_GE_no_neu, GE_taxonomy_no_neu)

In [97]:
# test results
#for epoch in range(EPOCHS):
y_pred_proba_test_GE_no_neu, targets_test = validation(test_loader,model_GE_no_neu) # epoch
y_pred_test_GE_no_neu = np.array(y_pred_proba_test_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_test, y_pred_test_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_test, y_pred_test_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4666317717874902
F1 Score (Micro) = 0.5962307252998287
F1 Score (Macro) = 0.4961137856201312


In [98]:
# Model evaluation
model_eval(y_test_no_neu, y_pred_test_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.7,0.71
amusement,0.83,0.8,0.81
anger,0.5,0.6,0.54
annoyance,0.37,0.51,0.42
approval,0.56,0.45,0.5
caring,0.58,0.39,0.47
confusion,0.56,0.39,0.46
curiosity,0.65,0.69,0.67
desire,0.69,0.43,0.53
disappointment,0.38,0.24,0.29


In [99]:
# validation results
#for epoch in range(EPOCHS):
y_pred_proba_val_GE_no_neu, targets_val = validation(validation_loader,model_GE_no_neu) # epoch
y_pred_val_GE_no_neu = np.array(y_pred_proba_val_GE_no_neu) >= 0.5
accuracy = metrics.accuracy_score(targets_val, y_pred_val_GE_no_neu)
f1_score_micro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='micro')
f1_score_macro = metrics.f1_score(targets_val, y_pred_val_GE_no_neu, average='macro')

print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4692227438706312
F1 Score (Micro) = 0.5964793500338523
F1 Score (Macro) = 0.5067242094914671


In [100]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_val_GE_no_neu, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.72,0.73,0.73
amusement,0.85,0.8,0.82
anger,0.5,0.61,0.55
annoyance,0.33,0.47,0.39
approval,0.6,0.37,0.45
caring,0.61,0.41,0.49
confusion,0.64,0.36,0.46
curiosity,0.68,0.71,0.7
desire,0.58,0.58,0.58
disappointment,0.41,0.29,0.34


In [101]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_labels .shape[0]):
        for j in range(y_pred_labels .shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [102]:
# Generate labels for train, test and val
y_pred_train_labels_no_neu = proba_to_labels(y_pred_proba_train_GE_no_neu)

y_pred_test_labels_no_neu = proba_to_labels(y_pred_proba_test_GE_no_neu)

y_pred_val_labels_no_neu = proba_to_labels(y_pred_proba_val_GE_no_neu)

y_pred_labels = np.zeros_like(y_pred_proba)
y_pred_labels.shape[0]

In [103]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.1, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

In [104]:
##### Compute label predictions and corresponding optimal thresholds 
y_pred_labels_test_GE_no_neu_opt, threshold_test_GE_no_neu_opt, macro_f1_test_GE_no_neu_opt = proba_to_labels_opt(y_test_no_neu, y_pred_proba_test_GE_no_neu)
print("The model's threshold is {}".format(threshold_test_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_test_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.16999999999999998
The model's best macro-f1 is 0.5259821744569931


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [105]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_val_GE_no_neu_opt, threshold_val_GE_no_neu_opt, macro_f1_val_GE_no_neu_opt = proba_to_labels_opt(y_val_no_neu, y_pred_proba_val_GE_no_neu)
print("The model's threshold is {}".format(threshold_val_GE_no_neu_opt))
print("The model's best macro-f1 is {}".format(macro_f1_val_GE_no_neu_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.29999999999999993
The model's best macro-f1 is 0.5303344745924851


  _warn_prf(average, modifier, msg_start, len(result))


In [106]:
#####  Model evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.77,0.71
amusement,0.77,0.91,0.83
anger,0.42,0.71,0.52
annoyance,0.32,0.71,0.44
approval,0.47,0.56,0.51
caring,0.46,0.53,0.49
confusion,0.46,0.55,0.5
curiosity,0.59,0.79,0.68
desire,0.51,0.48,0.5
disappointment,0.29,0.41,0.34


In [107]:
# Model evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.69,0.77,0.73
amusement,0.82,0.84,0.83
anger,0.46,0.68,0.55
annoyance,0.31,0.57,0.4
approval,0.54,0.42,0.47
caring,0.57,0.48,0.52
confusion,0.53,0.43,0.48
curiosity,0.63,0.76,0.69
desire,0.53,0.64,0.58
disappointment,0.37,0.36,0.37


In [108]:
y_pred_labels_test_GE_no_neu_opt.shape, y_pred_labels_val_GE_no_neu_opt.shape

((3821, 27), (3834, 27))

In [109]:
# Number of predictions with no positive label for test
sum(np.sum(y_pred_labels_test_GE_no_neu_opt, axis=1)==0)

4

In [110]:
# Number of predictions with no positive label for val
sum(np.sum(y_pred_labels_val_GE_no_neu_opt, axis=1)==0)

32

In [111]:
# Handling empty predictions for test
y_pred_labels_test_GE_no_neu_opt_h = np.copy(y_pred_labels_test_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_test_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_test_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_test_no_neu, y_pred_labels_test_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.65,0.78,0.71
amusement,0.77,0.91,0.83
anger,0.42,0.71,0.52
annoyance,0.32,0.71,0.44
approval,0.47,0.56,0.51
caring,0.46,0.53,0.49
confusion,0.46,0.55,0.5
curiosity,0.59,0.79,0.68
desire,0.51,0.48,0.5
disappointment,0.29,0.41,0.34


In [112]:
# Handling empty predictions for val
y_pred_labels_val_GE_no_neu_opt_h = np.copy(y_pred_labels_val_GE_no_neu_opt)

# if no predictions ==> label with highest proba
for i, pred in enumerate(y_pred_labels_val_GE_no_neu_opt_h):
    if pred.sum()==0:
        pred[np.argmax(y_pred_labels_val_GE_no_neu_opt_h[i])]=1

# Evaluation
model_eval(y_val_no_neu, y_pred_labels_val_GE_no_neu_opt_h, GE_taxonomy_no_neu)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
admiration,0.66,0.78,0.71
amusement,0.82,0.84,0.83
anger,0.46,0.68,0.55
annoyance,0.31,0.57,0.4
approval,0.54,0.42,0.47
caring,0.57,0.48,0.52
confusion,0.53,0.43,0.48
curiosity,0.63,0.76,0.69
desire,0.53,0.64,0.58
disappointment,0.37,0.36,0.37


### Saving the Model

In [113]:
PATH = "BERT_GoEmotion_no_neu_9.pt"
torch.save(model_GE_no_neu.state_dict(), PATH)


#### Loading the model

In [114]:
PATH = "BERT_GoEmotion_no_neu_9.pt"
model_GE_no_neu_2 = BERTClass()
model_GE_no_neu_2.load_state_dict(torch.load(PATH))
model_GE_no_neu_2.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

### 2.4.4 - Indirect evaluation on Ekman taxonomy by mapping predictions

Until now, we have only evaluated our model on the GoEmotions taxonomy.

As a reference, we can try to map the true and predicted emotions to the Ekman taxonomy and see how our model performs.

We have already defined the Ekman taxonomy earlier.

Let's define a function that transforms labels from GoEmotions to Ekman taxonomy.

In [26]:
# Function thats maps predictions on GoEmotions taxonomy to Ekman taxonomy
def GE_to_Ekman(GE_labels):
    
    # Create a dataframe of GoEmotions labels
    df_GE = pd.DataFrame(GE_labels, columns=GE_taxonomy)

    # Create an empty dataframe of Ekman labels
    df_Ekman  = pd.DataFrame(np.zeros((len(GE_labels), len(Ekman_taxonomy))), columns=Ekman_taxonomy)

    for i in range(len(df_GE)):

        if df_GE.loc[i,['anger', 'annoyance', 'disapproval']].sum() >= 1:
            df_Ekman.loc[i,'anger'] = 1

        if df_GE.loc[i,'disgust'].sum() >= 1:
            df_Ekman.loc[i,'disgust'] = 1

        if df_GE.loc[i,['fear', 'nervousness']].sum() >= 1:
            df_Ekman.loc[i,'fear'] = 1

        if df_GE.loc[i,['joy', 'amusement', 'approval', 'excitement', 'gratitude',
                        'love', 'optimism', 'relief', 'pride', 'admiration', 'desire','caring']].sum() >= 1:
            df_Ekman.loc[i,'joy'] = 1 

        if df_GE.loc[i,'neutral'].sum() >= 1:
            df_Ekman.loc[i,'neutral'] = 1

        if df_GE.loc[i,['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse']].sum() >= 1:
            df_Ekman.loc[i,'sadness'] = 1

        if df_GE.loc[i,['surprise', 'realization', 'confusion', 'curiosity']].sum() >= 1:
            df_Ekman.loc[i,'surprise'] = 1

    return df_Ekman.values

We can now apply our function and evaluate the predictions

In [27]:
# Mapping GoEmotion labels to Ekman labels (true and predictions)
y_test_Ekman = GE_to_Ekman(y_test)
y_pred_labels_Ekman = GE_to_Ekman(y_pred_labels_opt_n)

# Evaluation
model_eval(y_test_Ekman, y_pred_labels_Ekman, Ekman_taxonomy)

Unnamed: 0,Precision,Recall,F1
anger,0.6,0.41,0.49
disgust,0.3,0.74,0.43
fear,0.47,0.79,0.59
joy,0.78,0.82,0.8
sadness,0.4,0.6,0.48
surprise,0.47,0.7,0.56
neutral,0.68,0.47,0.56
MACRO-AVERAGE,0.53,0.65,0.56


Our model obtained a **reasonable score** on the Ekman taxonomy. However, we expected more when switching from 28 emotions to only 7 emotions.
