In [1]:
import os
import matplotlib.pyplot as plt
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
from bs4 import BeautifulSoup
import tqdm
import ktrain
from ktrain import text
import tensorflow as tf
import timeit
import transformers
import contractions
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW, TFAutoModelForSequenceClassification
import gc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [2]:
df_train = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)
df_val = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/dev.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)
df_test = pd.read_csv('https://github.com/google-research/google-research/raw/master/goemotions/data/test.tsv', sep='\t', header=None, names=['Text', 'Class', 'ID']).drop('ID', axis=1)


In [3]:
df_train

Unnamed: 0,Text,Class
0,My favourite food is anything I didn't have to...,27
1,"Now if he does off himself, everyone will thin...",27
2,WHY THE FUCK IS BAYLESS ISOING,2
3,To make her feel threatened,14
4,Dirty Southern Wankers,3
...,...,...
43405,Added you mate well I’ve just got the bow and ...,18
43406,Always thought that was funny but is it a refe...,6
43407,What are you talking about? Anything bad that ...,3
43408,"More like a baptism, with sexy results!",13


In [4]:
df_train['List of classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(lambda x: len(x))

df_val['List of classes'] = df_val['Class'].apply(lambda x: x.split(','))
df_val['Len of classes'] = df_val['List of classes'].apply(lambda x: len(x))

df_test['List of classes'] = df_test['Class'].apply(lambda x: x.split(','))
df_test['Len of classes'] = df_test['List of classes'].apply(lambda x: len(x))

In [5]:
df_train

Unnamed: 0,Text,Class,List of classes,Len of classes
0,My favourite food is anything I didn't have to...,27,[27],1
1,"Now if he does off himself, everyone will thin...",27,[27],1
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1
3,To make her feel threatened,14,[14],1
4,Dirty Southern Wankers,3,[3],1
...,...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,[18],1
43406,Always thought that was funny but is it a refe...,6,[6],1
43407,What are you talking about? Anything bad that ...,3,[3],1
43408,"More like a baptism, with sexy results!",13,[13],1


In [6]:
df_train.isnull().sum()

Text               0
Class              0
List of classes    0
Len of classes     0
dtype: int64

In [7]:
df_train["Class"].value_counts()

27           12823
0             2710
4             1873
15            1857
1             1652
             ...  
6,15,22          1
9,10,19          1
7,10,25          1
7,9,24,25        1
0,1,18           1
Name: Class, Length: 711, dtype: int64

In [8]:
with open('ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [9]:
emotion_file = open("emotions.txt", "r")
emotion_list = emotion_file.read()
emotion_list = emotion_list.split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [10]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [11]:
df_train['Emotions'] = df_train['List of classes'].apply(idx2class)
df_val['Emotions'] = df_val['List of classes'].apply(idx2class)
df_test['Emotions'] = df_test['List of classes'].apply(idx2class)

In [12]:
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity']}

In [13]:
def EmotionMapping(emotion_list):
    map_list = []
    
    for i in emotion_list:
        if i in ekman_mapping['anger']:
            map_list.append('anger')
        if i in ekman_mapping['disgust']:
            map_list.append('disgust')
        if i in ekman_mapping['fear']:
            map_list.append('fear')
        if i in ekman_mapping['joy']:
            map_list.append('joy')
        if i in ekman_mapping['sadness']:
            map_list.append('sadness')
        if i in ekman_mapping['surprise']:
            map_list.append('surprise')
        if i == 'neutral':
            map_list.append('neutral')
            
    return map_list

In [14]:
df_train['Mapped Emotions'] = df_train['Emotions'].apply(EmotionMapping)
df_val['Mapped Emotions'] = df_val['Emotions'].apply(EmotionMapping)
df_test['Mapped Emotions'] = df_test['Emotions'].apply(EmotionMapping)

In [15]:
df_train

Unnamed: 0,Text,Class,List of classes,Len of classes,Emotions,Mapped Emotions
0,My favourite food is anything I didn't have to...,27,[27],1,[neutral],[neutral]
1,"Now if he does off himself, everyone will thin...",27,[27],1,[neutral],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1,[anger],[anger]
3,To make her feel threatened,14,[14],1,[fear],[fear]
4,Dirty Southern Wankers,3,[3],1,[annoyance],[anger]
...,...,...,...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,[18],1,[love],[joy]
43406,Always thought that was funny but is it a refe...,6,[6],1,[confusion],[surprise]
43407,What are you talking about? Anything bad that ...,3,[3],1,[annoyance],[anger]
43408,"More like a baptism, with sexy results!",13,[13],1,[excitement],[joy]


In [16]:
# Building a preprocessing function to clean text
def preprocess_corpus(x):

  # Adding a space between words and punctation
  x = re.sub( r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
  x = re.sub( r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)
  
  # Demojize
  x = emoji.demojize(x)
  
  # Expand contraction
  x = contractions.fix(x)
  
  # Lower
  x = x.lower()

  #correct some acronyms/typos/abbreviations  
  x = re.sub(r"lmao", "laughing my ass off", x)  
  x = re.sub(r"amirite", "am i right", x)
  x = re.sub(r"\b(tho)\b", "though", x)
  x = re.sub(r"\b(ikr)\b", "i know right", x)
  x = re.sub(r"\b(ya|u)\b", "you", x)
  x = re.sub(r"\b(eu)\b", "europe", x)
  x = re.sub(r"\b(da)\b", "the", x)
  x = re.sub(r"\b(dat)\b", "that", x)
  x = re.sub(r"\b(dats)\b", "that is", x)
  x = re.sub(r"\b(cuz)\b", "because", x)
  x = re.sub(r"\b(fkn)\b", "fucking", x)
  x = re.sub(r"\b(tbh)\b", "to be honest", x)
  x = re.sub(r"\b(tbf)\b", "to be fair", x)
  x = re.sub(r"faux pas", "mistake", x)
  x = re.sub(r"\b(btw)\b", "by the way", x)
  x = re.sub(r"\b(bs)\b", "bullshit", x)
  x = re.sub(r"\b(kinda)\b", "kind of", x)
  x = re.sub(r"\b(bruh)\b", "bro", x)
  x = re.sub(r"\b(w/e)\b", "whatever", x)
  x = re.sub(r"\b(w/)\b", "with", x)
  x = re.sub(r"\b(w/o)\b", "without", x)
  x = re.sub(r"\b(doj)\b", "department of justice", x)
  
  #replace some words with multiple occurences of a letter, example "coooool" turns into --> cool
  x = re.sub(r"\b(j+e{2,}z+e*)\b", "jeez", x)
  x = re.sub(r"\b(co+l+)\b", "cool", x)
  x = re.sub(r"\b(g+o+a+l+)\b", "goal", x)
  x = re.sub(r"\b(s+h+i+t+)\b", "shit", x)
  x = re.sub(r"\b(o+m+g+)\b", "omg", x)
  x = re.sub(r"\b(w+t+f+)\b", "wtf", x)
  x = re.sub(r"\b(w+h+a+t+)\b", "what", x)
  x = re.sub(r"\b(y+e+y+|y+a+y+|y+e+a+h+)\b", "yeah", x)
  x = re.sub(r"\b(w+o+w+)\b", "wow", x)
  x = re.sub(r"\b(w+h+y+)\b", "why", x)
  x = re.sub(r"\b(s+o+)\b", "so", x)
  x = re.sub(r"\b(f)\b", "fuck", x)
  x = re.sub(r"\b(w+h+o+p+s+)\b", "whoops", x)
  x = re.sub(r"\b(ofc)\b", "of course", x)
  x = re.sub(r"\b(the us)\b", "usa", x)
  x = re.sub(r"\b(gf)\b", "girlfriend", x)
  x = re.sub(r"\b(hr)\b", "human ressources", x)
  x = re.sub(r"\b(mh)\b", "mental health", x)
  x = re.sub(r"\b(idk)\b", "i do not know", x)
  x = re.sub(r"\b(gotcha)\b", "i got you", x)
  x = re.sub(r"\b(y+e+p+)\b", "yes", x)
  x = re.sub(r"\b(a*ha+h[ha]*|a*ha +h[ha]*)\b", "haha", x)
  x = re.sub(r"\b(o?l+o+l+[ol]*)\b", "lol", x)
  x = re.sub(r"\b(o*ho+h[ho]*|o*ho +h[ho]*)\b", "ohoh", x)
  x = re.sub(r"\b(o+h+)\b", "oh", x)
  x = re.sub(r"\b(a+h+)\b", "ah", x)
  x = re.sub(r"\b(u+h+)\b", "uh", x)

  # Handling emojis
  x = re.sub(r"<3", " love ", x)
  x = re.sub(r"xd", " smiling_face_with_open_mouth_and_tightly_closed_eyes ", x)
  x = re.sub(r":\)", " smiling_face ", x)
  x = re.sub(r"^_^", " smiling_face ", x)
  x = re.sub(r"\*_\*", " star_struck ", x)
  x = re.sub(r":\(", " frowning_face ", x)
  x = re.sub(r":\^\(", " frowning_face ", x)
  x = re.sub(r";\(", " frowning_face ", x)
  x = re.sub(r":\/",  " confused_face", x)
  x = re.sub(r";\)",  " wink", x)
  x = re.sub(r">__<",  " unamused ", x)
  x = re.sub(r"\b([xo]+x*)\b", " xoxo ", x)
  x = re.sub(r"\b(n+a+h+)\b", "no", x)

  # Handling special cases of text
  x = re.sub(r"h a m b e r d e r s", "hamberders", x)
  x = re.sub(r"b e n", "ben", x)
  x = re.sub(r"s a t i r e", "satire", x)
  x = re.sub(r"y i k e s", "yikes", x)
  x = re.sub(r"s p o i l e r", "spoiler", x)
  x = re.sub(r"thankyou", "thank you", x)
  x = re.sub(r"a^r^o^o^o^o^o^o^o^n^d", "around", x)

  # Remove special characters and numbers replace by space + remove double space
  x = re.sub(r"\b([.]{3,})"," dots ", x)
  x = re.sub(r"[^A-Za-z!?_]+"," ", x)
  x = re.sub(r"\b([s])\b *","", x)
  x = re.sub(r" +"," ", x)
  x = x.strip()

  return x

In [17]:
# Defining the number of samples in train, validation and test dataset
size_train = df_train.shape[0]
size_val = df_val.shape[0]
size_test = df_test.shape[0]

# Defining the total number of samples
size_all = size_train + size_val + size_test

In [18]:
size_train

43410

In [19]:
# Shape of train, validation and test datasets
print("Train dataset has {} samples and represents {:.2f}% of overall data".format(size_train, size_train/size_all*100))
print("Validation dataset has {} samples and represents {:.2f}% of overall data".format(size_val, size_val/size_all*100))
print("Test dataset has {} samples and represents {:.2f}% of overall data".format(size_test, size_test/size_all*100))
print()
print("The total number of samples is : {}".format(size_all))

Train dataset has 43410 samples and represents 80.00% of overall data
Validation dataset has 5426 samples and represents 10.00% of overall data
Test dataset has 5427 samples and represents 10.00% of overall data

The total number of samples is : 54263


In [20]:
# Concatenating the 3 datasets for labels preprocessing
df_all = pd.concat([df_train, df_val, df_test], axis=0).reset_index(drop=True)

# Preview of data
display(df_all.head(5))

print(df_all.shape)

Unnamed: 0,Text,Class,List of classes,Len of classes,Emotions,Mapped Emotions
0,My favourite food is anything I didn't have to...,27,[27],1,[neutral],[neutral]
1,"Now if he does off himself, everyone will thin...",27,[27],1,[neutral],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1,[anger],[anger]
3,To make her feel threatened,14,[14],1,[fear],[fear]
4,Dirty Southern Wankers,3,[3],1,[annoyance],[anger]


(54263, 6)


In [21]:
# Applying the preprocessing function on the dataset
df_all["Clean_text"] = df_all["Text"].apply(preprocess_corpus)

# Preview of data
display(df_all[['Text', 'Clean_text']].sample(5))

Unnamed: 0,Text,Clean_text
28254,"I just want to note that ""Live Photo"" is somet...",i just want to note that live photo is somethi...
32749,"Okay, carry on then. Just thought I’d make sur...",okay carry on then just thought i would make s...
34981,Thank you for an informative answer,thank you for an informative answer
18338,should have expected that ending ... but nice ...,should have expected that ending but nice new ...
37447,That is a very handsome dog.,that is a very handsome dog


In [22]:
df_all

Unnamed: 0,Text,Class,List of classes,Len of classes,Emotions,Mapped Emotions,Clean_text
0,My favourite food is anything I didn't have to...,27,[27],1,[neutral],[neutral],my favourite food is anything i did not have t...
1,"Now if he does off himself, everyone will thin...",27,[27],1,[neutral],[neutral],now if he does off himself everyone will think...
2,WHY THE FUCK IS BAYLESS ISOING,2,[2],1,[anger],[anger],why the fuck is bayless isoing
3,To make her feel threatened,14,[14],1,[fear],[fear],to make her feel threatened
4,Dirty Southern Wankers,3,[3],1,[annoyance],[anger],dirty southern wankers
...,...,...,...,...,...,...,...
54258,Thanks. I was diagnosed with BP 1 after the ho...,15,[15],1,[gratitude],[joy],thanks i was diagnosed with bp after the hospi...
54259,Well that makes sense.,4,[4],1,[approval],[joy],well that makes sense
54260,Daddy issues [NAME],27,[27],1,[neutral],[neutral],daddy issues name
54261,So glad I discovered that subreddit a couple m...,0,[0],1,[admiration],[joy],so glad i discovered that subreddit a couple m...


In [23]:
# Keeping only necessary columns
df_all = df_all.drop(['Class','List of classes','Len of classes','Emotions'], axis=1)
df_all.head(3)

Unnamed: 0,Text,Mapped Emotions,Clean_text
0,My favourite food is anything I didn't have to...,[neutral],my favourite food is anything i did not have t...
1,"Now if he does off himself, everyone will thin...",[neutral],now if he does off himself everyone will think...
2,WHY THE FUCK IS BAYLESS ISOING,[anger],why the fuck is bayless isoing


In [24]:
# Dropping raw text column
df_all = df_all[ ['Clean_text','Mapped Emotions'] ]
df_all

Unnamed: 0,Clean_text,Mapped Emotions
0,my favourite food is anything i did not have t...,[neutral]
1,now if he does off himself everyone will think...,[neutral]
2,why the fuck is bayless isoing,[anger]
3,to make her feel threatened,[fear]
4,dirty southern wankers,[anger]
...,...,...
54258,thanks i was diagnosed with bp after the hospi...,[joy]
54259,well that makes sense,[joy]
54260,daddy issues name,[neutral]
54261,so glad i discovered that subreddit a couple m...,[joy]


In [25]:
emotion_dict={
"anger": 0,
"disgust": 1,
"fear": 2,
"joy": 3,
"sadness": 4,
"surprise": 5,
"neutral":6
}

In [26]:
# Defining a function that maps each emotion lables to index
def class2idx(emotion_lst):
    for e in emotion_lst:
        ind = emotion_dict[e]
    return ind

# Applying the function
df_all['Mapped_id'] = df_all['Mapped Emotions'].apply(class2idx)

# Preview of data
display(df_all.head(3))

Unnamed: 0,Clean_text,Mapped Emotions,Mapped_id
0,my favourite food is anything i did not have t...,[neutral],6
1,now if he does off himself everyone will think...,[neutral],6
2,why the fuck is bayless isoing,[anger],0


In [27]:
# Dropping Mapped Emotions column
df_all = df_all.drop(['Mapped Emotions'], axis=1)
df_all.head(3)

Unnamed: 0,Clean_text,Mapped_id
0,my favourite food is anything i did not have t...,6
1,now if he does off himself everyone will think...,6
2,why the fuck is bayless isoing,0


In [28]:
# Building a function that will divide in train, validation and test sets
def get_train_val_test(df):
    train = df.iloc[:size_train, :]
    val = df.iloc[size_train:size_train+size_val, :]
    test = df.iloc[size_train+size_val:size_train+size_val+size_test, :]
    return train, val, test

In [29]:
# Dividing back in train, validation and test datasets (GoEmotions)
train_GE, val_GE, test_GE = get_train_val_test(df_all)
print(train_GE.shape)
print(val_GE.shape)
print(test_GE.shape)

(43410, 2)
(5426, 2)
(5427, 2)


In [30]:
train_GE.head(3)

Unnamed: 0,Clean_text,Mapped_id
0,my favourite food is anything i did not have t...,6
1,now if he does off himself everyone will think...,6
2,why the fuck is bayless isoing,0


In [31]:
train_GE_no_neu = train_GE.copy()
val_GE_no_neu = val_GE.copy()
test_GE_no_neu = test_GE.copy()

In [32]:
train_GE_no_neu = train_GE_no_neu[train_GE_no_neu['Mapped_id']!=6]
val_GE_no_neu = val_GE_no_neu[val_GE_no_neu['Mapped_id']!=6]
test_GE_no_neu = test_GE_no_neu[test_GE_no_neu['Mapped_id']!=6]

In [33]:
train_GE_no_neu.reset_index(inplace = True)
val_GE_no_neu.reset_index(inplace = True)
test_GE_no_neu.reset_index(inplace = True)

In [34]:
train_GE_no_neu.shape, val_GE_no_neu.shape, test_GE_no_neu.shape

((29191, 3), (3660, 3), (3640, 3))

In [35]:
class_label_names_no_neu = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

In [36]:
class_label_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']

In [37]:
X_train = train_GE[:]["Clean_text"]
y_train = train_GE[:]["Mapped_id"]
X_train_no_neu = train_GE_no_neu[:]["Clean_text"]
y_train_no_neu = train_GE_no_neu[:]['Mapped_id']
X_val = val_GE[:]["Clean_text"]
y_val = val_GE[:]["Mapped_id"]
X_val_no_neu = val_GE_no_neu[:]["Clean_text"]
y_val_no_neu = val_GE_no_neu[:]['Mapped_id']
X_test = test_GE[:]["Clean_text"]
y_test = test_GE[:]["Mapped_id"]
X_test_no_neu = test_GE_no_neu[:]["Clean_text"]
y_test_no_neu = test_GE_no_neu[:]['Mapped_id']
print(X_train.shape, y_train.shape,y_train_no_neu.shape, X_val.shape, y_val.shape,y_val_no_neu.shape, X_test.shape, y_test.shape, y_test_no_neu.shape)

(43410,) (43410,) (29191,) (5426,) (5426,) (3660,) (5427,) (5427,) (3640,)


## Modelling ELECTRA with EKMAN taxonomy including neutral emotion

### Model 1

In [58]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [59]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [60]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 10
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11039.62it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11822.68it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 12040.52it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [61]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [62]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 10
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [74]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8609
[[ 3618    39     7   204   102   110   437]
 [  140   454    11    25    21    13    30]
 [    5    15   541    22    17    16    26]
 [  107    17    18 14639    99   187   626]
 [  119    29    21   107  2456    62   144]
 [   72     8    18   336    54  3932   287]
 [  457    17    36   957   188   832 11732]]


In [67]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.80      0.80      0.80      4517
     disgust       0.78      0.65      0.71       694
        fear       0.83      0.84      0.84       642
         joy       0.90      0.93      0.92     15693
     sadness       0.84      0.84      0.84      2938
    surprise       0.76      0.84      0.80      4707
     neutral       0.88      0.83      0.85     14219

    accuracy                           0.86     43410
   macro avg       0.83      0.82      0.82     43410
weighted avg       0.86      0.86      0.86     43410



In [73]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6473
[[ 263   11    4   71   33   38  175]
 [  28   39    3    9   10    8   15]
 [   1    5   53    7    7    8    6]
 [  31    2    5 1564   36   68  209]
 [  39    2    5   35  189   18   53]
 [  23    4    8   99   12  338  106]
 [ 129   10   14  307   56  204 1067]]


In [70]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.51      0.44      0.47       595
     disgust       0.53      0.35      0.42       112
        fear       0.58      0.61      0.59        87
         joy       0.75      0.82      0.78      1915
     sadness       0.55      0.55      0.55       341
    surprise       0.50      0.57      0.53       590
     neutral       0.65      0.60      0.62      1787

    accuracy                           0.65      5427
   macro avg       0.58      0.56      0.57      5427
weighted avg       0.64      0.65      0.64      5427



In [72]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 277   16    2   59   25   35  168]
 [  22   28    3   11    3    5    9]
 [   7    3   47   12    5    4   11]
 [  34    1    4 1642   33   61  222]
 [  26    8    4   47  197   16   54]
 [  33    2    5   91   25  295  108]
 [ 162   12   12  323   59  175 1023]]
              precision    recall  f1-score   support

       anger       0.49      0.48      0.48       582
     disgust       0.40      0.35      0.37        81
        fear       0.61      0.53      0.57        89
         joy       0.75      0.82      0.79      1997
     sadness       0.57      0.56      0.56       352
    surprise       0.50      0.53      0.51       559
     neutral       0.64      0.58      0.61      1766

    accuracy                           0.65      5426
   macro avg       0.57      0.55      0.56      5426
weighted avg       0.64      0.65      0.64      5426



### Model 3

In [51]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [52]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [53]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
MAX_LENGTH = 56

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11806.73it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11525.33it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 10699.85it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [54]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [55]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 2e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.7679
[[ 2917    60    14   320   174   149   883]
 [  256   268    33    38    46    17    36]
 [   13    61   462    20    31    24    31]
 [  166    12    26 13889   151   305  1144]
 [  279    27    28   121  2180    86   217]
 [  139     4    27   489    66  3120   862]
 [  774    26    55  1659   247   960 10498]]


In [57]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.64      0.65      0.64      4517
     disgust       0.59      0.39      0.47       694
        fear       0.72      0.72      0.72       642
         joy       0.84      0.89      0.86     15693
     sadness       0.75      0.74      0.75      2938
    surprise       0.67      0.66      0.67      4707
     neutral       0.77      0.74      0.75     14219

    accuracy                           0.77     43410
   macro avg       0.71      0.68      0.69     43410
weighted avg       0.77      0.77      0.77     43410



In [58]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6716
[[ 273    8    3   60   25   26  200]
 [  36   38    6   10    9    3   10]
 [   4    3   58    6    3    5    8]
 [  25    4    7 1580   30   55  214]
 [  40    3    5   26  197   25   45]
 [  30    2    7   77   11  316  147]
 [ 126    9    8  275   44  142 1183]]


In [59]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.51      0.46      0.48       595
     disgust       0.57      0.34      0.42       112
        fear       0.62      0.67      0.64        87
         joy       0.78      0.83      0.80      1915
     sadness       0.62      0.58      0.60       341
    surprise       0.55      0.54      0.54       590
     neutral       0.65      0.66      0.66      1787

    accuracy                           0.67      5427
   macro avg       0.61      0.58      0.59      5427
weighted avg       0.67      0.67      0.67      5427



In [60]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 286    9    0   56   26   20  185]
 [  29   27    1   11    1    3    9]
 [   2    8   48    7    4    5   15]
 [  35    0    3 1667   33   51  208]
 [  24    6    5   32  203   23   59]
 [  29    0    4   81   18  285  142]
 [ 152    0   10  294   41  134 1135]]
              precision    recall  f1-score   support

       anger       0.51      0.49      0.50       582
     disgust       0.54      0.33      0.41        81
        fear       0.68      0.54      0.60        89
         joy       0.78      0.83      0.80      1997
     sadness       0.62      0.58      0.60       352
    surprise       0.55      0.51      0.53       559
     neutral       0.65      0.64      0.65      1766

    accuracy                           0.67      5426
   macro avg       0.62      0.56      0.58      5426
weighted avg       0.67      0.67      0.67      5426



In [61]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-2')

In [63]:
electra_model2 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-2')

### Model 4

In [64]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [65]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [66]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
MAX_LENGTH = 56

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11388.51it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11390.17it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11672.18it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [67]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [68]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 10
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 1e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.7764
[[ 3091    38    41   278   150   195   724]
 [  319   208    45    27    52    14    29]
 [   18    14   500    17    43    26    24]
 [  136     9    33 14009   178   358   970]
 [  231    16    54   128  2205   127   177]
 [   99     3    37   492    83  3482   511]
 [  798    10    61  1563   254  1323 10210]]


In [70]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.66      0.68      0.67      4517
     disgust       0.70      0.30      0.42       694
        fear       0.65      0.78      0.71       642
         joy       0.85      0.89      0.87     15693
     sadness       0.74      0.75      0.75      2938
    surprise       0.63      0.74      0.68      4707
     neutral       0.81      0.72      0.76     14219

    accuracy                           0.78     43410
   macro avg       0.72      0.69      0.69     43410
weighted avg       0.78      0.78      0.77     43410



In [71]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6576
[[ 278    6    4   65   32   37  173]
 [  48   26    6    8    9    5   10]
 [   3    3   60    5    5    5    6]
 [  30    1    9 1583   34   63  195]
 [  40    4    4   28  207   23   35]
 [  28    2    8   81   11  360  100]
 [ 147    7   13  312   57  196 1055]]


In [72]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.48      0.47      0.48       595
     disgust       0.53      0.23      0.32       112
        fear       0.58      0.69      0.63        87
         joy       0.76      0.83      0.79      1915
     sadness       0.58      0.61      0.59       341
    surprise       0.52      0.61      0.56       590
     neutral       0.67      0.59      0.63      1787

    accuracy                           0.66      5427
   macro avg       0.59      0.57      0.57      5427
weighted avg       0.66      0.66      0.65      5427



In [73]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 301    5    3   57   20   30  166]
 [  33   18    6    9    3    3    9]
 [   4    3   55    6    5    6   10]
 [  28    3    3 1656   34   61  212]
 [  27    4    5   32  213   15   56]
 [  35    0    7   83   20  314  100]
 [ 154    0   13  296   51  183 1069]]
              precision    recall  f1-score   support

       anger       0.52      0.52      0.52       582
     disgust       0.55      0.22      0.32        81
        fear       0.60      0.62      0.61        89
         joy       0.77      0.83      0.80      1997
     sadness       0.62      0.61      0.61       352
    surprise       0.51      0.56      0.54       559
     neutral       0.66      0.61      0.63      1766

    accuracy                           0.67      5426
   macro avg       0.60      0.57      0.57      5426
weighted avg       0.67      0.67      0.67      5426



In [74]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-3')

In [75]:
electra_model3 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-3')

### Model 2

In [38]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [39]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [40]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11617.37it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11900.02it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11785.04it/s]

Data Shapes : 
(5427, 56) (5427, 56) (5427,)





In [41]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [42]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 5e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [43]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8136
[[ 3548    76    19   252    93   113   416]
 [  183   379    38    21    24    13    36]
 [   13    17   516    17    21    27    31]
 [  206    19    25 14583    56   179   625]
 [  240    26    33   274  2128    82   155]
 [  161     3    15   491    49  3523   465]
 [  956    31    53  1502   174   860 10643]]


In [44]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.67      0.79      0.72      4517
     disgust       0.69      0.55      0.61       694
        fear       0.74      0.80      0.77       642
         joy       0.85      0.93      0.89     15693
     sadness       0.84      0.72      0.78      2938
    surprise       0.73      0.75      0.74      4707
     neutral       0.86      0.75      0.80     14219

    accuracy                           0.81     43410
   macro avg       0.77      0.76      0.76     43410
weighted avg       0.82      0.81      0.81     43410



In [45]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6457
[[ 310   11    5   73   19   32  145]
 [  39   36    3   10    7    4   13]
 [   5    3   56    5    4    9    5]
 [  49    6    7 1587   16   50  200]
 [  44    4    5   56  163   26   43]
 [  45    4    7   94    8  302  130]
 [ 179   10   12  342   36  158 1050]]


In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.46      0.52      0.49       595
     disgust       0.49      0.32      0.39       112
        fear       0.59      0.64      0.62        87
         joy       0.73      0.83      0.78      1915
     sadness       0.64      0.48      0.55       341
    surprise       0.52      0.51      0.52       590
     neutral       0.66      0.59      0.62      1787

    accuracy                           0.65      5427
   macro avg       0.59      0.56      0.57      5427
weighted avg       0.64      0.65      0.64      5427



In [47]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 336   18    5   52    9   26  136]
 [  24   29    3   11    2    4    8]
 [   3    1   57    6    3    7   12]
 [  53    2    5 1685   21   41  190]
 [  28    4    4   59  175   16   66]
 [  42    0    4   92   19  275  127]
 [ 212    8   11  356   42  155  982]]
              precision    recall  f1-score   support

       anger       0.48      0.58      0.53       582
     disgust       0.47      0.36      0.41        81
        fear       0.64      0.64      0.64        89
         joy       0.75      0.84      0.79      1997
     sadness       0.65      0.50      0.56       352
    surprise       0.52      0.49      0.51       559
     neutral       0.65      0.56      0.60      1766

    accuracy                           0.65      5426
   macro avg       0.59      0.57      0.58      5426
weighted avg       0.65      0.65      0.65      5426



In [48]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-1')

In [49]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-1')

### Model 3

In [38]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [39]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [40]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11359.93it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11481.10it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11053.20it/s]

Data Shapes : 
(5427, 56) (5427, 56) (5427,)





In [41]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [42]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 4
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [43]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.7486
[[ 2873   104    45   400   290   249   556]
 [  189   309    70    27    56    17    26]
 [   12    18   506    20    40    20    26]
 [  165    14    33 14197   188   359   737]
 [  224    58    50   188  2182   100   136]
 [  108     6    41   562    82  3524   384]
 [  932    37    91  2221   426  1608  8904]]


In [44]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.64      0.64      0.64      4517
     disgust       0.57      0.45      0.50       694
        fear       0.61      0.79      0.68       642
         joy       0.81      0.90      0.85     15693
     sadness       0.67      0.74      0.70      2938
    surprise       0.60      0.75      0.67      4707
     neutral       0.83      0.63      0.71     14219

    accuracy                           0.75     43410
   macro avg       0.67      0.70      0.68     43410
weighted avg       0.76      0.75      0.75     43410



In [45]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6517
[[ 285   11    6   66   36   48  143]
 [  27   39   11    8   15    7    5]
 [   2    3   61    5    5    7    4]
 [  34    1    9 1613   32   61  165]
 [  37    4    5   26  210   26   33]
 [  30    3   10   87   10  372   78]
 [ 143   15   12  346   90  224  957]]


In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.51      0.48      0.49       595
     disgust       0.51      0.35      0.41       112
        fear       0.54      0.70      0.61        87
         joy       0.75      0.84      0.79      1915
     sadness       0.53      0.62      0.57       341
    surprise       0.50      0.63      0.56       590
     neutral       0.69      0.54      0.60      1787

    accuracy                           0.65      5427
   macro avg       0.58      0.59      0.58      5427
weighted avg       0.65      0.65      0.65      5427



In [47]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 289   15    6   68   38   34  132]
 [  22   32    4    9    2    5    7]
 [   1    2   58    7    4    5   12]
 [  37    1    4 1712   34   54  155]
 [  22    9    7   42  222   10   40]
 [  28    2    6   85   21  351   66]
 [ 175    6   13  354   79  210  929]]
              precision    recall  f1-score   support

       anger       0.50      0.50      0.50       582
     disgust       0.48      0.40      0.43        81
        fear       0.59      0.65      0.62        89
         joy       0.75      0.86      0.80      1997
     sadness       0.56      0.63      0.59       352
    surprise       0.52      0.63      0.57       559
     neutral       0.69      0.53      0.60      1766

    accuracy                           0.66      5426
   macro avg       0.59      0.60      0.59      5426
weighted avg       0.66      0.66      0.66      5426



In [50]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-2.1')

In [51]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-2.1')

### Model 4

In [52]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [53]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [54]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11788.21it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11724.38it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11684.60it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [55]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [56]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 5e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8092
[[ 3157   268    20   265    98   121   588]
 [  122   471    21    21    17     9    33]
 [    6    52   525    11    13     2    33]
 [  134    29    69 14224   124   344   769]
 [  224    96    64   110  2199    54   191]
 [  107    18    39   337    57  3628   521]
 [  676    83    75  1346   179   936 10924]]


In [58]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.71      0.70      0.71      4517
     disgust       0.46      0.68      0.55       694
        fear       0.65      0.82      0.72       642
         joy       0.87      0.91      0.89     15693
     sadness       0.82      0.75      0.78      2938
    surprise       0.71      0.77      0.74      4707
     neutral       0.84      0.77      0.80     14219

    accuracy                           0.81     43410
   macro avg       0.72      0.77      0.74     43410
weighted avg       0.81      0.81      0.81     43410



In [59]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6527
[[ 261   33    4   67   17   26  187]
 [  27   56    5    8    1    3   12]
 [   2   10   61    5    1    3    5]
 [  32    6   13 1551   26   77  210]
 [  36   12   10   25  188   20   50]
 [  32    8    9   69    9  337  126]
 [ 142   23   14  319   40  161 1088]]


In [60]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.49      0.44      0.46       595
     disgust       0.38      0.50      0.43       112
        fear       0.53      0.70      0.60        87
         joy       0.76      0.81      0.78      1915
     sadness       0.67      0.55      0.60       341
    surprise       0.54      0.57      0.55       590
     neutral       0.65      0.61      0.63      1787

    accuracy                           0.65      5427
   macro avg       0.57      0.60      0.58      5427
weighted avg       0.65      0.65      0.65      5427



In [61]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 257   40    5   63   13   26  178]
 [  16   40    3    9    1    3    9]
 [   5    9   58    3    3    1   10]
 [  44    5   15 1619   32   61  221]
 [  29   13    5   33  193   12   67]
 [  35    8    6   73   20  310  107]
 [ 170   20   11  315   35  166 1049]]
              precision    recall  f1-score   support

       anger       0.46      0.44      0.45       582
     disgust       0.30      0.49      0.37        81
        fear       0.56      0.65      0.60        89
         joy       0.77      0.81      0.79      1997
     sadness       0.65      0.55      0.59       352
    surprise       0.54      0.55      0.54       559
     neutral       0.64      0.59      0.62      1766

    accuracy                           0.65      5426
   macro avg       0.56      0.58      0.57      5426
weighted avg       0.65      0.65      0.65      5426



In [62]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-4')

In [63]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-4')

### Model 5 - Final

In [64]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [65]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [66]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11891.72it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11848.37it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11811.84it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [67]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [68]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [69]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.7883
[[ 2912   110    27   241   240   178   809]
 [  191   333    41    14    61    17    37]
 [    9    30   494    15    51    12    31]
 [  122    25    39 14085   210   346   866]
 [  160    50    38   100  2344    60   186]
 [  102     9    42   423    77  3512   542]
 [  626    28    56  1472   291  1204 10542]]


In [70]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.71      0.64      0.67      4517
     disgust       0.57      0.48      0.52       694
        fear       0.67      0.77      0.72       642
         joy       0.86      0.90      0.88     15693
     sadness       0.72      0.80      0.75      2938
    surprise       0.66      0.75      0.70      4707
     neutral       0.81      0.74      0.77     14219

    accuracy                           0.79     43410
   macro avg       0.71      0.73      0.72     43410
weighted avg       0.79      0.79      0.79     43410



In [71]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6632
[[ 255   15    5   56   29   37  198]
 [  32   48    6    5    5    4   12]
 [   2    4   62    5    4    5    5]
 [  28    3    5 1573   43   56  207]
 [  27    6    6   20  216   16   50]
 [  21    2    9   81   18  349  110]
 [ 108   16    7  306   66  188 1096]]


In [72]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.54      0.43      0.48       595
     disgust       0.51      0.43      0.47       112
        fear       0.62      0.71      0.66        87
         joy       0.77      0.82      0.79      1915
     sadness       0.57      0.63      0.60       341
    surprise       0.53      0.59      0.56       590
     neutral       0.65      0.61      0.63      1787

    accuracy                           0.66      5427
   macro avg       0.60      0.60      0.60      5427
weighted avg       0.66      0.66      0.66      5427



In [73]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 274   17    1   47   22   32  189]
 [  23   35    4    7    1    3    8]
 [   0    4   57    5    5    3   15]
 [  32    4    5 1661   43   56  196]
 [  24    6    7   26  224    8   57]
 [  26    0    5   81   24  306  117]
 [ 133    4   12  287   56  169 1105]]
              precision    recall  f1-score   support

       anger       0.54      0.47      0.50       582
     disgust       0.50      0.43      0.46        81
        fear       0.63      0.64      0.63        89
         joy       0.79      0.83      0.81      1997
     sadness       0.60      0.64      0.62       352
    surprise       0.53      0.55      0.54       559
     neutral       0.66      0.63      0.64      1766

    accuracy                           0.67      5426
   macro avg       0.60      0.60      0.60      5426
weighted avg       0.67      0.67      0.67      5426



In [74]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-5')

In [75]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-5')

### Model 6

In [76]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [77]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [78]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 6
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 12084.08it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11795.44it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11873.74it/s]

Data Shapes : 
(5427, 56) (5427, 56) (5427,)





In [79]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [80]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 6
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [81]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8049
[[ 3279   122    21   348   144   139   464]
 [  159   389    37    30    36    17    26]
 [    9    14   544    22    15    20    18]
 [  119    17    36 14664    86   222   549]
 [  195    43    55   175  2267    66   137]
 [  104     5    43   488    49  3633   385]
 [  668    44    66  2057   192  1028 10164]]


In [82]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.72      0.73      0.72      4517
     disgust       0.61      0.56      0.59       694
        fear       0.68      0.85      0.75       642
         joy       0.82      0.93      0.88     15693
     sadness       0.81      0.77      0.79      2938
    surprise       0.71      0.77      0.74      4707
     neutral       0.87      0.71      0.78     14219

    accuracy                           0.80     43410
   macro avg       0.75      0.76      0.75     43410
weighted avg       0.81      0.80      0.80     43410



In [83]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6471
[[ 267   16    3   88   26   35  160]
 [  26   49    4   12    8    6    7]
 [   1    4   68    5    2    3    4]
 [  28    4    8 1642   18   65  150]
 [  39    6    8   45  182   21   40]
 [  24    8    8  100   14  329  107]
 [ 146   13   18  404   50  181  975]]


In [84]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.50      0.45      0.47       595
     disgust       0.49      0.44      0.46       112
        fear       0.58      0.78      0.67        87
         joy       0.72      0.86      0.78      1915
     sadness       0.61      0.53      0.57       341
    surprise       0.51      0.56      0.53       590
     neutral       0.68      0.55      0.60      1787

    accuracy                           0.65      5427
   macro avg       0.58      0.59      0.58      5427
weighted avg       0.64      0.65      0.64      5427



In [85]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 289   27    3   70   18   29  146]
 [  17   31    4   19    2    2    6]
 [   1    3   62    9    3    5    6]
 [  30    5   11 1716   24   56  155]
 [  23    8    7   47  205   15   47]
 [  30    2    9  109   16  299   94]
 [ 170    7   13  408   42  165  961]]
              precision    recall  f1-score   support

       anger       0.52      0.50      0.51       582
     disgust       0.37      0.38      0.38        81
        fear       0.57      0.70      0.63        89
         joy       0.72      0.86      0.78      1997
     sadness       0.66      0.58      0.62       352
    surprise       0.52      0.53      0.53       559
     neutral       0.68      0.54      0.60      1766

    accuracy                           0.66      5426
   macro avg       0.58      0.59      0.58      5426
weighted avg       0.65      0.66      0.65      5426



In [86]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-6')

In [87]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-6')

### Model 7

In [88]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [89]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [90]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 7
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11719.33it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11875.47it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11747.76it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [91]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [92]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 7
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [93]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8331
[[ 3540    85    22   175   116   123   456]
 [  173   369    71    14    21    18    28]
 [   14    16   555    13     9    11    24]
 [  162    17    37 14247   128   273   829]
 [  179    28    34    81  2417    46   153]
 [  111     6    22   306    62  3302   898]
 [  715    21    58   996   203   489 11737]]


In [94]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.72      0.78      0.75      4517
     disgust       0.68      0.53      0.60       694
        fear       0.69      0.86      0.77       642
         joy       0.90      0.91      0.90     15693
     sadness       0.82      0.82      0.82      2938
    surprise       0.77      0.70      0.74      4707
     neutral       0.83      0.83      0.83     14219

    accuracy                           0.83     43410
   macro avg       0.77      0.78      0.77     43410
weighted avg       0.83      0.83      0.83     43410



In [95]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6508
[[ 291   15    4   59   24   27  175]
 [  35   40   12    6    6    7    6]
 [   3    5   60    7    2    4    6]
 [  41    2    8 1530   33   64  237]
 [  42    7    6   30  192   16   48]
 [  30    3    9   79   13  271  185]
 [ 166   18   12  281   51  111 1148]]


In [96]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.48      0.49      0.48       595
     disgust       0.44      0.36      0.40       112
        fear       0.54      0.69      0.61        87
         joy       0.77      0.80      0.78      1915
     sadness       0.60      0.56      0.58       341
    surprise       0.54      0.46      0.50       590
     neutral       0.64      0.64      0.64      1787

    accuracy                           0.65      5427
   macro avg       0.57      0.57      0.57      5427
weighted avg       0.65      0.65      0.65      5427



In [97]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 314   16    5   42   14   24  167]
 [  22   32    3    9    2    4    9]
 [   3    1   60    9    4    3    9]
 [  42    4    2 1609   41   47  252]
 [  31    9    4   38  203   15   52]
 [  40    3    4   82   20  237  173]
 [ 191    8   15  267   47   96 1142]]
              precision    recall  f1-score   support

       anger       0.49      0.54      0.51       582
     disgust       0.44      0.40      0.42        81
        fear       0.65      0.67      0.66        89
         joy       0.78      0.81      0.79      1997
     sadness       0.61      0.58      0.59       352
    surprise       0.56      0.42      0.48       559
     neutral       0.63      0.65      0.64      1766

    accuracy                           0.66      5426
   macro avg       0.59      0.58      0.59      5426
weighted avg       0.66      0.66      0.66      5426



In [98]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-7')

In [99]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-7')

### Model 8

In [100]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [101]:
X_train.values

array(['my favourite food is anything i did not have to cook myself',
       'now if he does off himself everyone will think hes having a laugh screwing with people instead of actually dead',
       'why the fuck is bayless isoing', ...,
       'what are you talking about ? anything bad that happened was name fault only good things were name doing !',
       'more like a baptism with sexy results !', 'enjoy the ride !'],
      dtype=object)

In [102]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 8
MAX_LENGTH = 56
NUM_LABELS = 7

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train.values,
                                                         y_train.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val.values,
                                                      y_val.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test.values,
                                                      y_test.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 43410/43410 [00:03<00:00, 11459.56it/s]


Data Shapes : 
(43410, 56) (43410, 56) (43410,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 5426/5426 [00:00<00:00, 11268.37it/s]


Data Shapes : 
(5426, 56) (5426, 56) (5426,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 5427/5427 [00:00<00:00, 11482.26it/s]


Data Shapes : 
(5427, 56) (5427, 56) (5427,)


In [103]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [104]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 8
MAX_LENGTH = 56
NUM_LABELS = 7
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [105]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train,y_pred_train_electra)))
print(confusion_matrix(y_train,y_pred_train_electra))

accuracy score: 0.8418
[[ 3535   149    25   193   138   127   350]
 [  119   458    34    18    40    10    15]
 [   11    11   568     9    17     9    17]
 [  148    23    45 14585   127   167   598]
 [  150    58    48   103  2424    53   102]
 [   81    15    62   394    65  3765   325]
 [  655    51    68  1094   237   906 11208]]


In [106]:
print(classification_report(y_train, y_pred_train_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.75      0.78      0.77      4517
     disgust       0.60      0.66      0.63       694
        fear       0.67      0.88      0.76       642
         joy       0.89      0.93      0.91     15693
     sadness       0.80      0.83      0.81      2938
    surprise       0.75      0.80      0.77      4707
     neutral       0.89      0.79      0.84     14219

    accuracy                           0.84     43410
   macro avg       0.76      0.81      0.78     43410
weighted avg       0.85      0.84      0.84     43410



In [107]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test,y_pred_test_electra)))
print(confusion_matrix(y_test,y_pred_test_electra))

accuracy score: 0.6436
[[ 288   25    7   63   41   27  144]
 [  26   51    6    6    8    7    8]
 [   2    3   63    5    6    5    3]
 [  42    5   12 1564   41   57  194]
 [  34    5   10   32  197   18   45]
 [  31    7    9   85   20  334  104]
 [ 167   22   17  320   76  189  996]]


In [108]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test_electra, target_names=class_label_names) )

              precision    recall  f1-score   support

       anger       0.49      0.48      0.49       595
     disgust       0.43      0.46      0.44       112
        fear       0.51      0.72      0.60        87
         joy       0.75      0.82      0.78      1915
     sadness       0.51      0.58      0.54       341
    surprise       0.52      0.57      0.54       590
     neutral       0.67      0.56      0.61      1787

    accuracy                           0.64      5427
   macro avg       0.55      0.60      0.57      5427
weighted avg       0.64      0.64      0.64      5427



In [109]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val,y_pred_val_electra))
print(classification_report(y_val, y_pred_val_electra, target_names=class_label_names) )

[[ 303   29    2   62   22   28  136]
 [  16   38    7    8    3    2    7]
 [   6    5   59    6    2    1   10]
 [  50    4    8 1633   39   57  206]
 [  24   10    8   43  218   10   39]
 [  33    4   10   94   22  294  102]
 [ 199   12   21  330   76  175  953]]
              precision    recall  f1-score   support

       anger       0.48      0.52      0.50       582
     disgust       0.37      0.47      0.42        81
        fear       0.51      0.66      0.58        89
         joy       0.75      0.82      0.78      1997
     sadness       0.57      0.62      0.59       352
    surprise       0.52      0.53      0.52       559
     neutral       0.66      0.54      0.59      1766

    accuracy                           0.64      5426
   macro avg       0.55      0.59      0.57      5426
weighted avg       0.65      0.64      0.64      5426



In [110]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-8')

In [111]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-8')

### Model 1 - Ekman without neutral emotion

In [114]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [115]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train_no_neu.values,
                                                         y_train_no_neu.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val_no_neu.values,
                                                      y_val_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test_no_neu.values,
                                                      y_test_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 29191/29191 [00:02<00:00, 12047.77it/s]


Data Shapes : 
(29191, 56) (29191, 56) (29191,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 3660/3660 [00:00<00:00, 12269.22it/s]


Data Shapes : 
(3660, 56) (3660, 56) (3660,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 3640/3640 [00:00<00:00, 12036.36it/s]

Data Shapes : 
(3640, 56) (3640, 56) (3640,)





In [116]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [117]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 5
MAX_LENGTH = 56
NUM_LABELS = 6
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [118]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train_no_neu,y_pred_train_electra)))
print(confusion_matrix(y_train_no_neu,y_pred_train_electra))

accuracy score: 0.8740
[[ 3758    75    13   410   103   158]
 [  232   369    17    42    24    10]
 [   24    29   507    27    18    37]
 [  282    22    23 14868   147   351]
 [  348    37    31   194  2233    95]
 [  250    11    25   565    78  3778]]


In [119]:
print(classification_report(y_train_no_neu, y_pred_train_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.77      0.83      0.80      4517
     disgust       0.68      0.53      0.60       694
        fear       0.82      0.79      0.81       642
         joy       0.92      0.95      0.94     15693
     sadness       0.86      0.76      0.81      2938
    surprise       0.85      0.80      0.83      4707

    accuracy                           0.87     29191
   macro avg       0.82      0.78      0.79     29191
weighted avg       0.87      0.87      0.87     29191



In [120]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test_no_neu,y_pred_test_electra)))
print(confusion_matrix(y_test_no_neu,y_pred_test_electra))

accuracy score: 0.7668
[[ 401   15    3  108   24   44]
 [  45   46    1   11    2    7]
 [   7    6   55    7    4    8]
 [  76    3    7 1710   34   85]
 [  53    6    4   50  206   22]
 [  66    2    9  127   13  373]]


In [121]:
from sklearn.metrics import classification_report
print(classification_report(y_test_no_neu, y_pred_test_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.62      0.67      0.65       595
     disgust       0.59      0.41      0.48       112
        fear       0.70      0.63      0.66        87
         joy       0.85      0.89      0.87      1915
     sadness       0.73      0.60      0.66       341
    surprise       0.69      0.63      0.66       590

    accuracy                           0.77      3640
   macro avg       0.70      0.64      0.66      3640
weighted avg       0.76      0.77      0.76      3640



In [122]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val_no_neu,y_pred_val_electra))
print(classification_report(y_val_no_neu, y_pred_val_electra, target_names=class_label_names_no_neu) )

[[ 405   18    4   95   23   37]
 [  31   33    3    9    1    4]
 [   7    6   57    6    5    8]
 [  85    3    5 1800   32   72]
 [  43   11    6   53  215   24]
 [  66    3    6  116   21  347]]
              precision    recall  f1-score   support

       anger       0.64      0.70      0.66       582
     disgust       0.45      0.41      0.43        81
        fear       0.70      0.64      0.67        89
         joy       0.87      0.90      0.88      1997
     sadness       0.72      0.61      0.66       352
    surprise       0.71      0.62      0.66       559

    accuracy                           0.78      3660
   macro avg       0.68      0.65      0.66      3660
weighted avg       0.78      0.78      0.78      3660



In [123]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-no-neutral-1')

In [125]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-no-neutral-1')

### Model 2 - Ekman without neutral emotion

In [126]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [127]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 6
MAX_LENGTH = 56
NUM_LABELS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train_no_neu.values,
                                                         y_train_no_neu.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val_no_neu.values,
                                                      y_val_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test_no_neu.values,
                                                      y_test_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 29191/29191 [00:02<00:00, 11678.45it/s]


Data Shapes : 
(29191, 56) (29191, 56) (29191,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 3660/3660 [00:00<00:00, 11626.03it/s]


Data Shapes : 
(3660, 56) (3660, 56) (3660,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 3640/3640 [00:00<00:00, 11749.25it/s]

Data Shapes : 
(3640, 56) (3640, 56) (3640,)





In [128]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [129]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 6
MAX_LENGTH = 56
NUM_LABELS = 6
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [130]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train_no_neu,y_pred_train_electra)))
print(confusion_matrix(y_train_no_neu,y_pred_train_electra))

accuracy score: 0.8850
[[ 3691   129    20   332   147   198]
 [  144   450    27    32    30    11]
 [   14    21   549    14    20    24]
 [  261    28    43 14729   206   426]
 [  194    48    38   149  2412    97]
 [  154    14    33   434    70  4002]]


In [131]:
print(classification_report(y_train_no_neu, y_pred_train_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.83      0.82      0.82      4517
     disgust       0.65      0.65      0.65       694
        fear       0.77      0.86      0.81       642
         joy       0.94      0.94      0.94     15693
     sadness       0.84      0.82      0.83      2938
    surprise       0.84      0.85      0.85      4707

    accuracy                           0.88     29191
   macro avg       0.81      0.82      0.82     29191
weighted avg       0.89      0.88      0.88     29191



In [132]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test_no_neu,y_pred_test_electra)))
print(confusion_matrix(y_test_no_neu,y_pred_test_electra))

accuracy score: 0.7703
[[ 378   27    4   97   34   55]
 [  33   54    5    6    6    8]
 [   5    6   60    7    4    5]
 [  78    2   11 1686   42   96]
 [  45    8    5   39  217   27]
 [  43    8    8  104   18  409]]


In [133]:
from sklearn.metrics import classification_report
print(classification_report(y_test_no_neu, y_pred_test_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.65      0.64      0.64       595
     disgust       0.51      0.48      0.50       112
        fear       0.65      0.69      0.67        87
         joy       0.87      0.88      0.87      1915
     sadness       0.68      0.64      0.66       341
    surprise       0.68      0.69      0.69       590

    accuracy                           0.77      3640
   macro avg       0.67      0.67      0.67      3640
weighted avg       0.77      0.77      0.77      3640



In [134]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val_no_neu,y_pred_val_electra))
print(classification_report(y_val_no_neu, y_pred_val_electra, target_names=class_label_names_no_neu) )

[[ 377   28    6   85   29   57]
 [  23   39    3    8    2    6]
 [   4    6   62    5    6    6]
 [  70    3    8 1775   46   95]
 [  37   11    7   40  229   28]
 [  49    5    6   96   25  378]]
              precision    recall  f1-score   support

       anger       0.67      0.65      0.66       582
     disgust       0.42      0.48      0.45        81
        fear       0.67      0.70      0.69        89
         joy       0.88      0.89      0.89      1997
     sadness       0.68      0.65      0.66       352
    surprise       0.66      0.68      0.67       559

    accuracy                           0.78      3660
   macro avg       0.67      0.67      0.67      3660
weighted avg       0.78      0.78      0.78      3660



In [135]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-no-neutral-2')

In [137]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-no-neutral-2')

### Model 3 - Final

In [138]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [139]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 7
MAX_LENGTH = 56
NUM_LABELS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train_no_neu.values,
                                                         y_train_no_neu.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val_no_neu.values,
                                                      y_val_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test_no_neu.values,
                                                      y_test_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 29191/29191 [00:02<00:00, 11574.56it/s]


Data Shapes : 
(29191, 56) (29191, 56) (29191,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 3660/3660 [00:00<00:00, 11104.38it/s]


Data Shapes : 
(3660, 56) (3660, 56) (3660,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 3640/3640 [00:00<00:00, 11311.06it/s]

Data Shapes : 
(3640, 56) (3640, 56) (3640,)





In [140]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [141]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 7
MAX_LENGTH = 56
NUM_LABELS = 6
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [142]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train_no_neu,y_pred_train_electra)))
print(confusion_matrix(y_train_no_neu,y_pred_train_electra))

accuracy score: 0.8904
[[ 3721    93    19   333   152   199]
 [  139   476    12    27    26    14]
 [   14    20   541    19    24    24]
 [  242    20    54 14684   193   500]
 [  196    35    34   126  2453    94]
 [  122    13    47   324    83  4118]]


In [143]:
print(classification_report(y_train_no_neu, y_pred_train_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.84      0.82      0.83      4517
     disgust       0.72      0.69      0.70       694
        fear       0.77      0.84      0.80       642
         joy       0.95      0.94      0.94     15693
     sadness       0.84      0.83      0.84      2938
    surprise       0.83      0.87      0.85      4707

    accuracy                           0.89     29191
   macro avg       0.82      0.83      0.83     29191
weighted avg       0.89      0.89      0.89     29191



In [144]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test_no_neu,y_pred_test_electra)))
print(confusion_matrix(y_test_no_neu,y_pred_test_electra))

accuracy score: 0.7709
[[ 372   19    7  103   34   60]
 [  30   56    3    8    8    7]
 [   4    5   65    5    4    4]
 [  66    3   12 1669   43  122]
 [  38    5    8   40  220   30]
 [  39    8    9   96   14  424]]


In [145]:
from sklearn.metrics import classification_report
print(classification_report(y_test_no_neu, y_pred_test_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.68      0.63      0.65       595
     disgust       0.58      0.50      0.54       112
        fear       0.62      0.75      0.68        87
         joy       0.87      0.87      0.87      1915
     sadness       0.68      0.65      0.66       341
    surprise       0.66      0.72      0.69       590

    accuracy                           0.77      3640
   macro avg       0.68      0.68      0.68      3640
weighted avg       0.77      0.77      0.77      3640



In [146]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val_no_neu,y_pred_val_electra))
print(classification_report(y_val_no_neu, y_pred_val_electra, target_names=class_label_names_no_neu) )

[[ 378   28    6   87   36   47]
 [  21   39    3    9    5    4]
 [   5    5   64    5    8    2]
 [  69    2   14 1752   44  116]
 [  40    9    7   41  236   19]
 [  44    4    9   89   28  385]]
              precision    recall  f1-score   support

       anger       0.68      0.65      0.66       582
     disgust       0.45      0.48      0.46        81
        fear       0.62      0.72      0.67        89
         joy       0.88      0.88      0.88      1997
     sadness       0.66      0.67      0.67       352
    surprise       0.67      0.69      0.68       559

    accuracy                           0.78      3660
   macro avg       0.66      0.68      0.67      3660
weighted avg       0.78      0.78      0.78      3660



In [147]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-no-neutral-3')

In [148]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-no-neutral-3')

### Model 4

In [149]:
def create_data(text,labels,tokenizer,max_length):

    # Empty Arrays
    input_ids = []
    attention_mask = []
    
    # Loop Through List and Create input ids & attention mask
    for i in tqdm.tqdm(text,total=len(text)):
        x = tokenizer.encode_plus(str(i),
                              return_attention_mask=True,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length,
                              add_special_tokens=True,
                              )
        input_ids.append(x['input_ids'])
        attention_mask.append(x['attention_mask'])
        
    print('Data Shapes : ')
    print(np.array(input_ids).shape,np.array(attention_mask).shape,labels.shape )
    
    return  np.array(input_ids) , np.array(attention_mask) , np.array(labels).reshape(-1,1)

X_train.values, y_train, X_test, y_test, X_val, y_val

In [150]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 8
MAX_LENGTH = 56
NUM_LABELS = 6

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print('Start Create Train Data')
# Create Train Data
train_ids , train_attention_mask ,train_labels = create_data(X_train_no_neu.values,
                                                         y_train_no_neu.values ,
                                                         tokenizer
                                                         ,MAX_LENGTH)
print('Start Create Val Data')
# Create Test Data
val_ids , val_attention_mask ,val_labels = create_data(X_val_no_neu.values,
                                                      y_val_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )
print('Start Create Test Data')
# Create Test Data
test_ids , test_attention_mask ,test_labels = create_data(X_test_no_neu.values,
                                                      y_test_no_neu.values,
                                                      tokenizer ,
                                                      MAX_LENGTH
                                                         )

Start Create Train Data


100%|█████████████████████████████████████████████████████████████████████████| 29191/29191 [00:02<00:00, 11472.37it/s]


Data Shapes : 
(29191, 56) (29191, 56) (29191,)
Start Create Val Data


100%|███████████████████████████████████████████████████████████████████████████| 3660/3660 [00:00<00:00, 11663.10it/s]


Data Shapes : 
(3660, 56) (3660, 56) (3660,)
Start Create Test Data


100%|███████████████████████████████████████████████████████████████████████████| 3640/3640 [00:00<00:00, 11636.62it/s]

Data Shapes : 
(3640, 56) (3640, 56) (3640,)





In [151]:
def train_custom_label_model(model_name,num_labels,MAX_LENGTH,Num_epochs):
    
    gc.collect()
    
    my_model = TFAutoModelForSequenceClassification.from_pretrained(model_name,
                                                                    ignore_mismatched_sizes=True
                                                                   ,num_labels=num_labels
                                                                   )
    
    print('FINISH DOWNLOAD MODEL')
#     run_opts = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True)
    my_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.metrics.SparseCategoricalAccuracy(),
#         options = run_opts
        )

    #train , test = train_test_split(df,test_size=.3,random_state=42)


    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    print('FINSH DOWNLOAD TOKENIZER')
    
       
    # Use GPU
    tf.debugging.set_log_device_placement(True)

       
    # Model Checkpoint
    checkpoint_filepath = './'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_sparse_categorical_crossentropy',
        mode='max',
        save_best_only=True,
    )
    
    # Train Model
    my_model.fit([train_ids, train_attention_mask],
                 train_labels,
              validation_split=.3,
              epochs=Num_epochs,
              verbose=1,
              callbacks=[model_checkpoint_callback]
              )
    
    gc.collect()
    
    return my_model

In [152]:
MODEL_NAME = 'bhadresh-savani/electra-base-emotion'
EPOCHS = 8
MAX_LENGTH = 56
NUM_LABELS = 6
learning_rate = 3e-5
electra_model = train_custom_label_model(MODEL_NAME,NUM_LABELS,MAX_LENGTH,EPOCHS)

FINISH DOWNLOAD MODEL
FINSH DOWNLOAD TOKENIZER
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [153]:
# train predictions
y_pred_train_electra = electra_model.predict([train_ids,train_attention_mask])
y_pred_train_electra = tf.argmax(y_pred_train_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_train_no_neu,y_pred_train_electra)))
print(confusion_matrix(y_train_no_neu,y_pred_train_electra))

accuracy score: 0.8999
[[ 3927    76    11   240   197    66]
 [  136   487    12    26    28     5]
 [   18    20   542    15    38     9]
 [  330    18    25 14846   237   237]
 [  162    21    19   105  2597    34]
 [  281    11    21   344   179  3871]]


In [154]:
print(classification_report(y_train_no_neu, y_pred_train_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.81      0.87      0.84      4517
     disgust       0.77      0.70      0.73       694
        fear       0.86      0.84      0.85       642
         joy       0.95      0.95      0.95     15693
     sadness       0.79      0.88      0.84      2938
    surprise       0.92      0.82      0.87      4707

    accuracy                           0.90     29191
   macro avg       0.85      0.84      0.85     29191
weighted avg       0.90      0.90      0.90     29191



In [155]:
# test predictions
y_pred_test_electra = electra_model.predict([test_ids,test_attention_mask])

y_pred_test_electra = tf.argmax(y_pred_test_electra.logits,axis=1)
print('accuracy score: {:.4f}'.format(accuracy_score(y_test_no_neu,y_pred_test_electra)))
print(confusion_matrix(y_test_no_neu,y_pred_test_electra))

accuracy score: 0.7607
[[ 407   19    1   78   56   34]
 [  32   49    1    8   16    6]
 [   6    6   58    6    8    3]
 [  89    2    4 1677   64   79]
 [  50    5    7   33  233   13]
 [  79    5    8  111   42  345]]


In [156]:
from sklearn.metrics import classification_report
print(classification_report(y_test_no_neu, y_pred_test_electra, target_names=class_label_names_no_neu) )

              precision    recall  f1-score   support

       anger       0.61      0.68      0.65       595
     disgust       0.57      0.44      0.49       112
        fear       0.73      0.67      0.70        87
         joy       0.88      0.88      0.88      1915
     sadness       0.56      0.68      0.61       341
    surprise       0.72      0.58      0.64       590

    accuracy                           0.76      3640
   macro avg       0.68      0.66      0.66      3640
weighted avg       0.77      0.76      0.76      3640



In [157]:
# val predictions
y_pred_val_electra = electra_model.predict([val_ids,val_attention_mask])
y_pred_val_electra = tf.argmax(y_pred_val_electra.logits,axis=1)
print(confusion_matrix(y_val_no_neu,y_pred_val_electra))
print(classification_report(y_val_no_neu, y_pred_val_electra, target_names=class_label_names_no_neu) )

[[ 408   27    4   65   49   29]
 [  24   37    1    9    7    3]
 [   7    4   55    5   14    4]
 [ 116    2    3 1739   72   65]
 [  42   11    4   33  247   15]
 [  81    2    5  107   49  315]]
              precision    recall  f1-score   support

       anger       0.60      0.70      0.65       582
     disgust       0.45      0.46      0.45        81
        fear       0.76      0.62      0.68        89
         joy       0.89      0.87      0.88      1997
     sadness       0.56      0.70      0.63       352
    surprise       0.73      0.56      0.64       559

    accuracy                           0.77      3660
   macro avg       0.67      0.65      0.65      3660
weighted avg       0.77      0.77      0.77      3660



In [158]:
## Saving model
electra_model.save_pretrained('electra-emotion-predictor-ekman-no-neutral-4')

In [159]:
electra_model1 = TFAutoModelForSequenceClassification.from_pretrained('electra-emotion-predictor-ekman-no-neutral-4')