**Importing**

In [None]:
import http.client
import json
import pickle
import re
import string
import urllib.parse
import urllib.request

import nltk
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras.utils.np_utils import to_categorical
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**MELD Preprocessing**

In [None]:
import pandas as pd 

In [None]:
# Read the MELD train and test CSV splits into pandas DataFrames.
train_data = pd.read_csv("/content/train_sent_emo_dya.csv")
test_data = pd.read_csv("/content/test_sent_emo_dya.csv")

In [None]:
#Creating a list of strings with MELD dataset sentences
#Input : data in pandas format. Output : List of Strings, List of respective labels
MELD_EMOTION_IDS = {
  'anger': 0,
  'disgust': 1,
  'fear': 2,
  'joy': 3,
  'neutral': 4,
  'sadness': 5,
  'surprise': 6,
}

def preprocess_meld(data):
  """Split a MELD frame into parallel lists of utterances and integer labels.

  Args:
    data: object exposing `Utterance` and `Emotion` as iterables of equal
      length (e.g. a pandas DataFrame with those two columns).

  Returns:
    (utterances, label_ids): the utterances as a list, and one integer id per
    utterance taken from MELD_EMOTION_IDS.

  Raises:
    ValueError: if an emotion value is not one of the seven known names.
      Skipping it instead would leave the two lists with different lengths
      and shift every later label onto the wrong utterance.
  """
  utterances = list(data.Utterance)
  label_ids = []
  for position, emotion in enumerate(data.Emotion):
    if emotion not in MELD_EMOTION_IDS:
      raise ValueError(
          "Unknown emotion label %r at row %d" % (emotion, position))
    label_ids.append(MELD_EMOTION_IDS[emotion])
  return utterances, label_ids

In [None]:
# Training split -> parallel lists of utterances and integer emotion labels.
(processed_data, processed_label) = preprocess_meld(train_data)

In [None]:
# Test split -> parallel lists of utterances and integer emotion labels.
(test_processed_data, test_processed_label) = preprocess_meld(test_data)

**ConceptNet augmentation and VADER sentiment features**

In [None]:
def _conceptnet_related_terms(token, timeout=10):
  """Look up the English terms ConceptNet lists as related to `token`.

  Returns the terms in the order the API lists them, or None when the request
  or the parsing of its reply fails.
  """
  url = ('http://api.conceptnet.io/related/c/en/'
         + urllib.parse.quote(token) + '?filter=/c/en')
  try:
    with urllib.request.urlopen(url, timeout=timeout) as reply:
      payload = json.load(reply)
    concept_ids = [entry['@id'] for entry in payload['related']]
  except (OSError, ValueError, LookupError, TypeError,
          http.client.HTTPException):
    # Best effort: a failed lookup only means this token is left unchanged.
    return None
  # '/c/en/ice_cream' -> ' c en ice_cream' -> 'ice_cream'
  return [re.sub(r'[^\w]', ' ', concept_id)[6:] for concept_id in concept_ids]


def ConceptNet_Sentences(data,labels):
  """Augment sentences by swapping content words for ConceptNet related terms.

  For every sentence the output holds the original followed by two variants.
  Variant k (k = 1, 2) replaces each non-stopword token with the k-th entry of
  that token's related-term list; tokens whose lookup fails or has too few
  entries are left as they are. Every variant carries the label of its source
  sentence.

  Args:
    data: indexable sequence of sentence strings.
    labels: indexable sequence with one label per sentence.

  Returns:
    (conceptnet_data, conceptnet_labels): two lists, three entries per input
    sentence.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  # stopwords.words() is rebuilt on every call, so read it once and use a set
  # for the membership tests. No language is passed, as in the original code.
  stop_words = set(stopwords.words())
  # Successful lookups are remembered so each distinct token is fetched once;
  # failures are not cached, so a later sentence can retry the token.
  related_cache = {}

  def related_terms(token):
    if token not in related_cache:
      terms = _conceptnet_related_terms(token)
      if terms is None:
        return []
      related_cache[token] = terms
    return related_cache[token]

  conceptnet_data = []
  conceptnet_labels = []
  for index in range(len(data)):
    sentence = data[index]
    label = labels[index]
    conceptnet_data.append(sentence)
    conceptnet_labels.append(label)
    cleaned = sentence.translate(strip_punctuation)
    tokens = [word for word in nltk.word_tokenize(cleaned)
              if word not in stop_words]
    for rank in range(1, 3):
      variant = sentence
      for token in tokens:
        terms = related_terms(token)
        if rank >= len(terms):
          continue
        replacement = terms[rank]
        # Whole-word match only, so replacing "cat" leaves "scattered" alone.
        # A callable replacement keeps backslashes in the term literal.
        pattern = r'(?<!\w)' + re.escape(token) + r'(?!\w)'
        variant = re.sub(pattern, lambda _match: replacement, variant)
      conceptnet_data.append(variant)
      conceptnet_labels.append(label)
  return conceptnet_data, conceptnet_labels

In [None]:
# Expand the training set: each sentence is followed by its two ConceptNet variants.
(processed_data, processed_label) = ConceptNet_Sentences(processed_data, processed_label)

In [None]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 24.6 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 28.5 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 26.8 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 19.8 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 14.6 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 11.0 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 11.8 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 13.1 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 11.3 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 12.2 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 12.2 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 12.2 MB/s eta 0:00:01[K     |████████████████████████████████| 125 

In [None]:
def add_Vader_Features(sentences):
  """Append VADER-derived tag words to each sentence.

  Each sentence is scored with the module-level `analyzer`. For every score
  strictly above 0.2 the matching tag word is added after the sentence, in the
  order: 'negative', 'positive', 'neutral', 'compound'. Sentence and tags are
  joined with single spaces.

  Args:
    sentences: iterable of sentence strings.

  Returns:
    List of tagged sentence strings, one per input sentence.
  """
  score_tags = (
      ('neg', 'negative'),
      ('pos', 'positive'),
      ('neu', 'neutral'),
      ('compound', 'compound'),
  )
  tagged_sentences = []
  for sentence in sentences:
    scores = analyzer.polarity_scores(sentence)
    tags = [tag for key, tag in score_tags if scores[key] > 0.2]
    tagged_sentences.append(' '.join([sentence] + tags))
  return tagged_sentences

In [None]:
# Tag every training sentence with its VADER tag words.
processed_data = add_Vader_Features(sentences=processed_data)

In [None]:
# One-hot encode the 7 emotion ids and hold the sentences in a NumPy array.
Y = to_categorical(processed_label, num_classes=7)
processed_data = np.asarray(processed_data)

**Preparing the model**

In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
!pip install sentencepiece
import tokenization

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 23.4 MB/s eta 0:00:01[K     |▌                               | 20 kB 27.9 MB/s eta 0:00:01[K     |▉                               | 30 kB 14.7 MB/s eta 0:00:01[K     |█                               | 40 kB 11.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 11.0 MB/s eta 0:00:01[K     |█▋                              | 61 kB 10.7 MB/s eta 0:00:01[K     |██                              | 71 kB 10.2 MB/s eta 0:00:01[K     |██▏                             | 81 kB 11.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 12.4 MB/s eta 0:00:01[K     |██▊                             | 102 kB 11.8 MB/s eta 0:00:01[K     |███                             | 112 kB 11.8 MB/s eta 0:00:01[K     |███▎                            | 122 kB 11.8 MB/s eta 0:00:01[K     |██

In [None]:
def albert_encode(texts, tokenizer, max_len=512):
    """Turn texts into fixed-length id, mask and segment arrays.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    "[CLS]" / "[SEP]", converted to ids and right-padded with 0 up to
    ``max_len``.

    Args:
        texts: iterable of strings.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every output row.

    Returns:
        Tuple ``(token_ids, masks, segments)`` of NumPy arrays built from one
        row per text. Masks are 1 over the real tokens and 0 over padding;
        segments are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    body_limit = max_len - 2  # leave room for the two marker tokens

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_limit]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        id_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)
def build_model(bert_layer, max_len=512, learning_rate=2e-6, num_classes=7):
    """Build and compile a classifier on top of an encoder layer.

    Args:
        bert_layer: callable that takes ``[word_ids, mask, segment_ids]`` and
            returns ``(pooled_output, sequence_output)``.
        max_len: length of each of the three integer input sequences.
        learning_rate: step size passed to the Adam optimizer.
        num_classes: width of the final softmax layer.

    Returns:
        A compiled Keras ``Model`` with three inputs and a softmax output over
        ``num_classes`` classes.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the pooled output feeds the classification head.
    pooled_output, _ = bert_layer([input_word_ids, input_mask, segment_ids])
    out = Dense(100, activation='relu')(pooled_output)
    out = Dense(100, activation='relu')(out)
    out = Dense(num_classes, activation='softmax')(out)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # A softmax over mutually exclusive classes trained on one-hot targets
    # needs categorical cross-entropy; binary cross-entropy would score every
    # class as an independent yes/no problem. `learning_rate` replaces the
    # deprecated `lr` argument.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Load the ALBERT encoder from TF Hub as a trainable Keras layer.
module_url = "https://tfhub.dev/tensorflow/albert_en_base/1"
albert_layer = hub.KerasLayer(
    module_url,
    trainable=True,
)

In [None]:
# Build the SentencePiece tokenizer from the model file bundled with the hub
# layer, then encode the training sentences to 160 tokens each.
sp_model_file = albert_layer.resolved_object.sp_model_file.asset_path.numpy()
tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)
train_labels = Y
train_input = albert_encode(processed_data, tokenizer, max_len=160)

In [None]:
# Build the classifier with the same 160-token length used for encoding.
model = build_model(bert_layer=albert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 11683584    input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
# Train for 2 epochs, one example per batch.
train_history = model.fit(train_input, train_labels, epochs=2, batch_size=1)

Epoch 1/2
Epoch 2/2


**Preprocessing Test Data and passing through the model for accuracy**

In [None]:
# The test utterances are already plain strings; calling ' '.join() on a string
# would put a space between every character. Instead, give them the same VADER
# tag words the training sentences received so both splits share one input format.
test_processed_data=np.asarray(add_Vader_Features(test_processed_data))
test_Y=to_categorical(test_processed_label, num_classes=7)

In [None]:
# Encode the held-out test sentences (not the training ones) so the predictions
# line up one-to-one with the test labels used for scoring.
test_input = albert_encode(test_processed_data, tokenizer, max_len=160)
test_labels = test_Y

In [None]:
# Class probabilities for every encoded test example.
y_pred = model.predict(
    test_input,
    batch_size=1,
)

In [None]:
# Take the most probable class per example and score it against the test labels.
y_pred1 = np.argmax(y_pred, axis=1)
test_processed_label = np.asarray(test_processed_label)
accuracy = accuracy_score(test_processed_label, y_pred1)
weighted_f1 = f1_score(test_processed_label, y_pred1, average='weighted')
print("Accuracy : " + str(accuracy))
print("Weighted F1-score : " + str(weighted_f1))

Accuracy : 0.6214679802955665
Weighted F1-score : 0.5871660775699538
