**Required Installs for the project**

In [None]:
# vaderSentiment supplies the SentimentIntensityAnalyzer imported in the imports cell.
!pip install vaderSentiment
# Fetches tokenization.py into the working directory so that `import tokenization` resolves.
# NOTE(review): this pulls from the `master` branch, so the downloaded file is not pinned.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
# NOTE(review): presumably needed by the SentencePiece tokenizer in tokenization.py - confirm.
!pip install sentencepiece

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 32.2 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 18.4 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 15.2 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 13.9 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 5.5 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 6.4 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 7.4 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 7.4 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 6.1 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 6.7 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 6.7 M

**Required Imports for the project**

In [None]:
import pickle
import string
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
from keras.utils.np_utils import to_categorical  
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import jaccard_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Loading the training and test data**

**Importing the Dataset.** (Replace the file location with your local path to the Dataset)

In [None]:
# Root folder holding the IEMOCAP pickles. Change this single line to point at
# your own copy of the dataset.
# The previous version of this cell mixed '/content/gdrive/My Drive/...' and
# '/content/gdrive/MyDrive/...'; one root keeps all six paths consistent.
# NOTE(review): confirm which spelling your Drive mount exposes.
DATASET_DIR = '/content/gdrive/MyDrive/Dataset/iemocap'

def _load_pickle(relative_path):
  """Unpickle and return the object stored at DATASET_DIR/relative_path.

  pickle.load can execute arbitrary code, so only use this on files you trust.
  """
  with open(DATASET_DIR + '/' + relative_path, 'rb') as f:
    return pickle.load(f)

# Training split.
data = _load_pickle('train/sentences.pkl')
labels = _load_pickle('train/labels.pkl')
train_convlen = _load_pickle('train/conversation_length.pkl')

# Test split.
test_data = _load_pickle('test/sentences.pkl')
test_labels = _load_pickle('test/labels.pkl')
test_convlen = _load_pickle('test/conversation_length.pkl')

**Preprocessing the Training Data**

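Functions to strip the `<eos>`/`<pad>` markers and flatten the data into one token list per utterance (`preprocessing`), and to reorder each conversation so that its even-indexed utterances come first, followed by its odd-indexed utterances (`arrange_sentences`)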

In [None]:
def arrange_sentences(data,data2,data3):
  """Reorder flat sentence/label lists conversation by conversation.

  data  : iterable of conversation lengths (number of consecutive entries of
          data2/data3 that belong to each conversation).
  data2 : flat sequence of sentences.
  data3 : flat sequence of labels, parallel to data2.

  Within each conversation the entries at even positions (0, 2, 4, ...) are
  emitted first, then the entries at odd positions (1, 3, 5, ...). The order of
  the conversations themselves is unchanged.

  Returns (sentences, labels) as two new lists.
  """
  ordered_sentences=[]
  ordered_labels=[]
  offset=0
  for conv_len in data:
    # Absolute indices of this conversation; empty when conv_len <= 0.
    positions=range(offset,offset+conv_len)
    # Pass 1 takes every second index starting at the first one,
    # pass 2 takes the remaining (odd-position) indices.
    for half in (positions[0::2],positions[1::2]):
      for pos in half:
        ordered_sentences.append(data2[pos])
        ordered_labels.append(data3[pos])
    offset+=len(positions)
  return ordered_sentences,ordered_labels

def preprocessing(data,labels):
  """Flatten nested token data and nested labels.

  data   : three-level nesting (group -> token sequence -> token).
  labels : two-level nesting (group -> label).

  Returns (processed_data, processed_label):
    processed_data  - one list per token sequence, in order, with the
                      '<eos>' and '<pad>' marker tokens removed;
    processed_label - all labels concatenated into a single flat list.
  """
  processed_data=[
    [token for token in token_seq if token != '<eos>' and token != '<pad>']
    for group in data
    for token_seq in group
  ]
  processed_label=[label for group in labels for label in group]
  return processed_data,processed_label

In [None]:
# Strip the marker tokens and flatten to one token list per utterance.
processed_data,processed_label = preprocessing(data,labels)
# Turn every token list into a single space-separated string.
processed_data = [' '.join(tokens) for tokens in processed_data]
# Regroup each conversation: even-position utterances first, then odd-position ones.
processed_data,processed_label=arrange_sentences(train_convlen,processed_data,processed_label)

**Overview of the training data**

In [None]:
# Size of the training set after preprocessing.
utterance_total = len(processed_data)
print("Total number of utterances in the training data : " + str(utterance_total))
print()

# Peek at the first ten utterance strings.
print("First 10 utterances : ")
for position in range(10):
  print(processed_data[position])
print()

# Number of entries in the conversation-length list.
conversation_total = len(train_convlen)
print("Total number of conversations in the training data : " + str(conversation_total))

Total number of utterances in the training data : 4699

First 10 utterances : 
thank you for calling sprint . we care about everybody . how can i help you ?
are you sure you did n't make them ?
can i get your phone number , please ?
i 'm going to need it again . i need to look at your file . if you want me to look at your file
okay .
i am seeing the two hundred dollar charge .
well i 'm looking at these .
so can you please refund the mischarged charges
i did not make the calls from b.f.e . nebraska .
so could you discretion it back ?

Total number of conversations in the training data : 96


**Adding ConceptNet Features**

In [None]:
def ConceptNet_Sentences(data,labels):
  """Augment sentences with ConceptNet "related term" substitutions.

  For every sentence in `data` three entries are appended to the output, all
  carrying the sentence's label:
    1. the original sentence;
    2. a copy in which each non-stopword token is replaced by the term at
       index 1 of ConceptNet's related list for that token;
    3. the same, using the term at index 2.
  A token whose lookup fails, or which has too few related terms, is left
  unchanged in that copy.

  Parameters
    data   : sequence of sentence strings.
    labels : sequence of labels, parallel to `data`.

  Returns
    (conceptnet_data, conceptnet_labels) - two parallel lists, three entries
    per input sentence.

  Notes
    * Needs network access to api.conceptnet.io.
    * Earlier revisions used `requests` and `re` without importing them; the
      resulting NameError was swallowed by a blanket `except Exception`, so no
      substitution ever happened. The lookup now uses the standard library and
      only network/parse failures are treated as "no related terms".
    * Each distinct token is looked up once and the answer is reused.
    * Replacement matches whole words only, so replacing "go" no longer
      rewrites the inside of "going".
  """
  import http.client
  import json
  import re
  import urllib.parse
  import urllib.request

  conceptnet_data=[]
  conceptnet_labels=[]
  if len(data)==0:
    return conceptnet_data,conceptnet_labels

  # Built once instead of once per token. stopwords.words() with no argument
  # is kept as in the original call.
  # NOTE(review): confirm whether only English stopwords were intended.
  stop_words=set(stopwords.words())
  strip_punctuation=str.maketrans('', '', string.punctuation)
  related_cache={}

  def related_ids(token):
    """Return the list of '@id' strings ConceptNet relates to `token` ([] on failure)."""
    if token not in related_cache:
      url=('http://api.conceptnet.io/related/c/en/'
           + urllib.parse.quote(token) + '?filter=/c/en')
      try:
        with urllib.request.urlopen(url, timeout=10) as reply:
          payload=json.load(reply)
        related_cache[token]=[entry['@id'] for entry in payload['related']]
      except (OSError, http.client.HTTPException, ValueError, KeyError, TypeError):
        # Best effort: an unreachable API or malformed reply means "no substitutes".
        related_cache[token]=[]
    return related_cache[token]

  for index,current_sentence in enumerate(data):
    conceptnet_data.append(current_sentence)
    conceptnet_labels.append(labels[index])

    cleaned_sentence=current_sentence.translate(strip_punctuation)
    tokens=[word for word in nltk.word_tokenize(cleaned_sentence) if word not in stop_words]

    # rank 1 and rank 2 of the related list each produce one augmented copy.
    for rank in range(1,3):
      augmented=current_sentence
      for token in tokens:
        candidates=related_ids(token)
        if rank>=len(candidates):
          continue
        # '/c/en/some_term' -> ' c en some_term' -> 'some_term'
        replacement=re.sub(r'[^\w]', ' ', candidates[rank])[6:]
        augmented=re.sub(r'\b'+re.escape(token)+r'\b',
                         lambda _match, text=replacement: text,
                         augmented)
      conceptnet_data.append(augmented)
      conceptnet_labels.append(labels[index])
  return conceptnet_data,conceptnet_labels

In [None]:
# Replace the training set with its ConceptNet-augmented version: for every
# input sentence the function emits the original plus two variants.
# NOTE: the result is assigned back to the same names, so running this cell a
# second time feeds the already-augmented data through the function again.
processed_data,processed_label = ConceptNet_Sentences(processed_data,processed_label)

**Adding Contextual Info from VADER**

In [None]:
# Module-level VADER analyzer; add_Vader_Features reads this global by name.
analyzer = SentimentIntensityAnalyzer()

Function to add VADER features

In [None]:
def add_Vader_Features(sentences):
  """Append VADER-derived tag words to each sentence.

  Each sentence is scored with the module-level `analyzer`. For every score
  that is greater than 0.2 the matching tag word is appended, in this fixed
  order: 'neg' -> 'negative', 'pos' -> 'positive', 'neu' -> 'neutral',
  'compound' -> 'compound'. The sentence and its tags are joined with single
  spaces.

  Returns a new list with one tagged string per input sentence.
  """
  # (score key, tag word) pairs, in the order the tags are appended.
  score_tags=(
    ('neg','negative'),
    ('pos','positive'),
    ('neu','neutral'),
    ('compound','compound'),
  )
  tagged_sentences=[]
  for sentence in sentences:
    scores=analyzer.polarity_scores(sentence)
    parts=[sentence]+[tag for key,tag in score_tags if scores[key] > 0.2]
    tagged_sentences.append(' '.join(parts))
  return tagged_sentences

In [None]:
# Append the VADER tag words to every training sentence.
processed_data = add_Vader_Features(processed_data)

Convert the sentences and labels into arrays that can be passed to the model

In [None]:
# Training sentences as a NumPy array of strings.
processed_data=np.asarray(processed_data)
# One-hot encode the labels over 6 classes.
Y=to_categorical(processed_label, num_classes=6)

**Preparing the Model**

In [None]:
def albert_encode(texts, tokenizer, max_len=512):
    """Encode texts into fixed-length id, mask and segment arrays.

    Each text is tokenized with `tokenizer.tokenize`, cut to at most
    `max_len - 2` tokens, wrapped as "[CLS]" ... "[SEP]", converted with
    `tokenizer.convert_tokens_to_ids`, and right-padded with 0 up to `max_len`.

    Returns a tuple of three NumPy arrays, one row per text:
      token ids, padding mask (1 for real tokens, 0 for padding),
      segment ids (all zeros).
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for text in texts:
        # Leave room for the two special tokens.
        pieces = tokenizer.tokenize(text)[:max_len-2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = max_len - used

        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids += [0] * padding

        id_rows.append(ids)
        mask_rows.append([1] * used + [0] * padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)
def build_model(bert_layer, max_len=512, num_classes=6):
    """Build and compile a classifier on top of a BERT-style hub layer.

    Parameters
      bert_layer  : callable layer taking [word_ids, mask, segment_ids] and
                    returning (pooled_output, sequence_output).
      max_len     : fixed sequence length of the three integer inputs.
      num_classes : width of the softmax output layer (default 6, matching
                    the previous hard-coded value).

    Returns
      A compiled Keras Model: pooled output -> Dense(100, relu) ->
      Dense(100, relu) -> Dense(num_classes, softmax).

    Notes
      * The loss is categorical cross-entropy, the loss that matches a
        softmax output trained on one-hot targets. It was previously
        'binary_crossentropy', which scores every class as an independent
        yes/no decision.
      * Adam is configured through `learning_rate`; the `lr` alias is
        deprecated.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the pooled output feeds the classifier head.
    pooled_output, _ = bert_layer([input_word_ids, input_mask, segment_ids])
    out = Dense(100, activation='relu')(pooled_output)
    out = Dense(100, activation='relu')(out)
    out = Dense(num_classes, activation='softmax')(out)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(learning_rate=2e-6), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# ALBERT (English, base) module on TensorFlow Hub.
module_url = "https://tfhub.dev/tensorflow/albert_en_base/1"
# trainable=True so the hub layer's weights are updated during model.fit.
albert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Path of the SentencePiece model file shipped with the hub module.
sp_model_file = albert_layer.resolved_object.sp_model_file.asset_path.numpy()
tokenizer = tokenization.FullSentencePieceTokenizer(sp_model_file)
# Encode to (ids, masks, segments); max_len here must match the max_len passed to build_model.
train_input = albert_encode(processed_data, tokenizer, max_len=160)
# One-hot training labels.
train_labels = Y

In [None]:
# max_len=160 matches the length used when encoding the inputs with albert_encode.
model = build_model(albert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 11683584    input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


**Training with data**

In [None]:
# Fine-tune for 4 epochs, one example per gradient step (batch_size=1).
# train_input is the (ids, masks, segments) tuple returned by albert_encode.
train_history = model.fit(
    train_input, train_labels,
    epochs=4,
    batch_size=1
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


**Preprocessing Test Data and passing through the model for accuracy**

In [None]:
# Strip the marker tokens and flatten, exactly as for the training split.
test_processed_data,test_processed_label = preprocessing(test_data,test_labels)
# Join each token list into one string and collect the strings in an array.
# NOTE(review): the training sentences additionally get VADER tag words appended
# (add_Vader_Features); the test sentences do not - confirm this is intended.
test_processed_data=np.asarray([' '.join(tokens) for tokens in test_processed_data])
# One-hot encode the test labels over 6 classes.
test_Y=to_categorical(test_processed_label, num_classes=6)

In [None]:
print("Total number of utterances in the testing data : " + str(len(test_processed_data)))
print()
print("First 10 utterances : ")
# Slicing keeps this cell from raising when fewer than ten utterances exist.
for utterance in test_processed_data[:10]:
  print(utterance)
print()
# The count below comes from test_convlen, so the label says "testing"
# (it previously said "training").
print("Total number of conversations in the testing data : " + str(len(test_convlen)))

Total number of utterances in the testing data : 1624

First 10 utterances : 
what 's he going to say ? maybe we should tell him before he sees it .
when ? i was the first one up . he was still in bed . how could he have seen it ?
did you talk to him ?
no . i figured it was best to leave him alone .
he cried hard ?
you know that larry 's not coming back and i know it . so why do we go on letting him think that we believe with him ?
what do you want to do , argue with him ?
i do n't want to argue with him . but it 's time he knows that nobody else thinks that larry 's alive anymore .
why should he dream about him , walking around looking for him at night ? do we contradict him ? do we say straight out that we do n't
we have to say it to him .

Total number of conversations in the training data : 31


In [None]:
# Encode the test sentences with the same tokenizer and max_len used for training.
test_input = albert_encode(test_processed_data, tokenizer, max_len=160)
# NOTE: this rebinds `test_labels`, which until now held the raw labels loaded
# from the pickle. Re-running the test preprocessing cell after this one would
# pass the one-hot matrix to preprocessing() instead of the raw labels.
test_labels = test_Y

In [None]:
# Model outputs for every test utterance; one row of 6 softmax scores each.
y_pred=model.predict(test_input, batch_size=1)

In [None]:
# Predicted class = index of the largest score in each row.
y_pred1=np.argmax(y_pred,axis=1)
# Integer ground-truth labels as an array, for the sklearn metrics.
test_processed_label=np.asarray(test_processed_label)

accuracy=accuracy_score(test_processed_label, y_pred1)
print("Accuracy : " + str(accuracy))

weighted_f1=f1_score(test_processed_label, y_pred1, average='weighted')
print("Weighted F1-score : " + str(weighted_f1))

Accuracy : 0.6514679802955665
Weighted F1-score : 0.6521660775699538


**Enhanced Results**

In [None]:
# Precision averaged over classes, weighted by each class's support.
print("Precision : " + str(precision_score(test_processed_label, y_pred1, average='weighted')))

Precision : 0.6504026129602586


In [None]:
print("Confusion Matrix : ")
# Left as the cell's last expression so the notebook displays the array.
# Rows are true labels, columns are predicted labels.
confusion_matrix(test_processed_label,y_pred1)

Confusion Matrix : 


array([[ 52,   8,   8,   0,  12,  11],
       [  2,  95,  12,   5,   1,  42],
       [  5,  27, 169,   6,   3,  98],
       [  1,   3,  10, 183,   2,  71],
       [ 34,  23,  43,   8, 164,  58],
       [  6,  20,  36,  36,   1, 369]])

In [None]:
print("Jaccard Score : ")
# Left as the cell's last expression so the notebook displays the value.
# Support-weighted average of the per-class Jaccard scores.
jaccard_score(test_processed_label,y_pred1,average='weighted')

Jaccard Score : 


0.4676787148799757