In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense, LSTM, Bidirectional
from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cells read their
# input files from paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training split. Each non-empty line is expected to look like
#   <prefix>"<label>"<text>
# i.e. the label sits between the first two double quotes and the example text
# is everything after the second quote.
# The file contains Bengali text, so decode it explicitly as UTF-8 instead of
# relying on the platform default, and use a context manager so the handle is
# closed even if reading fails.
with open('/content/drive/My Drive/Sample Data/train.txt', 'r', encoding='utf-8') as fl:
  train_txt = fl.read().split('\n')

# Split every line into its training label and training example.
train_lbl = []
train_ex = []
for line_no, line in enumerate(train_txt, start=1):
  # split('\n') leaves an empty string after a trailing newline; indexing into
  # it would raise IndexError, so blank lines are skipped.
  if not line.strip():
    continue
  word = line.split('"', 2)
  # A usable line yields three parts. With fewer, the label would either be
  # missing or be appended without an example, misaligning the two lists.
  if len(word) < 3:
    raise ValueError(
      f"train.txt line {line_no}: expected a quoted label followed by text, got {line!r}"
    )
  train_lbl.append(word[1])
  train_ex.append(word[2])
train_len = len(train_lbl)

# Map each emotion label to its class index, i.e. the column that is set to 1
# in the one-hot label vector.
lbl_dict = {
  'বিতৃষ্ণা': 0,
  'বিষণ্ণতা': 1,
  'বিস্ময়': 2,
  'ভয়': 3, 
  'রাগ': 4, 
  'সুখী': 5
}

# Not used in this cell; kept in case other cells rely on the name.
import random

# One-hot encode the labels: train_lbl_int[i, k] == 1 exactly when example i
# carries the label whose index is k. A label missing from lbl_dict raises
# KeyError here.
train_cls = np.array([lbl_dict[lbl] for lbl in train_lbl], dtype=int)
train_lbl_int = np.zeros((train_len, len(lbl_dict)), dtype=int)
train_lbl_int[np.arange(train_len), train_cls] = 1



In [None]:
# Load the dev split, used below both as validation data and for the final
# metrics. Each non-empty line is expected to look like
#   <prefix>"<label>"<text>
# The file contains Bengali text, so decode it explicitly as UTF-8, and use a
# context manager so the handle is closed even if reading fails.
with open('/content/drive/My Drive/Sample Data/dev.txt', 'r', encoding='utf-8') as fl:
  test_txt = fl.read().split('\n')

# Split every line into its testing label and testing example.
test_lbl = []
test_ex = []
for line_no, line in enumerate(test_txt, start=1):
  # split('\n') leaves an empty string after a trailing newline; indexing into
  # it would raise IndexError, so blank lines are skipped.
  if not line.strip():
    continue
  word = line.split('"', 2)
  # A usable line yields three parts. With fewer, the label would either be
  # missing or be appended without an example, misaligning the two lists.
  if len(word) < 3:
    raise ValueError(
      f"dev.txt line {line_no}: expected a quoted label followed by text, got {line!r}"
    )
  test_lbl.append(word[1])
  test_ex.append(word[2])
test_len = len(test_lbl)

# One-hot encode the gold labels with the label -> index mapping lbl_dict:
# test_lbl_int[i, k] == 1 exactly when example i carries the label whose index
# is k. A label missing from lbl_dict raises KeyError here.
test_cls = np.array([lbl_dict[lbl] for lbl in test_lbl], dtype=int)
test_lbl_int = np.zeros((test_len, len(lbl_dict)), dtype=int)
test_lbl_int[np.arange(test_len), test_cls] = 1


In [None]:
# Build the vocabulary from the training examples only; the dev examples are
# then encoded with that same fitted tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_ex)

# Convert the training examples, then the dev examples, into sequences of
# integer word ids.
dense_train_ex, dense_test_ex = (
  tokenizer.texts_to_sequences(corpus) for corpus in (train_ex, test_ex)
)


In [None]:
# Length statistics of the encoded training examples, plus the largest word id
# occurring in them (MAX_VOCAB, which sizes the embedding layer's input_dim).
COUNT_TRAIN = len(dense_train_ex)
SUM_TRAIN = sum(len(seq) for seq in dense_train_ex)
MAX_VOCAB = max([0, *(tok for seq in dense_train_ex for tok in seq)])
AVRGE_TRAIN = SUM_TRAIN // COUNT_TRAIN

# Pad the training sequences at the end so they all have the same length:
# the average training length plus a margin of 10 tokens.
# NOTE(review): sequences longer than maxlen get truncated by pad_sequences;
# confirm that which end is dropped is acceptable for this data.
padded_train_ex = pad_sequences(dense_train_ex, maxlen=AVRGE_TRAIN+10, padding='post')

# Same statistics for the dev examples. AVRGE_TEST is computed but not used
# for padding: the dev set is padded to the training-derived length so both
# arrays have the same width.
COUNT_TEST = len(dense_test_ex)
SUM_TEST = sum(map(len, dense_test_ex))
AVRGE_TEST = SUM_TEST // COUNT_TEST
padded_test_ex = pad_sequences(dense_test_ex, maxlen=AVRGE_TRAIN+10, padding='post')



In [None]:
# Model Declaration
model = Sequential()

# Embedding layer: maps every word id (0 .. MAX_VOCAB) to a 20-dimensional
# vector; inputs are the padded sequences of length AVRGE_TRAIN+10.
embedding_layer = Embedding(input_dim=MAX_VOCAB+1, output_dim=20, input_length=AVRGE_TRAIN+10)
model.add(embedding_layer)

# LSTM encoder: return_sequences=False, so only the final output vector of the
# sequence is passed on to the dense layers.
model.add(LSTM(units=128,return_sequences=False))

# Hidden dense layers
model.add(Dense(units=50, activation='swish'))
model.add(Dense(units=20, activation='swish'))

# Output layer: one unit per emotion class. softmax turns the six scores into
# a probability distribution, which is what one-hot targets call for; the
# previous 'swish' activation produced unbounded values that are neither
# comparable to 0/1 targets nor interpretable as class probabilities.
model.add(Dense(units=6, activation='softmax'))



In [None]:
# Configure training: RMSprop optimizer with default settings, mean-squared
# error loss, accuracy reported as the metric.
# NOTE(review): mse against one-hot targets trains, but categorical
# cross-entropy is the conventional loss when the output layer emits class
# probabilities - confirm whether mse is intentional.
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='mse', metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()


Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 27, 20)            345240    
_________________________________________________________________
lstm_19 (LSTM)               (None, 128)               76288     
_________________________________________________________________
dense_57 (Dense)             (None, 50)                6450      
_________________________________________________________________
dense_58 (Dense)             (None, 20)                1020      
_________________________________________________________________
dense_59 (Dense)             (None, 6)                 126       
Total params: 429,124
Trainable params: 429,124
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Fit on the padded training sequences and their one-hot labels for 20 epochs,
# evaluating on the dev split after every epoch.
model.fit(
  x=padded_train_ex,
  y=train_lbl_int,
  validation_data=(padded_test_ex, test_lbl_int),
  epochs=20,
  verbose=1,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f91f4294f90>

In [None]:
# Collect the model's per-class scores for every dev example:
# one row per example, one column per class.
prediction = model.predict(padded_test_ex)

# Turn the scores into hard 0/1 decisions by marking only the highest-scoring
# class of each example. The previous astype(int) truncated every float score
# toward zero, so any score below 1 became 0 and almost no class was ever
# marked as predicted.
n_classes = prediction.shape[1]
predicted_class = np.argmax(prediction, axis=1)
prediction = np.eye(n_classes, dtype=int)[predicted_class]

# Flatten row by row into a single 0/1 vector, the layout the metric cells
# compare against the flattened gold labels.
prediction = prediction.reshape(-1)

# Gold label reshaping: flatten the one-hot matrix the same way so both
# vectors line up entry for entry.
test_lbl_int = np.asarray(test_lbl_int).reshape(-1)


In [None]:
# Calculating Accuracy
# prediction and test_lbl_int arrive here flattened to one 0/1 entry per
# (example, class) pair, so this is element-wise agreement between the two
# vectors rather than per-example accuracy; the many 0 entries that match
# push the figure up.
from sklearn.metrics import accuracy_score
# scikit-learn expects the gold labels first and the predictions second; the
# arguments were previously passed the other way round. Keyword arguments make
# the roles explicit.
acc = accuracy_score(y_true=test_lbl_int, y_pred=prediction)
print("Accuracy = ",round(acc*100),"%")

Accuracy =  85 %


In [None]:
from sklearn.metrics import f1_score
# F1 over the flattened 0/1 vectors, scoring the 1-entries (the marked class
# of each example) with f1_score's default averaging.
# scikit-learn expects the gold labels first and the predictions second; the
# arguments were previously passed the other way round. Keyword arguments make
# the roles explicit.
f1score=f1_score(y_true=test_lbl_int, y_pred=prediction)
print("f1 score = ",round(f1score*100),"%")

f1 score =  37 %
