In [201]:
import numpy as np
import nltk
from nltk.stem.lancaster import LancasterStemmer
import json
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import csv
import pandas as pd
import re

In [4]:
def sigmoid(x):
    return (1/(1+np.exp(-x)))

print(sigmoid(np.array([1,2,22,4])))

[0.73105858 0.88079708 1.         0.98201379]


In [8]:
def softmax(x):
    exp_x = np.exp(x)
    return exp_x/exp_x.sum(axis=0)

print(softmax(np.array([2,3,4,5])))

[0.0320586  0.08714432 0.23688282 0.64391426]


In [202]:
def json_to_data(jsonfile, csvfile):
    file = open(jsonfile)
    data = json.load(file)
    
    f = csv.writer(open(csvfile, 'w'))
    f.writerow(["emotion","utterance"])
    
    for item in data:
        for line in item:
            f.writerow([line['emotion'],line['utterance']])
       

In [203]:
json_to_data('Project/DataSets/friends_train.json', 'Project/DataSets/textData.csv')

In [259]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
regex = re.compile('[^a-zA-Z\' ]')

def cleaner(string):
    #string = regex.sub('',string)
    string = string.translate({ord(i): None for i in '!.,&%#@?;\\'})
    string = string.replace('-', ' ')
    try:
        w = string.lower().split()
    except:
        return "NAW"
    w = [p.lower() for p in w]
    
    #w = [lemmatizer.lemmatize(word) for word in w]
    string = " ".join(w)
    return string


In [260]:
def prepare_data(csvfile):
    file = pd.read_csv(csvfile)
    cols = ['emotion', 'utterance']
    file = file[cols]
    file['emotion_id'] = file['emotion'].factorize()[0]
    file['utterance'] = [cleaner(sent) for sent in file['utterance']]
    file = file[pd.notnull(file['utterance'])]
    return file['utterance'], file['emotion_id']

In [261]:
X,Y = prepare_data('Project/DataSets/textData.csv')
print(X, Y)


0        also i was the point person on my companys tr...
1                          you mustve had your hands full
2                                    that i did that i did
3             so lets talk a little bit about your duties
4                                      my duties all right
5        now youll be heading a whole division so you...
6                                                    i see
7        but therell be perhaps 30 people under you so...
8                                             good to know
9                                    we can go into detail
10                                   no dont i beg of you
11       all right then well have a definite answer fo...
12                                                  really
13                  absolutely you can relax you did great
14       but then who the waitress i went out with last...
15                                      you know forget it
16           no no no no no who who were you talking abo

In [262]:
print(zip(X, Y))
f = csv.writer(open("Project/DataSets/ProcessedData.csv", 'w'))
f.writerow(["emotion","utterance"])
    
for item in zip(Y,X):
    f.writerow(item)

<zip object at 0x7f5ccc849548>


In [263]:
def read_glove_vecs(embed):
    with open(embed, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            word = line[0]
            words.add(word)
            word_to_vec_map[word] = np.array(line[1:], dtype=np.float64)
            
        i=1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i+=1
        return words_to_index, index_to_words, word_to_vec_map
            

In [264]:
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('Project/DataSets/glove.6B.50d.txt')

In [265]:
maxi = 0
for p in X:
    w = p.lower().split()
    j=0
    for i in w:
        try:
            a = word_to_vec_map[i]
        except KeyError:
            continue
        j+=1
    maxi = max(j,maxi)

print(maxi)

66


In [277]:
def get_X_mat(X):
    Xmat = np.zeros((X.shape[0],50,50))
    for i in range(X.shape[0]):
        try:
            words = X[i].lower().split()
        except:
            continue
        j=0
        for word in words:
            try:
                Xmat[i,j,:] = word_to_vec_map[word]
            except KeyError:
                continue
            j+=1
            if j>49:
                break
    return Xmat  
    

In [278]:
Xmat = get_X_mat(X)
print(Xmat[0].shape)

(50, 50)


In [316]:
from keras.models import Model
from keras.layers import Dense, RNN, Input, Dropout, LSTM, Activation
from keras.utils import to_categorical
from keras.optimizers import Adam

In [279]:
Ymat = to_categorical(Y)
print(Ymat.shape)


(10561, 8)


In [307]:
def RNN_Model_Keras(input_shape):
    inputs = Input(shape=(input_shape), dtype=np.float32)
    X = LSTM(64, return_sequences=True)(inputs)
    X = LSTM(128)(X)
    X = Dense(8, activation='softmax')(X)
    model = Model(inputs=inputs, outputs=X)
    return model

In [308]:
model = RNN_Model_Keras((50,50,))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 50, 50)            0         
_________________________________________________________________
lstm_32 (LSTM)               (None, 50, 64)            29440     
_________________________________________________________________
lstm_33 (LSTM)               (None, 128)               98816     
_________________________________________________________________
dense_25 (Dense)             (None, 8)                 1032      
Total params: 129,288
Trainable params: 129,288
Non-trainable params: 0
_________________________________________________________________


In [328]:
ad = Adam(lr=0.005)
model.compile(loss='categorical_crossentropy', optimizer=ad, metrics=['accuracy'])

In [None]:
model.fit(Xmat, Ymat, epochs = 25, batch_size = 5, shuffle=True)

Epoch 1/25
  830/10561 [=>............................] - ETA: 2:45 - loss: nan - acc: 0.4819

In [184]:
print(model.predict(Xmat))   

[[0.44130793 0.10453085 0.01645049 ... 0.0305404  0.04619681 0.02420069]
 [0.48662308 0.13667335 0.01035829 ... 0.01666061 0.03554733 0.01365443]
 [0.45319232 0.20871767 0.01034796 ... 0.0116654  0.03708338 0.01165923]
 ...
 [0.42775327 0.09724195 0.0181008  ... 0.0355427  0.04894298 0.02791518]
 [0.47427708 0.17781249 0.00969653 ... 0.01306018 0.03510025 0.01210626]
 [0.46434876 0.12244167 0.01315355 ... 0.0222768  0.04091574 0.0179424 ]]


In [299]:
dev_file = pd.read_csv('Project/DataSets/CompProcessedDataDev.csv')
cols = ['emotion', 'utterance']
dev_file = dev_file[pd.notnull(dev_file['utterance'])]
X_test = dev_file['utterance']
Y_test = dev_file['emotion']

In [300]:
print(type(X[0]))
Xmat_test = get_X_mat(X_test)
print(Xmat_test.shape)
Ymat_test = to_categorical(Y_test)
print(Ymat_test.shape)

<class 'str'>
(972, 50, 50)
(972, 8)


In [306]:
loss, acc = model.evaluate(Xmat_test, Ymat_test)
print(loss)
print(acc)

3.3514347164719194
0.33436214004033876
