### Template for NLP project

The aim of the project is to achieve the following:
 - Train a neural network that is **at least better than random guessing** on your dataset. The template contains the IMDB dataset for sentiment analysis, however, you can choose any other language related data set with the appropriate NLP task.
 - Investigate different neural network architectures (different hyperparameters, different layers, different pre-processing). Explain in the presentation, why the final network was selected! **Do not rely on black-box mechanisms.**
 

In [3]:
# tensorflow modules
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LayerNormalization, LSTM
import tensorflow

# if you have installed a different version, replace 'r2.6'  with your version in links provided below
print(tensorflow.__version__)

2.9.0


In [4]:
# load imdb dataset
# links to dataset
# original dataset: https://ai.stanford.edu/~amaas/data/sentiment/
# version in tensorflow: https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/datasets/imdb

# select your vocabulary size
vocabularySize = 25000
# load data (it is already pre-processed)
# optional: add other pre.processing steps like stopword removal
(xTrain, yTrain), (xTest, yTest) = imdb.load_data(num_words=vocabularySize)
print('Loaded dataset with {} training samples, {} test samples'.format(len(xTrain), len(xTest)))

# look at the respective words
word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in xTrain[123]])

Loaded dataset with 25000 training samples, 25000 test samples
---review with words---
['the', 'version', 'to', 'date', 'on', 'list', 'draw', 'him', 'critical', 'very', 'love', 'to', 'by', 'br', 'of', 'its', 'tony', 'characters', 'was', 'one', 'life', 'this', 'is', 'go', 'was', 'best', 'least', 'should', 'so', 'done', 'result', 'no', 'was', 'with', 'this', 'understood', 'only', 'war', "couldn't", 'that', 'her', 'get', 'would', 'johnny', 'we', 'in', 'tighter', 'are', 'to', 'business', 'that', 'her', 'because', 'story', 'use', 'movies']


In [8]:
# Import module
from nrclex import NRCLex

# Get all Words with 'no emotes'
noEmoteWords = dict()
for word in id2word:
    emotion = NRCLex(id2word[word])
    if len(emotion.affect_list) == 0:
        noEmoteWords[word]=id2word[word]

print('All words in IMDB dataset:', len(id2word))
print('Words without emotes in IMDB dataset:', len(noEmoteWords))
print('Example - \'love\':', NRCLex('love').affect_list)
print('Example - \'not\':', NRCLex('not').affect_list)

All words in IMDB dataset: 88584
Words without emotes in IMDB dataset: 81397
Example - 'love': ['joy', 'positive']
Example - 'not': []


In [63]:
# Delete all 'No Emote Words' from sentence
def NullNoEmoteWords(X, noEmoteWords):
    for idx in range(len(X)):
        if X[idx] in noEmoteWords:
            X[idx] = 0
    X = [s for s in X if s != 0]
    return X

# Delete all 'No Emote Words' form data
def OnlyEmoteWords(X, noEmoteWords):
    for idx in range(len(X)):
        X[idx] = NullNoEmoteWords(X[idx], noEmoteWords)
    return X

# Delete 'No Emote Words'
xTrain = OnlyEmoteWords(xTrain, noEmoteWords.keys())
xTest = OnlyEmoteWords(xTest, noEmoteWords.keys())

In [64]:
# select maximum number of words as input lengt
# pad or truncated (this is done automatically) your data
maxWords = 200
xTrain = sequence.pad_sequences(xTrain, maxlen=maxWords)
xTest = sequence.pad_sequences(xTest, maxlen=maxWords)

In [65]:
## setup the neural network architecture
model=Sequential()
 
embeddingSize = 4
model.add(Embedding(vocabularySize, embeddingSize, embeddings_initializer='HeNormal', input_length=maxWords))

# add layers: 

# model.add(SimpleRNN(100))
model.add(LSTM(4))
model.add(LayerNormalization())

# add layer for output
model.add(Dense(1, activation='sigmoid'))

# print model and check number of parameters
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 200, 4)            100000    
                                                                 
 lstm_7 (LSTM)               (None, 4)                 144       
                                                                 
 layer_normalization_7 (Laye  (None, 4)                8         
 rNormalization)                                                 
                                                                 
 dense_7 (Dense)             (None, 1)                 5         
                                                                 
Total params: 100,157
Trainable params: 100,157
Non-trainable params: 0
_________________________________________________________________
None


In [66]:
# set parameters for network training
batchSize = 64
numEpochs = 10

# train your model
model.compile(loss='binary_crossentropy',  optimizer='adam', metrics=['accuracy'])
xValid, yValid = xTrain[:batchSize], yTrain[:batchSize]
xTrain2, yTrain2 = xTrain[batchSize:], yTrain[batchSize:]
hist = model.fit(xTrain2, yTrain2, validation_data=(xValid, yValid), batch_size=batchSize, epochs=numEpochs)

# check result
scores = model.evaluate(xTest, yTest, verbose=0)
print('Test accuracy:', scores[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.7497199773788452


In [None]:
#model.save('FinalResult')