# Sentiment Analysis Dense vs. LSTM
by Stefanie Müller

Data Source: https://www.kaggle.com/kazanova/sentiment140

Data description:
1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
2. ids: the id of the tweet (e.g. 2087)
3. date: the date of the tweet (e.g. Sat May 16 23:58:44 UTC 2009)
4. flag: the query (e.g. lyx). If there is no query, this value is NO_QUERY.
5. user: the user who tweeted (e.g. robotickilldozr)
6. text: the text of the tweet (e.g. Lyx is cool)

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

In [2]:
# The Sentiment140 CSV is read without a header row (header=None), so the six
# column labels are attached right after reading.
data = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    sep=',',
    encoding="ISO-8859-1",
    header=None,
).set_axis(["Sentiment", "ID", "Date", "Query", "User", "Text"], axis=1)
data

Unnamed: 0,Sentiment,ID,Date,Query,User,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [3]:
# Column dtypes as inferred by pandas when the CSV was read.
data.dtypes

Sentiment     int64
ID            int64
Date         object
Query        object
User         object
Text         object
dtype: object

In [4]:
# Class distribution of the raw labels: only 0 (negative) and 4 (positive) occur,
# i.e. this file contains no neutral (2) tweets.
data["Sentiment"].value_counts()

0    800000
4    800000
Name: Sentiment, dtype: int64

In [5]:
# Features are the raw tweet texts; labels are rescaled by 1/4 so that
# 0 stays 0.0 (negative) and 4 becomes 1.0 (positive).
x = data["Text"].to_numpy()
y = data["Sentiment"].to_numpy() / 4

# Sanity check: distinct label values and how often each occurs.
print(np.unique(y, return_counts=True))

(array([0., 1.]), array([800000, 800000], dtype=int64))


In [6]:
#Train-Test-Split
# Stratified 80/20 split. The fixed random_state makes the split, and therefore every
# metric computed further down, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

# Sanity check: stratification should keep both classes equally represented.
print(np.unique(y_train, return_counts = True))

(array([0., 1.]), array([640000, 640000], dtype=int64))


In [7]:
#Specifying values
max_length = 25 # number of token ids per tweet after padding; also the Embedding input_length
sequence_length = 80 # NOTE(review): not used by any executed code in this view — confirm whether it is still needed
vocab_size = 50000 # num_words of the Tokenizer and input dimension of the Embedding layer
embedding_dim = 16 # output dimension of the Embedding layer

In [8]:
# Word-level tokenizer: lower-cases, splits on spaces and strips the characters listed in
# `filters`. The apostrophe is not in that list, so not all punctuation is removed.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    filters='"!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', #blacklist
    lower=True, split=' ',
    char_level=False,
    # The OOV token is a vocabulary *entry* (it is assigned word index 1), not an index.
    # A string placeholder makes that explicit; the former integer 0 read like the padding id
    # and cannot be joined back into text by sequences_to_texts.
    oov_token="<OOV>",
    document_count=0,
    )

In [9]:
#Updating internal vocabulary
# Fitted on the training texts only, so the vocabulary is built without looking at x_test.
tokenizer.fit_on_texts(x_train)

In [10]:
#Amount of different words
# NOTE(review): the printed count is far larger than vocab_size — presumably word_index keeps
# every token seen while num_words only limits the ids emitted by texts_to_sequences; confirm.
print(len(tokenizer.word_index))

594368


In [11]:
# Turn each tweet into a list of integer word ids using the fitted tokenizer
# (training split first, then the test split).
encoded_docs_xtrain, encoded_docs_xtest = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

# Bring every id sequence to exactly max_length entries; padding='post' puts the
# padding after the tokens.
padded_xtrain, padded_xtest = (
    keras.preprocessing.sequence.pad_sequences(ids, maxlen=max_length, padding='post')
    for ids in (encoded_docs_xtrain, encoded_docs_xtest)
)

In [12]:
#Deep Learning Models

def dense_model(model):
    """Append the dense head to `model` and return it.

    Adds a GlobalAveragePooling1D layer followed by a 10-unit ReLU Dense layer.
    The model is extended in place via `model.add`; the same object is returned.
    """
    head_layers = (
        keras.layers.GlobalAveragePooling1D(),
        keras.layers.Dense(10, activation='relu'),
    )
    for layer in head_layers:
        model.add(layer)
    return model

def lstm_model(model):
    """Append the recurrent head to `model` and return it.

    Adds two stacked 100-unit tanh LSTM layers followed by a 7-unit ReLU Dense layer.
    The model is extended in place via `model.add`; the same object is returned.
    """
    recurrent_layers = [
        # First LSTM returns the full sequence because another LSTM consumes it.
        keras.layers.LSTM(
            100,
            activation='tanh',
            dropout=0.1,
            recurrent_dropout=0.1,
            return_sequences=True,
        ),
        # Second LSTM returns only its final output (3-D -> 2-D), so no pooling layer is needed.
        keras.layers.LSTM(100, activation='tanh'),
        keras.layers.Dense(7, activation='relu'),
    ]
    for layer in recurrent_layers:
        model.add(layer)
    return model

def prepare_model(type):
    """Build, summarise and compile a binary sentiment classifier.

    The network is Embedding -> architecture-specific layers -> Dense(1, sigmoid),
    compiled with Adam, binary cross-entropy and the accuracy metric. The model
    summary is printed as a side effect.

    Args:
        type: 'dense' (uses dense_model) or 'lstm' (uses lstm_model). The parameter
            name shadows the builtin but is kept so existing callers keep working.

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: if `type` is neither 'dense' nor 'lstm'.
    """
    # Fail fast, before anything is built: an unknown type used to fall through both
    # branches and silently yield a model with only the Embedding and the output layer.
    if type not in ('dense', 'lstm'):
        raise ValueError(f"Unknown model type {type!r}; expected 'dense' or 'lstm'.")

    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

    if type == 'dense':
        model = dense_model(model)
    else:
        model = lstm_model(model)

    # A single sigmoid unit produces the score for the binary target.
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def fit_model(model, epochs=10, batch_size=4000, verbose=True):
    """Train `model` on the padded training split and return the result of `model.fit`.

    Reads the notebook-level arrays padded_xtrain / y_train for training and passes
    (padded_xtest, y_test) as validation_data, so the per-epoch validation metrics
    are computed on the test split.

    Args:
        model: compiled model exposing `.fit`.
        epochs: number of training epochs.
        batch_size: samples per gradient update.
        verbose: forwarded to `model.fit`.
    """
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
        validation_data=(padded_xtest, y_test),
    )
    return model.fit(padded_xtrain, y_train, **fit_options)

In [13]:
# Report whether TensorFlow can see at least one GPU.
if tf.config.list_physical_devices('GPU'):
    print("Training on GPU...")
else:
    print("Training on CPU...")

Training on GPU...


In [14]:
#Dense model preparation and fit
# Builds the 'dense' variant and trains it for 10 epochs with a batch size of 4000.
model_dense = prepare_model('dense')
history_dense = fit_model(model_dense, epochs=10, batch_size=4000)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 16)            800000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                170       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 800,181
Trainable params: 800,181
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
#LSTM model preparation and fit
# NOTE(review): trained for a single epoch (batch size 1000) whereas the dense model is
# trained for 10 epochs, so the two accuracies are not a like-for-like comparison.
model_lstm = prepare_model('lstm')
history_lstm = fit_model(model_lstm, epochs=1, batch_size=1000)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 16)            800000    
_________________________________________________________________
lstm (LSTM)                  (None, 25, 100)           46800     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 707       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 8         
Total params: 927,915
Trainable params: 927,915
Non-trainable params: 0
_________________________________________________________________


In [16]:
#Dense accuracy
# The model is compiled with one metric, so evaluate() yields (loss, accuracy).
# The loss gets a real name instead of `_`, which IPython uses for the previous cell output.
loss_dense, accuracy_dense = model_dense.evaluate(padded_xtest, y_test, verbose=0)
print(accuracy_dense)

0.804756224155426


In [17]:
#LSTM accuracy
# The model is compiled with one metric, so evaluate() yields (loss, accuracy).
# The loss gets a real name instead of `_`, which IPython uses for the previous cell output.
loss_lstm, accuracy_lstm = model_lstm.evaluate(padded_xtest, y_test, verbose=0)
print(accuracy_lstm)

0.8097187280654907


A comparison with other notebooks on Kaggle suggests that accuracy on this dataset tops out at around 80%.

### Single Sentences Tests

In [18]:
#Positive Sentiment correctly identified:
x_pred = ["Best tweet ever!"]
encoded_docs_pred = tokenizer.texts_to_sequences(x_pred)

padded_docs_pred = keras.preprocessing.sequence.pad_sequences(
    encoded_docs_pred,
    maxlen=max_length,
    padding='post')

# Only the non-default argument is passed: the former max_queue_size / workers /
# use_multiprocessing arguments merely restated defaults and were removed from predict() in Keras 3.
y_pred_dense = model_dense.predict(padded_docs_pred, verbose=0)
y_pred_lstm = model_lstm.predict(padded_docs_pred, verbose=0)

# The sigmoid output is the predicted probability of the positive class (label 1), not an accuracy.
print(f'P(positive) Dense: {y_pred_dense} vs. LSTM: {y_pred_lstm}')

Accuracy Dense: [[0.94381714]] vs. LSTM: [[0.94938016]]


In [19]:
#Negative Sentiment correctly identified:
x_pred = ["Worst tweet ever!"]
encoded_docs_pred = tokenizer.texts_to_sequences(x_pred)

padded_docs_pred = keras.preprocessing.sequence.pad_sequences(
    encoded_docs_pred,
    maxlen=max_length,
    padding='post')

# Only the non-default argument is passed: the former max_queue_size / workers /
# use_multiprocessing arguments merely restated defaults and were removed from predict() in Keras 3.
y_pred_dense = model_dense.predict(padded_docs_pred, verbose=0)
y_pred_lstm = model_lstm.predict(padded_docs_pred, verbose=0)

# The sigmoid output is the predicted probability of the positive class (label 1), not an accuracy.
print(f'P(positive) Dense: {y_pred_dense} vs. LSTM: {y_pred_lstm}')

Accuracy Dense: [[0.10881341]] vs. LSTM: [[0.05100768]]


In [20]:
#Here the sentiment analysis fails: both models score this negative sentence as positive.
#The author's explanation is that the word "least" is treated as more positive than negative.
#NOTE(review): that explanation is not verified in this notebook; "favorite" may equally dominate the score.
x_pred = ["My least favorite."]
encoded_docs_pred = tokenizer.texts_to_sequences(x_pred)

padded_docs_pred = keras.preprocessing.sequence.pad_sequences(
    encoded_docs_pred,
    maxlen=max_length,
    padding='post')

# Only the non-default argument is passed: the former max_queue_size / workers /
# use_multiprocessing arguments merely restated defaults and were removed from predict() in Keras 3.
y_pred_dense = model_dense.predict(padded_docs_pred, verbose=0)
y_pred_lstm = model_lstm.predict(padded_docs_pred, verbose=0)

# The sigmoid output is the predicted probability of the positive class (label 1), not an accuracy.
print(f'P(positive) Dense: {y_pred_dense} vs. LSTM: {y_pred_lstm}')

Accuracy Dense: [[0.83951706]] vs. LSTM: [[0.7635558]]


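As we can see, the LSTM network performs slightly better (about 81.0% vs. 80.5% test accuracy). This is plausible because an LSTM uses long/short-term memory: it reads the tokens in order and can carry context along the sequence, whereas the dense model only averages the word embeddings. Note, however, that the LSTM was trained for a single epoch and the dense model for ten, so the comparison is not like-for-like.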