# LSTM Model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [2]:
import pandas as pd
import numpy as np

from collections import Counter
import pickle

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam

---

# 2. Load cleaned tweets dataset

In [3]:
df = pd.read_csv('./data/cleaned_tweets.csv')

In [4]:
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...,upset can not updat facebook text might cri re...,upset can not updat facebook text might cri re...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cannot see,behav im mad can not see,behav im mad can not see


# 3. Drop text

In [5]:
df = df[['sentiment', 'Snowball_Stem']]

In [6]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


# 4. Drop rows with NaN

In [7]:
df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

---

# 5. Reduce dataframe size

In [10]:
df[df.sentiment != 0].shape

(795860, 2)

In [11]:
df[df.sentiment == 0].shape

(796094, 2)

In [12]:
# df[df.sentiment != 0][:200000].shape

In [13]:
# df[df.sentiment == 0][:200000].shape

In [14]:
# reduced_df = pd.concat([df[df.sentiment != 0][:200000], df[df.sentiment == 0][:200000]])

In [15]:
# reduced_df.shape

In [16]:
# df = reduced_df

---

# 5. Split dataset into training and test data

In [17]:
X = df['Snowball_Stem']

In [18]:
y = df['sentiment']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

- 75% training data
- 25% test data

In [20]:
X_train.shape

(1193965,)

In [21]:
y_train.shape

(1193965,)

In [22]:
X_test.shape

(397989,)

In [23]:
y_test.shape

(397989,)

---

# 6. Collection of all unique words in corpus

In [24]:
# Count of all unique words

def count_unique_words(tweets):
    unique = Counter()
    for tweet in tweets:
        for word in tweet.split():
            unique[word] += 1
    return unique


In [25]:
word_count = count_unique_words(X_train)

In [26]:
len(word_count)

179702

---

# 7. LSTM Model

## 7.1. Max number of words in a sequence

In [22]:
max_seq_length = 20

## 7.2. Create / Load tokenizer

In [28]:
# tokenizer = Tokenizer(num_words=len(word_count))

In [4]:
with open('./SavedModels/LSTM_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## 7.3. Tokenize the text

In [30]:
tokenizer.fit_on_texts(X_train)

In [31]:
word_index = tokenizer.word_index

In [32]:
# Index for each word in tokenizer
len(word_index)

203576

## 7.4. Convert training data to tokenized sequences

In [36]:
X_train[3]

'whole bodi feel itchi like fire'

In [37]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [38]:
X_train_seq[3]

[446, 1919, 303, 996, 283, 4421, 153, 4670, 4670, 33, 3798, 884]

## 7.5. Padding training sequences

In [39]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [40]:
X_train_pad[3]

array([ 446, 1919,  303,  996,  283, 4421,  153, 4670, 4670,   33, 3798,
        884,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

## 7.6. Performing tokenization and padding for test set

In [41]:
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [42]:
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding="post", truncating="post")

## 7.7. Understanding training and testing data

In [43]:
X_train_seq[0]

[240, 3204, 3204, 165, 396, 24, 532, 167]

In [44]:
X_train_pad[0]

array([ 240, 3204, 3204,  165,  396,   24,  532,  167,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [45]:
X_test_seq[0]

[25, 13, 38]

In [46]:
X_test_pad[0]

array([25, 13, 38,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0], dtype=int32)

In [47]:
X_train_pad.shape

(1193965, 20)

In [48]:
X_test_pad.shape

(397989, 20)

In [49]:
y_train.shape

(1193965,)

In [50]:
y_test.shape

(397989,)

## 7.8 Training the model

In [80]:
model = Sequential()

# Maps each word to a finite vector
model.add(Embedding(len(word_count), 20, input_length=max_seq_length))

model.add(LSTM(32, dropout=0.1))

model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=0.0003)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])


In [81]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 20)            3594040   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                6784      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 3,600,857
Trainable params: 3,600,857
Non-trainable params: 0
_________________________________________________________________


In [82]:
history = model.fit(X_train_pad, y_train, epochs=1, validation_data=(X_test_pad, y_test))



---

# 8. Evaluating model

## 8.1. Load Model

In [6]:
model = keras.models.load_model('./SavedModels/LSTM_train_75_val_78_test_79_acc.h5')

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 20)            3586000   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6784      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 3,592,817
Trainable params: 3,592,817
Non-trainable params: 0
_________________________________________________________________


## 8.2. Create test dataset

In [8]:
test_df = pd.read_csv('./data/cleaned_tweets.csv')

In [9]:
# test_df = pd.concat([test_df[test_df.sentiment != 0][:100000], test_df[test_df.sentiment == 0][:100000]])

In [10]:
test_df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...,upset can not updat facebook text might cri re...,upset can not updat facebook text might cri re...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cannot see,behav im mad can not see,behav im mad can not see


In [11]:
test_df = test_df[['sentiment', 'Snowball_Stem']]

In [12]:
test_df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


## 8.3. Drop rows with NaN

In [13]:
test_df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [14]:
test_df = test_df.dropna()

In [15]:
test_df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

## 8.4. Tokenization and padding

In [16]:
test_tweet = test_df['Snowball_Stem']

In [17]:
test_label = test_df['sentiment']

In [18]:
test_tweet.head()

0          aww bummer shoulda got david carr third day
1    upset can not updat facebook text might cri re...
2         dive mani time ball manag save rest go bound
3                      whole bodi feel itchi like fire
4                             behav im mad can not see
Name: Snowball_Stem, dtype: object

In [19]:
test_label.head()

0    0
1    0
2    0
3    0
4    0
Name: sentiment, dtype: int64

In [20]:
test_tweet_seq = tokenizer.texts_to_sequences(test_tweet)

In [23]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [24]:
test_tweet_pad[0]

array([  98, 1014, 3008,   10,  696, 6269, 1600,    3,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

## 8.5. Evaluate

In [25]:
scores = model.evaluate(test_tweet_pad, test_label)



In [26]:
scores

[0.44509533047676086, 0.791164219379425]

In [27]:
loss, accuracy = scores

In [28]:
print("Loss on test set:", loss)
print("Accuracy achieve on test set:", accuracy)

Loss on test set: 0.44509533047676086
Accuracy achieve on test set: 0.791164219379425


---

# 9. Save model and tokenizer

In [29]:
model.save("./SavedModels/LSTM_train_75_val_78_test_79_acc.h5")

In [30]:
with open('./SavedModels/LSTM_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)