# LSTM Model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [2]:
import pandas as pd
import numpy as np

from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.metrics import precision_recall_fscore_support

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import constant
from keras.optimizers import Adam

import gensim

---

# 2. Load cleaned tweets dataset

In [3]:
df = pd.read_csv('./data/cleaned_tweets.csv')

In [4]:
df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...,upset can not updat facebook text might cri re...,upset can not updat facebook text might cri re...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cannot see,behav im mad can not see,behav im mad can not see


# 3. Drop text

In [5]:
df = df[['sentiment', 'Snowball_Stem']]

In [6]:
df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


# 4. Drop rows with NaN

In [7]:
df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [8]:
df = df.dropna()

In [9]:
df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

---

# 5. Split dataset into training and test data

In [10]:
X = df['Snowball_Stem']

In [11]:
y = df['sentiment']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

- 80% training data
- 20% test data

In [17]:
X_train.shape

(1273563,)

In [18]:
y_train.shape

(1273563,)

In [19]:
X_test.shape

(318391,)

In [20]:
y_test.shape

(318391,)

---

# 6. Collection of all unique words in corpus

In [18]:
# Count of all unique words

def count_unique_words(tweets):
    unique = Counter()
    for tweet in tweets:
        for word in tweet.split():
            unique[word] += 1
    return unique


In [19]:
word_count = count_unique_words(X_train)

In [20]:
# Finding Vocabulary size
len(word_count)

186771

---

# 7. LSTM Model

## 7.1. Max number of words in a sequence

In [35]:
max_seq_length = 20

## 7.2. Create / Load tokenizer

In [36]:
# tokenizer = Tokenizer(num_words=len(word_count))

In [22]:
with open('./SavedModels/BLSTM_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## 7.3. Tokenize the text

In [24]:
tokenizer.fit_on_texts(X_train)

In [25]:
word_index = tokenizer.word_index

In [26]:
# Index for each word in tokenizer
len(word_index)

205208

## 7.4. Convert training data to tokenized sequences

In [27]:
X_train[3]

'whole bodi feel itchi like fire'

In [28]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [29]:
X_train_seq[3]

[215, 6516, 323, 43, 2, 1144, 2019, 24001, 606, 3, 79, 76, 549]

## 7.5. Padding training sequences

In [30]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [31]:
X_train_pad[3]

array([  215,  6516,   323,    43,     2,  1144,  2019, 24001,   606,
           3,    79,    76,   549,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

## 7.6. Performing tokenization and padding for test set

In [32]:
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [33]:
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding="post", truncating="post")

## 7.7. Understanding training and testing data

In [34]:
X_train_seq[0]

[48,
 287,
 1034,
 544,
 985,
 2194,
 978,
 985,
 563,
 543,
 755,
 469,
 37,
 22,
 63,
 1,
 4865]

In [35]:
X_train_pad[0]

array([  48,  287, 1034,  544,  985, 2194,  978,  985,  563,  543,  755,
        469,   37,   22,   63,    1, 4865,    0,    0,    0], dtype=int32)

In [36]:
X_test_seq[0]

[17005, 1608, 884, 312, 57, 1587, 375, 37, 1046, 16, 875, 63, 2273]

In [37]:
X_test_pad[0]

array([17005,  1608,   884,   312,    57,  1587,   375,    37,  1046,
          16,   875,    63,  2273,     0,     0,     0,     0,     0,
           0,     0], dtype=int32)

In [38]:
X_train_pad.shape

(1273563, 20)

In [39]:
X_test_pad.shape

(318391, 20)

In [40]:
y_train.shape

(1273563,)

In [41]:
y_test.shape

(318391,)

## 7.8 Training the model

In [None]:
model = Sequential()

# Maps each word to a finite vector
model.add(Embedding(len(word_count), 20, input_length=max_seq_length))

model.add(LSTM(32, dropout=0.1))

model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=0.0003)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])


In [None]:
model.summary()

In [None]:
history = model.fit(X_train_pad, y_train, epochs=1, validation_data=(X_test_pad, y_test))

# 7.9. Training the model (GloVe)

In [44]:
embeddings_index = dict()
f = open('./SavedModels/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [45]:
embedding_matrix = np.zeros((len(word_index)+1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [47]:
model = Sequential()

# Maps each word to a finite vector

model.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))

# model.add(Bidirectional(LSTM(64, dropout = 0.2, return_sequences = True)))

model.add(LSTM(64, dropout = 0.2))

model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=0.0001)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])


In [48]:
history = model.fit(X_train_pad, y_train, epochs=3, validation_data=(X_test_pad, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [49]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           20520900  
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 20,563,205
Trainable params: 42,305
Non-trainable params: 20,520,900
_________________________________________________________________


In [50]:
model.save("./SavedModels/LSTM_GloVe_train_73_83_val_75_2_test_?_acc_epoch_3.h5")

# 7.10 Word2Vec embedding

In [19]:
word2vec_train = list(map(lambda x: x.split(),list(X)))

In [20]:
word2vec = gensim.models.Word2Vec(word2vec_train, min_count=5)

In [21]:
n = len(word2vec.wv.vectors)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(word2vec.wv.vectors, list(y)[:n//2] + list(y)[-n//2-1000:-1000],test_size=0.2)

In [25]:
X_train.shape

(31127, 100)

In [28]:
weights = word2vec.wv.syn0

  weights = word2vec.wv.syn0


In [31]:
vocab_size, embedding_size = weights.shape

In [33]:
vocab_size, embedding_size

(38909, 100)

In [46]:
model = Sequential()

model.add(Embedding(38909, 100, weights=[weights], input_length=100, trainable=False))

model.add(LSTM(100, dropout = 0.2))

model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=0.0001)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

In [47]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          3890900   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 3,971,401
Trainable params: 80,501
Non-trainable params: 3,890,900
_________________________________________________________________


In [48]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
history = model.fit(X_train+10, y_train, epochs=10, validation_data=(X_test+10, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

---

# 8. Evaluating model

## 8.1. Load Model

In [5]:
model = keras.models.load_model('./SavedModels/LSTM_GloVe_train_73_83_val_75_2_test_?_acc_epoch_3.h5')

In [6]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           20520900  
_________________________________________________________________
lstm (LSTM)                  (None, 64)                42240     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 20,563,205
Trainable params: 42,305
Non-trainable params: 20,520,900
_________________________________________________________________


## 8.2. Create test dataset

In [7]:
test_df = pd.read_csv('./data/cleaned_tweets.csv')

In [54]:
# test_df = pd.concat([test_df[test_df.sentiment != 0][:100000], test_df[test_df.sentiment == 0][:100000]])

In [8]:
test_df.head()

Unnamed: 0,sentiment,text,cleaned_tweet,Porter_Stem,Snowball_Stem
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day,aww bummer shoulda got david carr third day
1,0,is upset that he can't update his Facebook by ...,upset cannot update facebook texting might cry...,upset can not updat facebook text might cri re...,upset can not updat facebook text might cri re...
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest go bounds,dive mani time ball manag save rest go bound,dive mani time ball manag save rest go bound
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole bodi feel itchi like fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",behaving im mad cannot see,behav im mad can not see,behav im mad can not see


In [9]:
test_df = test_df[['sentiment', 'Snowball_Stem']]

In [23]:
test_df.head()

Unnamed: 0,sentiment,Snowball_Stem
0,0,aww bummer shoulda got david carr third day
1,0,upset can not updat facebook text might cri re...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad can not see


## 8.3. Drop rows with NaN

In [11]:
test_df.isna().sum()

sentiment           0
Snowball_Stem    8046
dtype: int64

In [12]:
test_df = test_df.dropna()

In [13]:
test_df.isna().sum()

sentiment        0
Snowball_Stem    0
dtype: int64

## 8.4. Tokenization and padding

In [14]:
test_tweet = test_df['Snowball_Stem']

In [15]:
test_label = test_df['sentiment']

In [16]:
test_tweet.head()

0          aww bummer shoulda got david carr third day
1    upset can not updat facebook text might cri re...
2         dive mani time ball manag save rest go bound
3                      whole bodi feel itchi like fire
4                             behav im mad can not see
Name: Snowball_Stem, dtype: object

In [17]:
test_label.head()

0    0
1    0
2    0
3    0
4    0
Name: sentiment, dtype: int64

In [24]:
test_tweet_seq = tokenizer.texts_to_sequences(test_tweet)

In [25]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [26]:
test_tweet_pad[0]

array([ 100, 1017, 3041,   10,  696, 6252, 1615,    3,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

## 8.5. Evaluate

In [27]:
scores = model.evaluate(test_tweet_pad, test_label)

 2897/49749 [>.............................] - ETA: 1:57 - loss: 0.6562 - accuracy: 0.6210

KeyboardInterrupt: 

In [28]:
scores

NameError: name 'scores' is not defined

In [None]:
loss, accuracy = scores

In [None]:
print("Loss on test set:", loss)
print("Accuracy achieve on test set:", accuracy)

---

# 9. Save model and tokenizer

In [None]:
model.save("./SavedModels/LSTM_train_75_val_78_test_79_acc.h5")

In [None]:
with open('./SavedModels/LSTM_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)