# Bidirectional LSTM model for classification of tweet sentiment

---

# 1. Installs and imports

## 1.1. Install all required libraries

In [1]:
# Uncomment line below to install all required libraries
# !pip3 install -r ../requirements.txt -q

## 1.2. Import required libraries

In [2]:
import pandas as pd
import numpy as np

from collections import Counter
import pickle

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.initializers import constant
from keras.optimizers import Adam

---

# 2. Load cleaned tweets dataset

In [None]:
df = pd.read_csv('./data/cleaned_tweets.csv')

In [None]:
df.head()

# 3. Drop raw text (keep only the `sentiment` and `Snowball_Stem` columns)

In [None]:
df = df[['sentiment', 'Snowball_Stem']]

In [None]:
df.head()

# 4. Drop rows with NaN

In [None]:
df.isna().sum()

In [None]:
df = df.dropna()

In [None]:
df.isna().sum()

---

# 5. Split dataset into training and test data

In [None]:
X = df['Snowball_Stem']

In [None]:
y = df['sentiment']

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so that re-running
# the notebook reproduces the same partition; without it every run draws a
# different split and results are not comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

- 80% training data
- 20% test data

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

---

# 6. Collection of all unique words in corpus

In [None]:
# Count how many times each unique word occurs

def count_unique_words(tweets):
    """Tally how often each whitespace-separated word occurs across tweets.

    Args:
        tweets: Iterable of strings; each one is split on whitespace.

    Returns:
        A ``Counter`` mapping every distinct word to its number of occurrences.
    """
    tally = Counter()
    for text in tweets:
        # Counter.update adds one to the count for every token it is given.
        tally.update(text.split())
    return tally


In [None]:
word_count = count_unique_words(X_train)

In [None]:
len(word_count)

---

# 7. Model

## 7.1. Max number of words in a sequence

In [3]:
max_seq_length = 20

## 7.2. Create / Load tokenizer

In [None]:
# tokenizer = Tokenizer(num_words=len(word_count))

In [4]:
# Restore a previously pickled tokenizer instead of creating a new one
# (presumably the one fitted when the saved model was trained -- confirm).
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that come from a trusted source.
with open('./SavedModels/BLSTM_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## 7.3. Tokenize the text

In [None]:
# Build the vocabulary only when the tokenizer does not have one yet. A
# tokenizer restored from the pickle already carries its word -> index mapping;
# fitting it again adds to its word counts and rebuilds word_index, which can
# change the indices a previously saved model's embedding rows were trained on.
if not tokenizer.word_index:
    tokenizer.fit_on_texts(X_train)

In [None]:
word_index = tokenizer.word_index

In [None]:
# Index for each word in tokenizer
len(word_index)

## 7.4. Convert training data to tokenized sequences

In [None]:
X_train[0]

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)

In [None]:
X_train_seq[0]

## 7.5. Padding training sequences

In [None]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [None]:
X_train_pad[0]

## 7.6. Performing tokenization and padding for test set

In [None]:
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding="post", truncating="post")

## 7.7. Understanding training and testing data

In [None]:
X_train_seq[0]

In [None]:
X_train_pad[0]

In [None]:
X_test_seq[0]

In [None]:
X_test_pad[0]

In [None]:
X_train_pad.shape

In [None]:
X_test_pad.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

## 7.8. Training the model (Custom embedding)

In [None]:
# Bidirectional LSTM binary classifier whose word embedding is learned from
# scratch, declared as a single layer list.
model = Sequential([
    # Maps each word index to a 20-dimensional vector; one row per entry in
    # word_index plus one extra row.
    Embedding(len(word_index) + 1, 20, input_length=max_seq_length),
    # Stacked variant, kept for reference:
    # Bidirectional(LSTM(64, dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2)),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation="sigmoid"),
])

optimizer = Adam(learning_rate=0.0001)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=['accuracy'],
)


In [None]:
model.summary()

In [None]:
history = model.fit(X_train_pad, y_train, epochs=1, validation_data=(X_test_pad, y_test))

In [None]:
model.save("./SavedModels/B_LSTM_train_74_12_val_77_79_test_?_acc.h5")

## 7.9. Training the model (GloVe)

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are parsed as float32 coefficients.
embeddings_index = {}
# The encoding is given explicitly so decoding does not depend on the platform
# default (GloVe files are UTF-8 text -- see the GloVe documentation). The
# `with` block closes the file even if parsing a line raises.
with open('./SavedModels/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

In [None]:
# Assemble the embedding weights: row i holds the GloVe vector of the word the
# tokenizer maps to index i. Rows for words missing from GloVe stay all-zero.
# NOTE(review): the tokenizer is fitted on the `Snowball_Stem` column; stemmed
# forms may often be absent from the GloVe vocabulary -- worth checking the
# hit rate.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word; keep the zero row.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional LSTM binary classifier on top of the pre-computed GloVe
# embedding matrix, declared as a single layer list.
model = Sequential([
    # Maps each word index to its 100-dimensional row of embedding_matrix;
    # trainable=False keeps those vectors fixed during training.
    Embedding(
        len(word_index)+1,
        100,
        weights=[embedding_matrix],
        input_length=max_seq_length,
        trainable=False,
    ),
    # Stacked variant, kept for reference:
    # Bidirectional(LSTM(64, dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2)),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    Dense(1, activation="sigmoid"),
])

optimizer = Adam(learning_rate=0.0001)
model.compile(
    loss="binary_crossentropy",
    optimizer=optimizer,
    metrics=['accuracy'],
)


In [None]:
history = model.fit(X_train_pad, y_train, epochs=3, validation_data=(X_test_pad, y_test))

In [None]:
model.summary()

In [None]:
model.save("./SavedModels/B_LSTM_GloVe_train_73_59_val_74_82_test_74_34_acc_epoch_4.h5")

---

# 8. Evaluating model

## 8.1. Load Model

In [None]:
# model = keras.models.load_model('./SavedModels/B_LSTM_train_76_26_val_77_86_test_78_57_acc.h5')

In [None]:
# model.summary()

In [6]:
model = keras.models.load_model('./SavedModels/B_LSTM_GloVe_train_73_59_val_74_82_test_74_34_acc_epoch_4.h5')
model.summary()

OSError: SavedModel file does not exist at: ./SavedModels/B_LSTM_GloVe_train_73_59_val_74_82_test_74_34_acc_epoch_4.h5/{saved_model.pbtxt|saved_model.pb}

## 8.2. Create test dataset

In [None]:
# Load the data used for the final evaluation.
# NOTE(review): this is the same CSV that section 2 loads and section 5 splits
# into training and test data, so the scores computed below include rows the
# model may have been trained on -- confirm whether a separate held-out file
# was intended.
test_df = pd.read_csv('./data/cleaned_tweets.csv')

In [None]:
# test_df = pd.concat([test_df[test_df.sentiment != 0][:100000], test_df[test_df.sentiment == 0][:100000]])

In [None]:
test_df.head()

In [None]:
test_df = test_df[['sentiment', 'Snowball_Stem']]

In [None]:
test_df.head()

## 8.3. Drop rows with NaN

In [None]:
test_df.isna().sum()

In [None]:
test_df = test_df.dropna()

In [None]:
test_df.isna().sum()

## 8.4. Tokenization and padding

In [None]:
test_tweet = test_df['Snowball_Stem']

In [None]:
test_label = test_df['sentiment']

In [None]:
test_tweet.head()

In [None]:
test_label.head()

In [None]:
test_tweet_seq = tokenizer.texts_to_sequences(test_tweet)

In [None]:
test_tweet_pad = pad_sequences(test_tweet_seq, maxlen=max_seq_length, padding="post", truncating="post")

In [None]:
test_tweet_pad[0]

## 8.5. Evaluate

In [None]:
scores = model.evaluate(test_tweet_pad, test_label)

In [None]:
scores

In [None]:
loss, accuracy = scores

In [None]:
print("Loss on test set:", loss)
print("Accuracy achieve on test set:", accuracy)

---

# 9. Save model and tokenizer

In [None]:
# Persist the current `model` to an HDF5 file.
# NOTE(review): `model` is whichever model was bound last (section 7.8, 7.9 or
# the load in 8.1). The file name here has no "GloVe" marker, unlike the file
# loaded in 8.1 -- verify that the name matches the model actually being saved.
model.save("./SavedModels/B_LSTM_train_76_26_val_77_86_test_78_57_acc.h5")

In [None]:
# Persist the tokenizer so later runs can reuse the exact word -> index mapping.
# The path matches the one section 7.2 loads from ('BLSTM_tokenizer.pickle');
# the previous name ('LSTM_tokenizer.pickle') meant a saved tokenizer was never
# the one read back by this notebook.
with open('./SavedModels/BLSTM_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)