# Disaster-related tweet text classification with Bidirectional Long Short-Term Memory (B-LSTM)

# 1. Import the dependencies.

In [None]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 2. Split data function.
- `_random_indexes` picks the unique random indexes that make up the testing set.
- `split_data` splits the data into training and testing sets.

In [None]:
def _random_indexes(number, size, random_state):  # For selecting the indexes for test features
    if size > number:
        raise ValueError(str(size) + " features can't be chosen out of " + str(number))
    random_indexes = []
    random.seed(random_state)
    random_index = random.randrange(0, number, 1)
    random_indexes.append(random_index)
    for _ in range(1, size, 1):
        random_index = random.randrange(0, number, 1)
        while random_index in random_indexes:
            random_index = random.randrange(0, number, 1)

        random_indexes.append(random_index)
    random_indexes = np.array(random_indexes)

    return random_indexes


def split_data(features, targets, test_size, random_state=4):
    """Randomly split features and targets into training and testing arrays.

    Args:
        features: Iterable of samples; materialised with ``list``.
        targets: Sized iterable of labels, positionally aligned with
            ``features``.
        test_size: Fraction of the samples to place in the testing set.
        random_state: Seed forwarded to ``_random_indexes``.

    Returns:
        Tuple ``(x_training, x_testing, y_training, y_testing)`` of NumPy
        arrays. Testing samples appear in the order their indexes were
        drawn; training samples keep their original order.

    Raises:
        ValueError: Propagated from ``_random_indexes`` when the computed
            testing size exceeds the number of samples.
    """
    number_of_samples = len(targets)
    # NOTE(review): ``int(...) + 1`` always takes one sample more than the
    # floored fraction, so test_size=1.0 asks for more samples than exist and
    # raises. Left unchanged so existing splits stay reproducible.
    t_size = int(test_size * number_of_samples) + 1

    random_indexes = _random_indexes(number_of_samples, t_size, random_state)

    features = list(features)
    targets = list(targets)

    x_testing = [features[index] for index in random_indexes]
    y_testing = [targets[index] for index in random_indexes]

    # Build the lookup once: membership tests against the index array itself
    # would rescan it for every sample.
    held_out = {int(index) for index in random_indexes}
    kept_positions = [i for i in range(len(features)) if i not in held_out]
    x_training = [features[i] for i in kept_positions]
    y_training = [targets[i] for i in kept_positions]

    return (
        np.array(x_training),
        np.array(x_testing),
        np.array(y_training),
        np.array(y_testing),
    )

# 3. Load the datasets.
- Load the training and testing data.
- Tokenize the sentences of each tweet.
- Convert the tweets into integer token sequences and pad them to a fixed length.


In [None]:
# Read the labelled training tweets and the separate testing tweets.
train_data = pd.read_csv("/content/drive/MyDrive/NLP TRAIN AND TEST/nlp_tweet_train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/NLP TRAIN AND TEST/nlp_tweet_test.csv")

# Carve a validation set out of the training file; the testing file is used as-is.
(X_train_sentences, X_valid_sentences,
 y_train_labels, y_valid_labels) = split_data(
    train_data["text"],
    train_data["target"],
    test_size=0.2,
    random_state=42,
)
X_test_sentences, y_test_labels = test_data["text"], test_data["target"]

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_sentences)
word_index = tokenizer.word_index

# Sentences -> integer sequences.
X_train_sequences = tokenizer.texts_to_sequences(X_train_sentences)
X_valid_sequences = tokenizer.texts_to_sequences(X_valid_sentences)
X_test_sequences = tokenizer.texts_to_sequences(X_test_sentences)

# Every split is padded/truncated at the end to the same fixed length.
_pad_options = dict(maxlen=100, padding='post', truncating='post')
X_train_padded = np.array(pad_sequences(X_train_sequences, **_pad_options))
X_valid_padded = np.array(pad_sequences(X_valid_sequences, **_pad_options))
X_test_padded = np.array(pad_sequences(X_test_sequences, **_pad_options))

# Labels as NumPy arrays as well.
y_train_labels = np.array(y_train_labels)
y_valid_labels = np.array(y_valid_labels)
y_test_labels = np.array(y_test_labels)

# 4. Build a B-LSTM model.
- Create a Sequential model.
- Add word embedding layer.
- Add a bidirectional layer wrapping an LSTM layer.
- Finally add two densely connected layers with ReLU, and sigmoid activation functions respectively.
- Compile the model with 'binary_crossentropy', 'adam', and 'accuracy'.
- Fit the model to the training data.
- Evaluate the model and make predictions.

In [None]:
# Layer stack: embedding -> bidirectional LSTM -> ReLU dense -> sigmoid output.
layer_stack = [
    tf.keras.layers.Embedding(100, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training split, monitoring the validation split each epoch.
model.fit(
    x=X_train_padded,
    y=y_train_labels,
    epochs=30,
    validation_data=(X_valid_padded, y_valid_labels),
    verbose=1,
)

# Final loss / accuracy on the validation split.
loss, accuracy = model.evaluate(X_valid_padded, y_valid_labels)
print("Loss: ", loss, ", accuracy: ", accuracy)

# Model outputs for the validation and testing splits.
temp_valid_preds = model.predict(X_valid_padded)
temp_test_preds = model.predict(X_test_padded)

# Outputs at or above the threshold become class 1, everything else class 0.
threshold = 0.5
valid_preds = [1 if score >= threshold else 0 for score in temp_valid_preds]
test_preds = [1 if score >= threshold else 0 for score in temp_test_preds]

print("\n")
print("Evaluation on the validation data.")
print("Accuracy score on the validation data: ", accuracy_score(y_valid_labels, valid_preds))
print("Precision score on the validation data: ", precision_score(y_valid_labels, valid_preds))
print("Recall score on the validation data: ", recall_score(y_valid_labels, valid_preds))
print("F1 score on the validation data: ", f1_score(y_valid_labels, valid_preds))
print("Confusion matrix on the validation data: ", confusion_matrix(y_valid_labels, valid_preds))
print("\n")
print("Evaluation on the testing data.")
print("Accuracy score on the testing data: ", accuracy_score(y_test_labels, test_preds))
print("Precision score on the testing data: ", precision_score(y_test_labels, test_preds))
print("Recall score on the testing data: ", recall_score(y_test_labels, test_preds))
print("F1 score on the testing data: ", f1_score(y_test_labels, test_preds))
print("Confusion matrix on the testing data: ", confusion_matrix(y_test_labels, test_preds))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Loss:  0.6803502440452576 , accuracy:  0.6890894174575806
Evaluation on the validation data.
Accuracy score on the validation data:  0.6890894175553732
Precision score on the validation data:  0.6380368098159509
Recall score on the validation data:  0.6070038910505836
F1 score on the validation data:  0.6221335992023928
Confusion matrix on the validation data:  [[528 177]
 [202 312]]


Evaluation on the testing data.
Accuracy score on the testing data:  0.7025607353906763
Precision score on the testing data:  0.6555555555555556
Recall score on the testing data:  0.6363636363636364
F1 score on the testing data:  0.6458170445660673
Confusion m