# Disaster related tweet text data classification with Multinomial Naive Bayes.

# 1. Import the dependencies.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import random
import numpy as np

# 2. Split data function.
- _random_indexes finds the indexes of the testing set.
- split_data splits the data into training and testing sets.

In [None]:
def _random_indexes(number, size, random_state):  # For selecting the indexes for test features
    """Draw ``size`` distinct random indexes from ``range(number)``.

    The module-level ``random`` generator is seeded with ``random_state`` on
    every call, so the same arguments always produce the same indexes in the
    same order.

    Args:
        number: Exclusive upper bound of the index range (number of samples).
        size: How many distinct indexes to draw.
        random_state: Seed passed to ``random.seed``.

    Returns:
        A 1-D integer NumPy array holding the drawn indexes in draw order.
        It is empty when ``size`` is zero or negative.

    Raises:
        ValueError: If ``size`` is greater than ``number``.
    """
    if size > number:
        raise ValueError(str(size) + " features can't be chosen out of " + str(number))

    random.seed(random_state)

    # Nothing to draw: return an empty integer array instead of unconditionally
    # drawing a first index.
    if size <= 0:
        return np.array([], dtype=int)

    random_indexes = []
    already_drawn = set()  # O(1) duplicate check; the list keeps draw order
    while len(random_indexes) < size:
        random_index = random.randrange(0, number, 1)
        if random_index in already_drawn:
            # Rejected duplicate: draw again.
            continue
        already_drawn.add(random_index)
        random_indexes.append(random_index)

    return np.array(random_indexes, dtype=int)


def split_data(features, targets, test_size, random_state=4):
    """Randomly split features and targets into training and testing sets.

    The number of test samples is ``int(test_size * n) + 1`` where ``n`` is
    the number of samples, so the test set holds one more sample than the
    truncated fraction.

    Args:
        features: Iterable of samples.
        targets: Iterable of labels, aligned one-to-one with ``features``.
        test_size: Fraction of the samples to place in the test set.
        random_state: Seed forwarded to ``_random_indexes``.

    Returns:
        A tuple ``(x_training, x_testing, y_training, y_testing)`` of NumPy
        arrays. Test samples appear in the order their indexes were drawn;
        training samples keep their original order.

    Raises:
        ValueError: If ``features`` and ``targets`` differ in length, or if
            the computed test size exceeds the number of samples.
    """
    features = list(features)
    targets = list(targets)
    if len(features) != len(targets):
        raise ValueError(
            "features and targets must have the same length, got "
            + str(len(features)) + " and " + str(len(targets))
        )

    number_of_samples = len(targets)
    t_size = int(test_size * number_of_samples) + 1

    random_indexes = _random_indexes(number_of_samples, t_size, random_state)

    # A set makes the per-sample membership test O(1) instead of scanning the
    # index array once for every sample.
    test_positions = {int(index) for index in random_indexes}

    x_testing = [features[index] for index in random_indexes]
    y_testing = [targets[index] for index in random_indexes]

    x_training, y_training = [], []
    for position, (feature, target) in enumerate(zip(features, targets)):
        if position not in test_positions:
            x_training.append(feature)
            y_training.append(target)

    return np.array(x_training), np.array(x_testing), np.array(y_training), np.array(y_testing)

# 3. Metrics
- _confusion_matrix computes true positives, false positives, true negatives, and false negatives.
- accuracy_score, precision_score, recall_score, f1_score, and confusion_matrix compute the accuracy, precision, recall, and F1 scores and the confusion matrix.

In [None]:
def _confusion_matrix(y_testing, y_prediction):
    """Count the binary confusion-matrix cells for labels 0 and 1.

    Args:
        y_testing: Sequence of true labels.
        y_prediction: Sequence of predicted labels, aligned with ``y_testing``.

    Returns:
        A tuple ``(true_positive, false_positive, true_negative,
        false_negative)``. Samples whose true label is neither 0 nor 1 are
        not counted in any cell.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(y_testing) != len(y_prediction):
        raise ValueError(
            "y_testing and y_prediction must have the same length, got "
            + str(len(y_testing)) + " and " + str(len(y_prediction))
        )

    true_positive, false_positive, true_negative, false_negative = 0, 0, 0, 0

    # Iterate over values rather than indexing by position, so label-indexed
    # containers (e.g. a pandas Series) are handled by value order.
    for actual, predicted in zip(y_testing, y_prediction):
        if actual == 1:
            if predicted == actual:
                true_positive += 1
            else:
                # Actual positive that was missed: a false negative.
                false_negative += 1
        elif actual == 0:
            if predicted == actual:
                true_negative += 1
            else:
                # Actual negative that was flagged: a false positive.
                false_positive += 1

    return true_positive, false_positive, true_negative, false_negative


def accuracy_score(y_testing, y_preds):
    """Return the fraction of counted samples that were predicted correctly.

    Computed as ``(tp + tn) / (tp + fp + tn + fn)`` from the counts returned
    by ``_confusion_matrix``.

    Args:
        y_testing: Sequence of true labels.
        y_preds: Sequence of predicted labels.

    Returns:
        The accuracy as a float, or ``0.0`` when no samples were counted
        (for example, empty input), matching the zero-denominator handling
        of ``recall_score`` and ``f1_score``.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    total = tp + fp + tn + fn
    if total == 0:
        return 0.0
    return (tp + tn) / total


def precision_score(y_testing, y_preds):
    """Return the precision, ``tp / (tp + fp)``.

    The counts come from ``_confusion_matrix``.

    Args:
        y_testing: Sequence of true labels.
        y_preds: Sequence of predicted labels.

    Returns:
        The precision as a float, or ``0.0`` when ``tp + fp`` is zero,
        matching the zero-denominator handling of ``recall_score``.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    tp_fp = tp + fp
    if tp_fp == 0:
        return 0.0
    return tp / tp_fp


def recall_score(y_testing, y_preds):
    """Return the recall, ``tp / (tp + fn)``.

    The counts come from ``_confusion_matrix``.

    Args:
        y_testing: Sequence of true labels.
        y_preds: Sequence of predicted labels.

    Returns:
        The recall as a float, or ``0.0`` when ``tp + fn`` is zero.
    """
    hits, _, _, misses = _confusion_matrix(y_testing, y_preds)
    denominator = hits + misses
    return 0.0 if denominator == 0 else hits / denominator


def f1_score(y_testing, y_preds):
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        y_testing: Sequence of true labels.
        y_preds: Sequence of predicted labels.

    Returns:
        ``2 * precision * recall / (precision + recall)`` as a float, or
        ``0.0`` when precision and recall sum to zero.
    """
    p = precision_score(y_testing, y_preds)
    r = recall_score(y_testing, y_preds)
    denominator = p + r

    # Guard clause: both scores are zero, so the harmonic mean is undefined.
    if denominator == 0:
        return 0.0

    return (2 * p * r) / denominator


def confusion_matrix(y_testing, y_preds):
    """Return the confusion-matrix counts as a nested 2x2 list.

    Args:
        y_testing: Sequence of true labels.
        y_preds: Sequence of predicted labels.

    Returns:
        ``[[tp, fp], [fn, tn]]`` built from the counts returned by
        ``_confusion_matrix``.
    """
    tp, fp, tn, fn = _confusion_matrix(y_testing, y_preds)
    return [[tp, fp], [fn, tn]]

# 4. Main function.
- Split the data into training and validation sets.
- Extract Term Frequency-Inverse Document Frequency (TF-IDF) features.
- Create a Multinomial Naive Bayes classifier and fit it on the training data.
- Evaluate the model on the validation and testing data.

In [None]:
if __name__ == "__main__":
    # --- Load the labelled tweets -------------------------------------------
    train_data = pd.read_csv("/content/drive/MyDrive/NLP TRAIN AND TEST/nlp_tweet_train.csv")
    test_data = pd.read_csv("/content/drive/MyDrive/NLP TRAIN AND TEST/nlp_tweet_test.csv")

    # --- Hold out 20% of the training file for validation -------------------
    X_train, X_valid, y_train, y_valid = split_data(
        train_data["text"],
        train_data["target"],
        test_size=0.2,
        random_state=42,
    )

    # --- TF-IDF features: vocabulary is learned from the training split only
    vectorizer = TfidfVectorizer(max_features=5000)
    vectorized_X_train = vectorizer.fit_transform(X_train)
    vectorized_X_valid = vectorizer.transform(X_valid)

    # --- Train the classifier ------------------------------------------------
    bayes_classifier = MultinomialNB()
    bayes_classifier.fit(vectorized_X_train, y_train)

    # --- Validation metrics --------------------------------------------------
    X_valid_preds = bayes_classifier.predict(vectorized_X_valid)

    valid_accuracy = accuracy_score(y_valid, X_valid_preds)
    valid_precision = precision_score(y_valid, X_valid_preds)
    valid_recall = recall_score(y_valid, X_valid_preds)
    valid_f1 = f1_score(y_valid, X_valid_preds)
    valid_confusion_mat = confusion_matrix(y_valid, X_valid_preds)

    print("Evaluation on the validation data.")
    print("Accuracy score on the validation data:", valid_accuracy)
    print("Precision score on the validation data:", valid_precision)
    print("Recall score on the validation data:", valid_recall)
    print("F1 score on the validation data:", valid_f1)
    print("Confusion matrix on the validation data:", valid_confusion_mat)

    # --- Test metrics (the test file carries its own "target" column) --------
    vectorized_X_test = vectorizer.transform(test_data["text"])
    y_test = test_data["target"]
    X_test_preds = bayes_classifier.predict(vectorized_X_test)

    # Blank separator between the two reports.
    print("\n \n")

    test_accuracy = accuracy_score(y_test, X_test_preds)
    test_precision = precision_score(y_test, X_test_preds)
    test_recall = recall_score(y_test, X_test_preds)
    test_f1 = f1_score(y_test, X_test_preds)
    test_confusion_mat = confusion_matrix(y_test, X_test_preds)

    print("Evaluation on the testing data.")
    print("Accuracy score on the testing data:", test_accuracy)
    print("Precision score on the testing data:", test_precision)
    print("Recall score on the testing data:", test_recall)
    print("F1 score on the testing data:", test_f1)
    print("Confusion matrix on the testing data:", test_confusion_mat)

Evaluation on the validation data.
Accuracy score on the validation data: 0.8031173092698933
Precision score on the validation data: 0.6439688715953308
Recall score on the validation data: 0.8530927835051546
F1 score on the validation data: 0.7339246119733925
Confusion matrix on the validation data: [[331, 183], [57, 648]]

 

Evaluation on the testing data.
Accuracy score on the testing data: 0.8003939592908733
Precision score on the testing data: 0.6486902927580893
Recall score on the testing data: 0.8470824949698189
F1 score on the testing data: 0.7347294938917975
Confusion matrix on the testing data: [[421, 228], [76, 798]]
