In [1]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade

Collecting keras-core
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras-core)
  Downloading namex-0.0.8-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, keras-core
Successfully installed keras-core-0.1.7 namex-0.0.8
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [9

In [2]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'

In [40]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import keras_core as keras
import keras_nlp
import spacy
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
print("TensorFlow version:", tf.__version__)
print("KerasNLP version:", keras_nlp.__version__)

TensorFlow version: 2.16.1
KerasNLP version: 0.12.1


In [5]:
df_train = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")

In [7]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
nlp = spacy.load("en_core_web_lg")
def preprocess(text):
  doc = nlp(text)
  clean_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
  return ' '.join(clean_text)

In [9]:
df_train['preprocessed_text'] = df_train['text'].apply(preprocess)
df_test['preprocessed_text'] = df_test['text'].apply(preprocess)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(df_train["text"], df_train["target"], test_size=0.2, random_state=42)
x_train_pp, x_test_pp, y_train_pp, y_test_pp = train_test_split(df_train["preprocessed_text"], df_train["target"], test_size=0.2, random_state=42)

Using Count Vectorizer for text representation and Multinomial Naive Bayes algorithm for classification.

In [12]:
#1 On raw data
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.81      0.70      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [14]:
#1 On preprocessed data
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])
clf.fit(x_train_pp, y_train_pp)
y_pred = clf.predict(x_test_pp)
print(classification_report(y_test_pp, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       874
           1       0.77      0.73      0.75       649

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



In [15]:
#Predicting for test data
tweets = df_test["preprocessed_text"]
preds = clf.predict(tweets)

In [16]:
#Predictions analysis
print("Total number of tweets = ", len(df_test), " |  No. of tweets predicted as disaster tweets = ", len(np.where(preds == 1)[0]), " | No. of tweets predicted as non disaster = ", len(np.where(preds == 0)[0]))

Total number of tweets =  3263  |  No. of tweets predicted as disaster tweets =  1286  | No. of tweets predicted as non disaster =  1977


Using Bag of n-grams for text representation and Multinomial Naive Bayes algorithm for classification.

In [17]:
#Training on raw text
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('classifier', MultinomialNB())
])
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       874
           1       0.80      0.70      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



In [18]:
#Training on preprocessed text
clf = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('classifier', MultinomialNB())
])
clf.fit(x_train_pp, y_train_pp)
y_pred = clf.predict(x_test_pp)
print(classification_report(y_test_pp, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       874
           1       0.76      0.74      0.75       649

    accuracy                           0.79      1523
   macro avg       0.78      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



Using TF-IDF text vectorizer for text representation and K-nearest neighbours, Multinomial Naive Bayes and Random Forest algorithm for classification.

In [21]:
#On raw data - KNN
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', KNeighborsClassifier())
])

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.86      0.82       874
           1       0.78      0.66      0.72       649

    accuracy                           0.78      1523
   macro avg       0.78      0.76      0.77      1523
weighted avg       0.78      0.78      0.77      1523



In [22]:
#On preprocessed data - KNN
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', KNeighborsClassifier())
])

clf.fit(x_train_pp, y_train_pp)
y_pred = clf.predict(x_test_pp)
print(classification_report(y_test_pp, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.86      0.81       874
           1       0.78      0.64      0.70       649

    accuracy                           0.77      1523
   macro avg       0.77      0.75      0.76      1523
weighted avg       0.77      0.77      0.76      1523



In [23]:
print(x_test[:5], y_test[:5], y_pred[0:5])

2644    So you have a new weapon that can cause un-ima...
2227    The f$&amp;@ing things I do for #GISHWHES Just...
5448    DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...
132     Aftershock back to school kick off was great. ...
6845    in response to trauma Children of Addicts deve...
Name: text, dtype: object 2644    1
2227    0
5448    1
132     0
6845    0
Name: target, dtype: int64 [0 0 1 0 1]


In [24]:
#On raw data - Multinomial NB
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.93      0.84       874
           1       0.86      0.63      0.73       649

    accuracy                           0.80      1523
   macro avg       0.82      0.78      0.78      1523
weighted avg       0.81      0.80      0.79      1523



In [26]:
#On preprocessed data - Multiomial NB
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

clf.fit(x_train_pp, y_train_pp)
y_pred = clf.predict(x_test_pp)
print(classification_report(y_test_pp, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       874
           1       0.81      0.66      0.73       649

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.78      1523



In [27]:
#On raw data - Random Forest
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.92      0.83       874
           1       0.84      0.62      0.71       649

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.77      1523
weighted avg       0.80      0.79      0.78      1523



In [28]:
#On preprocessed data - Random Forest
clf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

clf.fit(x_train_pp, y_train_pp)
y_pred = clf.predict(x_test_pp)
print(classification_report(y_test_pp, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.90      0.83       874
           1       0.82      0.62      0.71       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.77      1523
weighted avg       0.79      0.78      0.78      1523



Using word embedding for text representation

In [29]:
nlp = spacy.load(r"en_core_web_lg")
df_train['vector'] = df_train['text'].apply(lambda x: nlp(x).vector)
df_train.head()

Unnamed: 0,id,keyword,location,text,target,preprocessed_text,vector
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake ALLAH forgive,"[-0.965633, 0.086769275, -2.2507236, 0.4783793..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[-1.8681643, -1.6165241, -0.21609005, -1.76522..."
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[-2.4070368, 0.27042598, -1.3639991, 0.7013667..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive wildfire evacuation orde...","[-1.0255101, -0.15121445, -2.2331533, 1.724021..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get send photo Ruby Alaska smoke wildfire pour...,"[0.08517767, -0.10672835, -1.9185519, 0.144816..."


In [31]:
x_train_v, x_test_v, y_train_v, y_test_v = train_test_split(df_train["vector"].values, df_train["target"], test_size=0.2, random_state=2022)

In [32]:
x_train_2d = np.stack(x_train_v)
x_test_2d = np.stack(x_test_v)

scalar = MinMaxScaler()
x_train_2d_scaled = scalar.fit_transform(x_train_2d)
x_test_2d_scaled = scalar.transform(x_test_2d)

In [33]:
clf = MultinomialNB()
clf.fit(x_train_2d_scaled, y_train_v)
y_pred = clf.predict(x_test_2d_scaled)
print(classification_report(y_test_v, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.70      0.71       862
           1       0.63      0.66      0.65       661

    accuracy                           0.68      1523
   macro avg       0.68      0.68      0.68      1523
weighted avg       0.69      0.68      0.68      1523



In [34]:
clf = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean')
clf.fit(x_train_2d_scaled, y_train_v)

y_pred = clf.predict(x_test_2d_scaled)
print(classification_report(y_test_v, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79       862
           1       0.74      0.66      0.70       661

    accuracy                           0.75      1523
   macro avg       0.75      0.74      0.75      1523
weighted avg       0.75      0.75      0.75      1523



Using word vectors in Gensim overview

In [None]:
import gensim.downloader as api

wv = api.load("glove-twitter-50")



In [36]:
#preprocess the text through en_core_web_lg so that it includes word embeddings
def preprocess_and_vectorize(text):
  filtered_tokens = preprocess(text)
  return wv.get_mean_vector(filtered_tokens.split(' ')) #used .split since we need the list with each word as a string element

In [37]:
df_train['gensim_vector'] = df_train['text'].apply(lambda text: preprocess_and_vectorize(text))

In [38]:
x_train, x_test, y_train, y_test = train_test_split(df_train["gensim_vector"].values, df_train["target"], test_size=0.2, random_state=2022, stratify=df_train["target"])

In [39]:
x_train_2d = np.stack(x_train)
x_test_2d = np.stack(x_test) #Get flattened sample

In [41]:
clf = GradientBoostingClassifier()
clf.fit(x_train_2d, y_train)

y_pred = clf.predict(x_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80       869
           1       0.76      0.67      0.71       654

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523

