### Importing libraries

In [1]:
import numpy as np 
import pandas as pd 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Bidirectional,GlobalMaxPool1D,BatchNormalization,Dropout
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [2]:
# Reading processed training dataset
train_df = pd.read_csv('train_processed.csv')

In [3]:
# Printing top 5 rows of the dataset
print(train_df.head())

   id keyword location                                               text  \
0   1     NaN      NaN         deed reason earthquake may allah forgive u   
1   4     NaN      NaN              forest fire near la ronge sask canada   
2   5     NaN      NaN  resident asked shelter place notified officer ...   
3   6     NaN      NaN  people receive wildfire evacuation order calif...   
4   7     NaN      NaN  got sent photo ruby alaska smoke wildfire pour...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [4]:
# Setting target variable as 'train_label' and text variable as 'train_text'
train_text = train_df['text'].values
train_label = train_df['target'].values

In [5]:
# Splitting above dataset into 80-20 'train' and 'test' datasets
xtrain, xtest, ytrain, ytest = train_test_split(train_text, train_label, test_size=0.2, 
                                                    stratify=train_label, random_state=12345, shuffle=True)

In [6]:
# Converting xtrain into a matrix of TF-IDF features
# It is used to convert textual data into numbers and it captures the importance of each word within a document
tfidf_vect = TfidfVectorizer(max_df=0.9, min_df=5)
tfidf_vect.fit(xtrain.astype('U'))
train_df_matrix = tfidf_vect.transform(xtrain.astype('U'))
train_df_matrix.shape

(6090, 2105)

In [7]:
# Converting xtest into a matrix of TF-IDF features
test_df_matrix = tfidf_vect.transform(xtest.astype('U'))
test_df_matrix.shape

(1523, 2105)

In [8]:
# Converting above matrices into arrays so that they can be input into training models
xtrain_array = train_df_matrix.toarray()
xtest_array = test_df_matrix.toarray()

In [9]:
# Loading XGBoost model and training it on the 'train' dataset
# Evaluating the model's performce on the 'test' dataset
# The model is achieving the accuracy score of ~78% and ROC AUC score of ~76%
model_xgb = XGBClassifier(random_state=42)
model_xgb = model_xgb.fit(xtrain_array, ytrain)
preds = model_xgb.predict(xtest_array)

ac = accuracy_score(ytest,preds)
print('Accuracy is: ',ac)
print('ROCAUC score:',roc_auc_score(ytest, preds))
cm = confusion_matrix(ytest,preds)
cm

Accuracy is:  0.7820091923834537
ROCAUC score: 0.763768506103891


array([[776,  93],
       [239, 415]])

### Text vectorization and word embeddings

In [10]:
# Using TextVectorization to convert textual data into numerical vector representation
# Also, looking at the words in the vocabulary
encoder = keras.layers.TextVectorization()
encoder.adapt(xtrain)
vocabulary = np.array(encoder.get_vocabulary())
vocabulary

array(['', '[UNK]', 'like', ..., '100', '0npzp', '0lv6'], dtype='<U52')

In [11]:
# Printing shapes of encoded data and vocabulary
print(encoder(xtrain).numpy().shape)
print(vocabulary.shape)

(6090, 23)
(14048,)


In [12]:
# Defining function to load Glove embeddings
def load_glove_embeddings(embeddings_path):
    """Load GloVe embeddings from file into memory."""
    embeddings_index = {}
    with open(embeddings_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

In [13]:
# Loading GloVe embeddings using above defined function
# Using 'glove.6B.200d.txt' file which was downloaded from Kaggle; it has embeddings in 200 dimensions
glove_embeddings_path = 'glove.6B.200d.txt'
glove_embeddings = load_glove_embeddings(glove_embeddings_path)

In [14]:
# Creating an embedding matrix for the words in the dataset using GloVe embeddings
embedding_matrix = np.zeros((len(vocabulary), 200))
for word, i in enumerate(vocabulary):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
embedding_matrix.shape

(14048, 200)

In [16]:
# Setting vocabulary length and embedding dimension
vocab_len = len(encoder.get_vocabulary())
emb_dim = 200

### Splitting the dataset into train, test and valid

In [17]:
# checking the maximum number of words that can be present in a text, this will help us in padding later
train_df['text'].apply(lambda x:len(str(x).split())).max()

23

In [18]:
# Defining function for building Bidirectional LSTM model architecture
def build_network_bidirectional():
    weight_initializer = tf.keras.initializers.GlorotNormal()
    model = Sequential()
    model.add(encoder)
    model.add(Embedding(input_dim=vocab_len, output_dim=emb_dim))
    model.add(Bidirectional(LSTM(200,return_sequences = True,dropout = 0.5,recurrent_dropout=0.2)))
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(200, activation = "relu",kernel_initializer=weight_initializer,bias_initializer='zeros'))
    model.add(Dropout(0.5))
    model.add(Dense(200, activation = "relu",kernel_initializer=weight_initializer,bias_initializer='zeros'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid'))
    return model

In [19]:
# Defining function to train model
def train_model(model,xtrain,ytrain):
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.AUC(name='auc')])
    history = model.fit(xtrain, ytrain, epochs= 2, validation_split=0.2,batch_size = 32) 
    print(model.summary())
    return model, history

In [20]:
# Training model using above functions
model_bidirectional, history = train_model(build_network_bidirectional(), xtrain,ytrain)

Epoch 1/2
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - auc: 0.5383 - loss: 0.7090 - val_auc: 0.8255 - val_loss: 0.6641
Epoch 2/2
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step - auc: 0.8665 - loss: 0.4487 - val_auc: 0.8429 - val_loss: 0.5883


None


In [21]:
# Evaluating the model's performance on test dataset; ROC score is 
preds = model_bidirectional.predict(xtest)
roc_auc_score(ytest, preds)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


0.8506719734800097