# Natural Language Processing - Identify disaster Tweets

In [3]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score,log_loss
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
# =============================================================================
# 0. General functions
# =============================================================================
def word_vector(df_input, lemmatizer, word_vectors, vocabulary, col_sentences):
 """
 Function to preprocess the input words and get a list with
 The embeddings arrays of the words in each record.
 Parameters
 ----------
 df_input : dataframe
 input dataframe with all texts.
 lemmatizer : object
 NLTK stemming object.
 word_vectors : object
 Object with the word2vecs of the Gensim vocabulary.
 vocabulary : list
 list of existing words in Gensim's vocabulary.
 col_sentences : str
 column of the dataframe where the phrases are.
 Returns
 -------
 X : list
 List of lists in which each record has the list with the arrays
 of the embeddings of the words of that phrase. That is, X[0] has

 a list where each element corresponds to the embeddings of a word.
 Thus, for example, X[0][2] will be a vector of dimension 100 where it appears
 the vector of embeddings of the third word of the first sentence.
 """


 X = []

 for text in df_input[col_sentences]:

    # Tokenize every phrase
    words = re.findall(r'\w+', text.lower(),flags = re.UNICODE)
    # Elimination of stop_words
    words = [word for word in words if word not in stopwords.words('english')]
    # Remove hyphens and other weird symbols
    words = [word for word in words if not word.isdigit()] # Elimino numeros
    # Stemming
    words = [lemmatizer.lemmatize(w) for w in words]
    # Delete words that are not in the vocabulary
    words = [word for word in words if word in vocabulary]
    # Word2Vec
    words_embeddings = [word_vectors[x] for x in words]

    # Save the final sentence
    X.append(words_embeddings) # save as a numpy array
 return X

In [4]:
def create_RNN(x_train, K, n_lstm=8, loss='categorical_crossentropy', optimizer='adam'):
 """
 Function to create the RNN. As input parameter we only need the array
 of features to specify the input dimensionality of the NN.
 Parameters
 ----------
 x_input : array
 Matrix of input features.
 K: int
 Exit classes
 n_lstm : int, optional
 Number of lstm used. The default is 8.
 loss : string, optional
 loss metric. The default is 'categorical_crossentropy'.
 optimizer : string, optional
 optimizer. The default is 'adam'.
 Returns
 -------
 model : object
 Trained model.
"""

  # Begin sequence
 model = tf.keras.Sequential()

 # Add a LSTM layer with 8 internal units.
 model.add(LSTM(n_lstm, input_shape=x_train.shape[-2:]))

 # Add Dropout
 # model.add(Dropout(0.5))

 # # Another layer
 # model.add(Dense(100, activation='relu'))

 # # Output
 model.add(Dense(K, activation='sigmoid'))

 # Compile model
 model.compile(loss=loss, optimizer=optimizer)

 return model

As a next step, we load the exercise data. As the exercice mentions, the dataset consists of a
table with tweets that may or may not be talking about natural disasters. The idea is therefore to
build a tweet classifier that allows detecting when users are talking about it (and distinguishing
those tweets from others) in order to act on it.These types of applications, identified within what
is called Big Data For Social Good, contribute positively to society, and allow complementing the information available with certain systems (e.g. seismographs, weather predictors ...) with
insights derived from unstructured data in real time, as is the case with tweets.

In [8]:
#==========================================================================
# 1. Load Data
#==========================================================================
# Load files
tf.random.set_seed(42)
path_files = "predict-disaster"
df_raw = pd.read_csv('train.csv', encoding = "latin-1")
df_raw = df_raw[['text', 'target']]
print(df_raw)

                                                   text  target
0     Our Deeds are the Reason of this #earthquake M...       1
1                Forest fire near La Ronge Sask. Canada       1
2     All residents asked to 'shelter in place' are ...       1
3     13,000 people receive #wildfires evacuation or...       1
4     Just got sent this photo from Ruby #Alaska as ...       1
...                                                 ...     ...
7608  Two giant cranes holding a bridge collapse int...       1
7609  @aria_ahrary @TheTawniest The out of control w...       1
7610  M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...       1
7611  Police investigating after an e-bike collided ...       1
7612  The Latest: More Homes Razed by Northern Calif...       1

[7613 rows x 2 columns]


Thus, tweets appear within these two categories. An example of a tweet that talks about natural
disasters is the following:

In [9]:
print(df_raw['text'][10])

Three people died from the heat wave so far


In [None]:
Instead, a tweet that doesn't talk about it is:

In [10]:
print(df_raw['text'][16])

I love fruits


In [12]:
print(df_raw['target'].value_counts() / len(df_raw))


0    0.57034
1    0.42966
Name: target, dtype: float64


In [17]:
# Shuffle input
df_raw = df_raw.sample(frac=1)
# Load word2vec
word_vectors = api.load("glove-wiki-gigaword-100")
vocabulary = [x for x in word_vectors.vocab]

AttributeError: ignored

In this case, the output class (binary value) is already expressed numerically, so it is not
necessary to do LabelEncoding. Also, being a binary variable, it is not necessary to do
OneHotEncoding either.

In [14]:
# Set lemmatizer
lemmatizer = WordNetLemmatizer()
# X/y split
X = pd.DataFrame(df_raw['text'])
y = df_raw['target']

Subsequently, we pre-process the tweets to express them in numerical format by embeddings
of the individual words, after having eliminated stopwords, having lemmatized, eliminated words
that are not in the vocabulary and eliminated numerical characters.

In [None]:
#==========================================================================
# 2. Preprocess
#=========================================================================

# Obtain X variable and prepare y.
X = word_vector(X,
 lemmatizer,
 word_vectors,
 vocabulary,
 col_sentences="text")

After that, we make the separation between the train dataset and the one that we will use for the
test, and we define the maximum size of the sequence per tweet. We take the maximum
sequence size as a reference to encompass the size of the entire tweet for 99% of the tweets in
the dataset. We apply a subsequent padding to fill the shorter tweets with null values up to that
maximum length size, and we truncate the largest tweets until we have that maximum size,
eliminating at the end of the tweet.

In [None]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
 X, y, test_size=0.25, random_state=42)
# Obtain tensor: [N_SENTENCES x SEQ_LENGTH x EMBEDDING_FEATURES]
SEQ_LENGTH = np.int(np.round(np.percentile([len(x) for x in X], 99, interpolation =
'midpoint')))
data_train = pad_sequences(X_train,
 maxlen=SEQ_LENGTH,
 padding="post",
 truncating="post")
data_test = pad_sequences(X_test,
 maxlen=SEQ_LENGTH,
 padding="post",
 truncating="post")

In a first iteration, we get the output for the following configuration: batch_size = 200, epochs =
50, optimize = adam, n_lstm = 50. In this case, since it is a binary problem, K = 1.

In [None]:
#========================================================================
# 3. Train model
# ========================================================================
# Params
K = 1
batch_size = 200
epochs = 50
# Create RNN
model = create_RNN(x_train = data_train, K = K, n_lstm = 50, loss = 'binary_crossentropy', optimizer = 'adam')
print(model.summary())
# Fit model
model.fit(data_train, y_train, epochs = epochs, batch_size = batch_size)
# Save model
model.save('model_nlp_disaster.h5')

After training and storing the model, we obtain the predictions along with different metrics:
confusion matrix, precision, recall and F1. The predictions, having used the sigmoid function,
will be expressed as a continuous value between 0 and 1. We round this value (with a standard
threshold of 0.5) to see the final class associated with each prediction.

In [None]:
# ========================================================================
# 4. Evaluate
# ========================================================================
# Obtain predictions
y_pred = model.predict(data_test)
# Round predictions
y_pred = y_pred.round()
y_pred = [x[0] for x in y_pred]
y_test = list(y_test.values)
# Evaluate results
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix: ", cm)
print("Precision: ", np.round(precision_score(y_test, y_pred, average='macro'), 4))
print("Recall: ", np.round(recall_score(y_test, y_pred, average='macro'), 4))
print("f1_score: ", np.round(f1_score(y_test, y_pred, average='macro'), 4))

We can see how the metrics are approximately the same with all the configurations that use
ADAM as an optimization function, and we only notice that they get substantially worse when
changing the algorithm to SGD. In this way, we would opt for the last configuration, since the
metrics are similar and instead its configuration is much simpler, greatly reducing the
computational cost.