# Week 4: NLP Disaster Tweets Kaggle Mini-Project

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter

import keras
from keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


## Brief description of the problem and data (5 pts)

*Briefly describe the challenge problem and NLP. Describe the size, dimension, structure, etc., of the data.*

## Exploratory Data Analysis (EDA) — Inspect, Visualize and Clean the Data (15 pts)

*Show a few visualizations like histograms. Describe any data cleaning procedures. Based on your EDA, what is your plan of analysis?*

Checking the general structure of the data and potential duplicates. 

In [None]:
def count_words(df):
    """Add a word count to ``df`` and clean its ``text`` column in place.

    The word count is taken from the raw tweet (whitespace-separated tokens,
    URLs and punctuation included) and stored in an integer ``word_count``
    column. The ``text`` column is then overwritten with a cleaned version:
    URLs are removed first, then every character that is not an ASCII letter,
    a digit or whitespace. Letter case is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``text``. Modified in place.

    Returns
    -------
    int
        Number of distinct whitespace-separated tokens in the cleaned text.
    """
    # Count on the raw text, before any cleaning changes the token boundaries.
    df['word_count'] = df['text'].str.split().str.len().astype(int)

    # Strip URLs. The dot after "www" is escaped so that ordinary words that
    # merely start with "www" (e.g. "wwwhat") are not deleted.
    cleaned = df['text'].str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    # Drop everything except ASCII letters, digits and whitespace
    # (punctuation, '#', '@', emoji, ...). Digits are kept.
    cleaned = cleaned.str.replace(r'[^a-z0-9A-Z\s]', '', regex=True)
    df['text'] = cleaned

    unique_words = len(set(' '.join(df['text']).split()))
    return unique_words

# Read the two Kaggle splits from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# count_words adds a 'word_count' column, cleans 'text' in place and returns
# the number of distinct tokens in the cleaned text of that split.
train_unique_words = count_words(train_df)
test_unique_words = count_words(test_df)

# Distinct tokens across both splits taken together.
combined_text = pd.concat([train_df, test_df], axis=0)['text']
all_text = ' '.join(combined_text.values)
all_unique_words = len(set(all_text.split()))


def _print_overview(df, title, unique_words):
    """Print structure, summary statistics and duplicate counts for one split."""
    banner = 40*'*'
    print('\n' + banner + title + banner)
    df.info()
    print('\nNumerical statistics:\n', df.describe())
    print('\n', df.head(4), '\n')
    print('Number of duplicated rows:', np.sum(df.duplicated()))
    print('Number of duplicated texts:', np.sum(df.duplicated(subset='text')))
    print('Longest tweet has', np.max(df.word_count), 'words.')
    print('Unique words in the dataset:', unique_words)


_print_overview(train_df, ' Train dataset ', train_unique_words)
# The label column is only reported for the training split.
print('Target values:', pd.unique(train_df.target))
y_split = round(100 * np.sum(train_df.target == 1)/len(train_df.target))
print('Target split: \n1 (disaster) =', y_split, '%\n0 (not disaster) =', 100-y_split, '%')

_print_overview(test_df, ' Test dataset ', test_unique_words)


# Overlay the train and test word-count distributions on one explicit Axes.
# Each histogram carries its own label so that every legend entry is tied to
# the bars it describes; passing only a list of labels to legend() pairs them
# with artists by draw order, which is easy to get wrong for bar-based plots.
fig, ax = plt.subplots()
sns.histplot(train_df, x='word_count', bins=30, stat='percent', label='Train', ax=ax)
sns.histplot(test_df, x='word_count', bins=30, stat='percent', label='Test', ax=ax)
ax.grid(axis='y')
ax.set_title('Histogram of approximate word counts in the data')
ax.legend()
plt.show()

Remove duplicates from the training set

In [None]:
# Drop training rows whose cleaned text repeats an earlier row, and report the
# row count before and after.
size_before = len(train_df)
print('Size with duplicated texts:', size_before)
train_df.drop_duplicates(subset='text', inplace=True)
size_after = len(train_df)
print('Size without duplicated texts:', size_after)

In [232]:
# Array of every tweet text (train rows followed by test rows); the
# vectorizer's vocabulary is later adapted on it.
all_text = pd.concat([train_df, test_df], axis=0)['text'].values

## Model Architecture (25 pts)

*Describe your model architecture and reasoning for why you believe that specific architecture would be suitable for this problem.*

*Since we did not learn NLP-specific techniques such as word embeddings in the lectures, we recommend looking at Kaggle tutorials, discussion boards, and code examples posted for this challenge.  You can use any resources needed, but make sure you “demonstrate” you understood by including explanations in your own words. Also importantly, please have a reference list at the end of the report.*

*There are many methods to process texts to matrix form (word embedding), including TF-IDF, GloVe, Word2Vec, etc. Pick a strategy and process the raw texts to word embedding. Briefly explain the method(s) and how they work in your own words.*

*Build and train your sequential neural network model (You may use any RNN family neural network, including advanced architectures LSTM, GRU, bidirectional RNN, etc.).*

### Text to matrix

In [None]:
# Vocabulary cap: passed as max_tokens to the TextVectorization layer and used
# as the input dimension of the Embedding layer in each model cell.
max_features = 10000

In [237]:
# Text-to-integer layer: lowercases, strips punctuation, splits on whitespace
# and emits one integer id per token, with the vocabulary capped at max_features.
# NOTE(review): output_sequence_length=None leaves the padded width unset, so
# x_train and x_test only get the same width if their longest tweets have the
# same token count (both are 31 in the recorded output) - consider fixing it.
my_vectorizer = keras.layers.TextVectorization(
    max_tokens=max_features,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=1,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=None,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding="utf-8",
    name=None,
)
# Build the vocabulary from the train and test texts together.
my_vectorizer.adapt(all_text)
# Encode both splits as integer matrices (rows are zero-padded, see output).
x_train = my_vectorizer(train_df['text'])
x_test = my_vectorizer(test_df['text'])
# Binary labels for the training rows.
y_train = train_df.target

def check_vector(vect):
    """Print the shape, the value range and the first four rows of ``vect``."""
    lowest, highest = np.min(vect), np.max(vect)
    print('Shape:', vect.shape)
    print('Min and max:', lowest, highest)
    print(vect[:4])

# Sanity-check both encoded splits: train first, then test.
for encoded in (x_train, x_test):
    check_vector(encoded)

Shape: (7613, 31)
Min and max: 0 9999
tf.Tensor(
[[ 101 7788   21    2  818    6   19  242  137 2046 4160   68   40    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 155   44  203  803    1    1 1475    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [  40 1583 1556    4 2270    5  724   21  133    1   17 1813   37  276
   268   54 2270    5  724 1691   21 1257    0    0    0    0    0    0
     0    0    0]
 [3685   59 4775 1268  268 1691    5   99    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]], shape=(4, 31), dtype=int64)
Shape: (3263, 31)
Min and max: 0 9998
tf.Tensor(
[[  27  782    3 1490  125   89    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0]
 [ 362   52  242    9 1308 1885  639 1424  252    0    0    0 

### Model building

#### LSTM 1 - Long Short Term Memory network

In [238]:
# Token ids arrive as variable-length integer sequences.
inputs = keras.Input(shape=(None,), dtype="int32")
# Look up a 128-dimensional vector for every token id.
embedded = layers.Embedding(input_dim=max_features, output_dim=128)(inputs)
# Two stacked LSTMs: the first returns the whole sequence so the second can
# read it; the second returns only its final output.
sequence_features = layers.LSTM(units=128, return_sequences=True)(embedded)
final_features = layers.LSTM(units=128)(sequence_features)
# One sigmoid unit produces the binary-classification score.
outputs = layers.Dense(units=1, activation="sigmoid")(final_features)
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

#### LSTM 2 - Long Short Term Memory network

Two stacked LSTM layers with dropout.

In [215]:
## Source: https://www.kaggle.com/code/anmolstha/disaster-tweets-simple-rnn-implementation

# Sequential stack: embedding -> spatial dropout -> two LSTM layers -> sigmoid classifier.
model = keras.models.Sequential()

# No pretrained embedding matrix is supplied in this notebook, so the embedding
# weights start out random and have to be learned. Freezing them with
# trainable=False (as the source kernel does for its pretrained matrix) would
# leave the LSTMs working on fixed random vectors.
embedding = layers.Embedding(max_features, 128)
model.add(embedding)

# SpatialDropout1D drops whole embedding channels (rate 0.4) instead of
# individual values.
model.add(layers.SpatialDropout1D(0.4))

# Two stacked LSTM layers with 128 units each, 20% input dropout and 20%
# recurrent dropout. The first returns the full sequence so the second can
# consume it.
model.add(layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# One sigmoid unit produces the binary-classification score.
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

#### Bi-directional LSTM

In [None]:
# Token ids arrive as variable-length integer sequences.
inputs = keras.Input(shape=(None,), dtype="int32")
# Look up a 128-dimensional vector for every token id.
embedded = layers.Embedding(input_dim=max_features, output_dim=128)(inputs)
# Two stacked bidirectional LSTMs with 64 units per wrapped LSTM. The first
# returns the whole sequence so the second can read it.
first_recurrent = layers.LSTM(units=64, return_sequences=True)
sequence_features = layers.Bidirectional(first_recurrent)(embedded)
second_recurrent = layers.LSTM(units=64)
final_features = layers.Bidirectional(second_recurrent)(sequence_features)
# One sigmoid unit produces the binary-classification score.
outputs = layers.Dense(units=1, activation="sigmoid")(final_features)
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

#### GRU

In [None]:
# Token ids arrive as variable-length integer sequences.
inputs = keras.Input(shape=(None,), dtype="int32")
# Look up a 128-dimensional vector for every token id.
embedded = layers.Embedding(input_dim=max_features, output_dim=128)(inputs)
# Two stacked GRU layers with 64 units each. The first returns the whole
# sequence so the second can read it.
sequence_features = layers.GRU(units=64, return_sequences=True)(embedded)
final_features = layers.GRU(units=64)(sequence_features)
# One sigmoid unit produces the binary-classification score.
outputs = layers.Dense(units=1, activation="sigmoid")(final_features)
model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

### Model training

max length = 1000

3rd layer 0.5391

256 dim 0.5391




In [239]:
# Binary cross-entropy pairs with the single sigmoid output of the model;
# accuracy is tracked as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Train for 5 epochs with 20% of the training rows held out for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.5692 - loss: 0.6848 - val_accuracy: 0.6474 - val_loss: 0.6649
Epoch 2/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.7753 - loss: 0.5031 - val_accuracy: 0.7768 - val_loss: 0.4753
Epoch 3/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8632 - loss: 0.3320 - val_accuracy: 0.7820 - val_loss: 0.4745
Epoch 4/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.8945 - loss: 0.2739 - val_accuracy: 0.7708 - val_loss: 0.4885
Epoch 5/5
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.9152 - loss: 0.2345 - val_accuracy: 0.7525 - val_loss: 0.6139


In [None]:
# Display the metrics stored on the object returned by model.fit().
history.history


## Results and Analysis (35 pts)

*Run hyperparameter tuning, try different architectures for comparison, apply techniques to improve training or performance, and discuss what helped.*

*Includes results with tables and figures. There is an analysis of why or why not something worked well, troubleshooting, and a hyperparameter optimization procedure summary.*

* Feature size: 1k, 5k, 10k, 20k
* LSTM vs bi-directional LSTM vs GRU
* 1 layer vs 2 layers vs 3 layers
* Number of units, dimensionality


|


In [240]:
# Accuracy on the full training set. This includes the rows that fit() held
# out through validation_split, so it is not a pure training-set figure.
predictions = model.predict(x_train, verbose=False)
pred_label = np.round(predictions, decimals=0)
# Flatten the per-sample predictions so the comparison with the labels is
# element-wise, one prediction against one label.
accu = pred_label.ravel() == np.asarray(y_train)
np.mean(accu)

np.float64(0.9089714961250492)

In [251]:
# Score the Kaggle test tweets and round the sigmoid outputs to hard 0/1 labels.
predictions = model.predict(x_test, verbose=False)
pred_label = np.round(predictions, decimals=0)

In [256]:

# Spot check: show the first encoded test row next to its source record.
print(x_test[0])
print(test_df.iloc[0])

# Kaggle submission file: one row per test tweet with its id and predicted label.
submission_df = test_df[['id']].copy()
submission_df['target'] = pred_label.astype('int')
submission_df.to_csv('submission_1.csv', index=False)

tf.Tensor(
[  27  782    3 1490  125   89    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0], shape=(31,), dtype=int64)
id                                             0
keyword                                      NaN
location                                     NaN
text          Just happened a terrible car crash
word_count                                     6
Name: 0, dtype: object


## Conclusion (15 pts)

*Discuss and interpret results as well as learnings and takeaways. What did and did not help improve the performance of your models? What improvements could you try in the future?*



# Sources
- https://keras.io/examples/nlp/bidirectional_lstm_imdb/
- https://www.kaggle.com/code/anmolstha/disaster-tweets-simple-rnn-implementation
