# Week 4: NLP Disaster Tweets Kaggle Mini-Project

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter

import keras
from keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


## Brief description of the problem and data (5 pts)

*Briefly describe the challenge problem and NLP. Describe the size, dimension, structure, etc., of the data.*

## Exploratory Data Analysis (EDA) — Inspect, Visualize and Clean the Data (15 pts)

*Show a few visualizations like histograms. Describe any data cleaning procedures. Based on your EDA, what is your plan of analysis?*

Checking the general structure of the data and potential duplicates. 

In [None]:
def count_words(df):
    """Add a word-count column, clean the tweets in place, and size the vocabulary.

    The frame is modified in place:

    * ``word_count`` (int) holds the number of whitespace-separated tokens of
      the *raw* tweet, i.e. measured before any cleaning.
    * ``text`` is replaced by the tweet with URLs removed and every character
      that is not an ASCII letter, a digit or whitespace stripped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``text``.

    Returns
    -------
    int
        Number of distinct whitespace-separated tokens in the cleaned text
        (case-sensitive, because the text is not lower-cased).
    """
    # The dot after "www" is escaped so that only real "www.<host>" links are
    # removed; an unescaped dot also swallowed words such as "awwwww".
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Everything except ASCII letters, digits and whitespace (punctuation,
    # '#', '@', emoji, ...). Digits are kept.
    non_alnum_pattern = re.compile(r'[^a-z0-9A-Z\s]')

    raw_texts = df['text'].tolist()

    # Whole-column assignment instead of per-row ``.loc`` writes: faster, and
    # it also works for an empty frame or a non-unique index.
    df['word_count'] = [len(raw.split()) for raw in raw_texts]
    df['word_count'] = df['word_count'].astype(int)
    df['text'] = [
        non_alnum_pattern.sub('', url_pattern.sub('', raw)) for raw in raw_texts
    ]

    return len(set(' '.join(df['text']).split()))

# Load the Kaggle train/test splits from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# count_words() adds a `word_count` column and cleans `text` in place, so both
# frames are modified here; the return value is the per-dataset vocabulary size.
train_unique_words, test_unique_words = count_words(train_df), count_words(test_df)

# Vocabulary size over the cleaned text of both datasets together.
combined_text = pd.concat([train_df, test_df], axis=0).text
all_text = ' '.join(combined_text.values)
all_unique_words = len(set(all_text.split()))


def _print_dataset_report(title, df, unique_words):
    """Print structure, summary statistics and duplicate counts for one dataset.

    `title` is the word shown in the banner, `df` needs the `text` and
    `word_count` columns, and `unique_words` is the precomputed vocabulary size.
    """
    banner = 40 * '*'
    print(f'\n{banner} {title} dataset {banner}')
    df.info()
    print('\nNumerical statistics:\n', df.describe())
    print('\n', df.head(4), '\n')
    print('Number of duplicated rows:', np.sum(df.duplicated()))
    print('Number of duplicated texts:', np.sum(df.duplicated(subset='text')))
    print('Longest tweet has', np.max(df.word_count), 'words.')
    print('Unique words in the dataset:', unique_words)


_print_dataset_report('Train', train_df, train_unique_words)
# Only the training set carries labels, so the class balance is reported here.
print('Target values:', pd.unique(train_df.target))
y_split = round(100 * np.sum(train_df.target == 1)/len(train_df.target))
print('Target split: \n1 (disaster) =', y_split, '%\n0 (not disaster) =', 100-y_split, '%')

_print_dataset_report('Test', test_df, test_unique_words)


# Overlay the word-count distributions of both datasets on the same axes.
# Each histogram is normalised to percent so the differently sized datasets
# are comparable. Train is drawn first, so it matches the first legend entry.
for frame in (train_df, test_df):
    sns.histplot(frame, x='word_count', bins=30, stat='percent')

plt.title('Histogram of approximate word counts in the data')
plt.grid(axis='y')
plt.legend(['Train','Test'])
plt.show()

Remove duplicates from the training set

In [None]:
# Keep only the first occurrence of every (cleaned) tweet text and report the
# training-set size before and after.
print(f'Size with duplicated texts: {len(train_df)}')
train_df.drop_duplicates(subset='text', inplace=True)
print(f'Size without duplicated texts: {len(train_df)}')

## Model Architecture (25 pts)

*Describe your model architecture and reasoning for why you believe that specific architecture would be suitable for this problem.*

*Since we did not learn NLP-specific techniques such as word embeddings in the lectures, we recommend looking at Kaggle tutorials, discussion boards, and code examples posted for this challenge.  You can use any resources needed, but make sure you “demonstrate” you understood by including explanations in your own words. Also importantly, please have a reference list at the end of the report.*

*There are many methods to process texts to matrix form (word embedding), including TF-IDF, GloVe, Word2Vec, etc. Pick a strategy and process the raw texts to word embedding. Briefly explain the method(s) and how they work in your own words.*

*Build and train your sequential neural network model (You may use any RNN family neural network, including advanced architectures LSTM, GRU, bidirectional RNN, etc.).*

### Text to matrix

In [None]:
# maxlen: length the token sequences are padded to.
# max_features: vocabulary cap shared by the vectorizers and the embeddings.
maxlen, max_features = 300, 10_000

In [None]:
# Every TextVectorization option is spelled out so the configuration is
# visible in one place; the vocabulary size and the n-gram setting are the
# ones tuned for this notebook.
vectorizer_options = dict(
    max_tokens=max_features,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=2,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=None,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding="utf-8",
    name=None,
)
my_vectorizer = keras.layers.TextVectorization(**vectorizer_options)

# Learn the vocabulary from the training tweets, then map them to token ids.
my_vectorizer.adapt(train_df['text'])
token_ids = my_vectorizer(train_df['text'])

# Bring every sequence to exactly `maxlen` tokens.
x_train = keras.utils.pad_sequences(token_ids, maxlen=maxlen)
y_train = train_df.target

print('Shape:', x_train.shape)
print('Min and max:', np.min(x_train), np.max(x_train))

In [None]:
# Show the free-text columns side by side.
train_df[['text', 'keyword', 'location']]

In [None]:
# TF-IDF features over unigrams and bigrams with English stop words removed,
# capped at `max_features` terms.
# NOTE: this rebinds `my_vectorizer`, `x_train` and `y_train` from the
# TextVectorization cell, so the cell that ran last decides what they hold.
tfidf_options = dict(
    sublinear_tf=True,
    max_df=0.95,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
    max_features=max_features,
)
my_vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training text only, then project the same text.
x_train = x_data = my_vectorizer.fit(train_df.text).transform(train_df['text'])
y_train = train_df.target

print('Shape of x_train:', x_train.shape)
print('Shape of y_train:', y_train.shape)

ftr_names = my_vectorizer.get_feature_names_out()
print('Length of ftr_names:',len(ftr_names))
print('ftr_names:', ftr_names)

# The test set is transformed with the vocabulary learned from the train set.
x_test = my_vectorizer.transform(test_df['text'])
print('Shape of X_test:', x_test.shape)


### Use TensorFlow datasets

In [None]:
# Vocabulary cap and fixed output length for the vectorization layer.
max_features = 10000
sequence_length = 100

# One dataset element per training tweet (raw strings).
dataset = tf.data.Dataset.from_tensor_slices(train_df['text'].values)

vectorize_layer = keras.layers.TextVectorization(
    output_mode='int',
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training tweets.
vectorize_layer.adapt(dataset)

def vectorize_text(text):
    """Turn one tweet into token ids with the module-level `vectorize_layer`.

    A trailing axis is appended to `text` before the layer is called.
    NOTE(review): for a scalar string this presumably yields a result with a
    leading axis of length 1 (one row of `sequence_length` ids) - confirm this
    is the shape the downstream model expects.
    """
    return vectorize_layer(tf.expand_dims(text, -1))

vectorized_dataset = dataset.map(vectorize_text)

# Sanity check: show the token ids of the first five tweets.
for sample in vectorized_dataset.take(5):
    print(sample.numpy())

# Pair every vectorized tweet with its label.
target_ds = tf.data.Dataset.from_tensor_slices(train_df.target.values)
train_ds = tf.data.Dataset.zip((vectorized_dataset, target_ds))


In [None]:
# Raw-string datasets: (text, label) pairs for training, text only for the
# test set, and the text of both sets combined.
all_txt = pd.concat([train_df.text, test_df.text], axis=0)

make_dataset = tf.data.Dataset.from_tensor_slices
train_ds = make_dataset((train_df.text.values, train_df.target.values))
test_ds = make_dataset(test_df.text.values)
train_test_ds = make_dataset(all_txt.values)


In [None]:
# Print the first 50 bytes of the first five elements of each dataset.
head = slice(None, 50)

for txt in test_ds.take(5):
    tf.print(txt.numpy()[head])

for txt, trg in train_ds.take(5):
    tf.print(trg, '\t', txt.numpy()[head])

for txt in train_test_ds.take(5):
    tf.print(txt.numpy()[head])

In [None]:
# Hold out 20% of the training tweets for validation.
text_count = len(train_df)
val_size = int(text_count * 0.2)
print('Validation size:', val_size)

# Shuffle once with a buffer covering the whole dataset; the order is not
# reshuffled on later iterations, so take() and skip() see the same order.
shuffled_ds = train_ds.shuffle(text_count, reshuffle_each_iteration=False)
val_ds, train_ds = shuffled_ds.take(val_size), shuffled_ds.skip(val_size)


In [None]:
# Word-level tokenizer sized to the combined train+test vocabulary; words
# outside the vocabulary map to the '<OOV>' token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=all_unique_words,
    split=' ',
    oov_token='<OOV>',
)

# Build the word index from every tweet (train and test).
tokenizer.fit_on_texts(all_txt)
word_index = tokenizer.word_index
print(word_index['a'])

# Encode each tweet as a list of word indices.
sequences = tokenizer.texts_to_sequences(all_txt)
print(len(sequences))

# Pad at the end of each sequence so all rows share one length.
padded_sequences = keras.preprocessing.sequence.pad_sequences(
    sequences,
    padding='post',
)
print(padded_sequences.shape)


In [None]:
# Quick checks: number of tweets, index of a sample word, vocabulary size.
n_texts = len(all_txt)
print(n_texts)
hello_index = word_index['hello']
print(hello_index)
print(all_unique_words)

In [None]:
def encode_train(text_tensor, label):
    """Encode one labelled tweet with the module-level `encoder`.

    Parameters
    ----------
    text_tensor : eager string tensor
        Either a scalar string or a one-element string tensor.
    label :
        Passed through unchanged.

    Returns
    -------
    tuple
        ``(encoded_text, label)`` where ``encoded_text`` is whatever
        ``encoder.encode`` returns for the tweet.
    """
    raw = text_tensor.numpy()
    # A scalar string tensor arrives as a single `bytes` object; indexing it
    # with [0] would yield the first byte as an int rather than the tweet.
    # Only array-valued input (e.g. shape (1,)) needs its first element picked.
    if not isinstance(raw, bytes):
        raw = raw[0]
    encoded_text = encoder.encode(raw)
    return encoded_text, label

def encode_map_fn_train(text, label):
    """Run `encode_train` through `tf.py_function`; both outputs are int64."""
    output_types = (tf.int64, tf.int64)
    return tf.py_function(encode_train, inp=[text, label], Tout=output_types)

def encode_unseen(text_tensor):
    """Encode one unlabelled tweet with the module-level `encoder`.

    Parameters
    ----------
    text_tensor : eager string tensor
        Either a scalar string or a one-element string tensor.

    Returns
    -------
    tuple
        ``(encoded_text, 1)``; the constant ``1`` stands in for the missing
        label so the result has the same two-component layout as the
        training pairs.
    """
    raw = text_tensor.numpy()
    # A scalar string tensor arrives as a single `bytes` object; indexing it
    # with [0] would yield the first byte as an int rather than the tweet.
    if not isinstance(raw, bytes):
        raw = raw[0]
    encoded_text = encoder.encode(raw)
    return encoded_text, 1

def encode_map_fn_unseen(text):
    """Run `encode_unseen` through `tf.py_function`.

    `encode_unseen` returns the encoded text plus a placeholder label, so two
    int64 outputs are declared. `Tout` must list dtypes; the previous literal
    `1` in that position is not a dtype.
    """
    return tf.py_function(encode_unseen, inp=[text], Tout=(tf.int64, tf.int64))

In [None]:
# Count how often each token occurs.
# NOTE(review): the counts are taken over `test_ds` only - confirm that the
# vocabulary is not meant to come from the training text instead.
tokeniser = tfds.deprecated.text.Tokenizer()
token_counts = Counter()

for example in test_ds:
    token_counts.update(tokeniser.tokenize(example.numpy()))

In [None]:

# TokenTextEncoder is built from a vocabulary (an iterable of token strings),
# not from a tf.data.Dataset of raw tweets. The keys of the `token_counts`
# Counter filled in the previous cell are exactly that vocabulary.
encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

# Smoke test: encode a sample sentence.
example_string = "This is an example"
print(f"Exmaple String: {example_string}")
print(f"Encoded String: {encoder.encode(example_string)}")

In [None]:
def encode_train(text_tensor, label):
    """Encode one labelled tweet with the module-level Keras `tokenizer`.

    Parameters
    ----------
    text_tensor : eager string tensor
        Either a scalar string or a one-element string tensor.
    label :
        Passed through unchanged.

    Returns
    -------
    tuple
        ``(encoded_text, label)`` where ``encoded_text`` is the flat list of
        word indices for the tweet.
    """
    raw = text_tensor.numpy()
    # A scalar string tensor arrives as `bytes`; indexing it with [0] would
    # yield a single byte value (an int) rather than the tweet.
    if not isinstance(raw, bytes):
        raw = raw[0]
    text = raw.decode('utf-8')
    # texts_to_sequences works on a collection of texts, so the single tweet
    # is wrapped in a list and its one result row is unwrapped again.
    encoded_text = tokenizer.texts_to_sequences([text])[0]
    return encoded_text, label

def encode_test(text_tensor):
    """Encode one unlabelled tweet with the module-level Keras `tokenizer`.

    Parameters
    ----------
    text_tensor : eager string tensor
        Either a scalar string or a one-element string tensor.

    Returns
    -------
    list of int
        Flat list of word indices for the tweet.
    """
    # The previous body read the local `text` before assigning it, which
    # raised UnboundLocalError on every call; the tweet is now taken from the
    # `text_tensor` argument.
    raw = text_tensor.numpy()
    if not isinstance(raw, bytes):
        raw = raw[0]
    text = raw.decode('utf-8')
    # texts_to_sequences works on a collection of texts, so the single tweet
    # is wrapped in a list and its one result row is unwrapped again.
    encoded_text = tokenizer.texts_to_sequences([text])[0]
    return encoded_text

def encode_map_fn_train(text, label):
    """Run `encode_train` through `tf.py_function`; both outputs are int64."""
    output_types = (tf.int64, tf.int64)
    return tf.py_function(encode_train, inp=[text, label], Tout=output_types)

def encode_map_fn_test(text):
    """Run `encode_test` through `tf.py_function` and return an int64 tensor."""
    encoded = tf.py_function(encode_test, inp=[text], Tout=tf.int64)
    return tf.convert_to_tensor(encoded)

def _fixup_shape(text, label):
    """Declare both tensors as scalars (static shape ``[]``) and return them.

    NOTE(review): a scalar shape fits raw string elements; an encoded token
    sequence would presumably need a rank-1 shape - confirm where this helper
    is meant to be applied.
    """
    for tensor in (text, label):
        tensor.set_shape([])
    return text, label

def _fixup_test_shape(text):
    """Declare `text` as a scalar (static shape ``[]``) and return it."""
    scalar_shape = []
    text.set_shape(scalar_shape)
    return text

# Only the test set is encoded here; the train/validation mapping and the
# shape fix-up step are currently switched off.
# Encode every test tweet, then batch 32 at a time, padding each batch to the
# length of its longest sequence.
test_ds = test_ds.map(encode_map_fn_test).padded_batch(32, padded_shapes=[-1])


In [None]:
# Show the first 50 rows of each of the first five test batches.
for batch in test_ds.take(5):
    rows = batch.numpy()
    tf.print(rows[:50])

In [None]:
# Illustrate padded batching on the first eight training examples.
example = train_ds.take(8)

print("Example Sequences and their length:\n")
for features, _ in example:
    print(f"Individual Size: {features.shape}")

print("Batched examples and the sequence length:\n")
batched_example = example.padded_batch(4, padded_shapes=([-1], []))
for features, _ in batched_example:
    print(f"Batch dimension: {features.shape}")

In [None]:
# Batch 32 examples at a time, padding the token sequences of each batch to a
# common length; the labels are scalars and need no padding.
train_ds = train_ds.padded_batch(32, padded_shapes=([-1], []))
# `tweets_valid` and `tweets_unseen_map` are never defined in this notebook.
# The validation split is `val_ds`, which is also what model.fit() receives
# as validation_data.
val_ds = val_ds.padded_batch(32, padded_shapes=([-1], []))
# `test_ds` was already padded-batched right after it was encoded and has a
# single (text-only) component, so it is reused as-is.
# NOTE(review): confirm this is the dataset intended for the unseen tweets.
tweets_unseen_batched = test_ds

### Model building

#### LSTM

In [None]:
# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int32")
# Each token id becomes a 128-dimensional vector.
x = layers.Embedding(max_features, 128)(inputs)
# Three stacked LSTMs: the first two emit the full sequence for the next
# layer, the last one emits only its final output.
for keep_sequence in (True, True, False):
    x = layers.LSTM(128, return_sequences=keep_sequence)(x)
# Single sigmoid unit for the binary disaster / not-disaster decision.
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()

#### Bi-directional LSTM

In [None]:
# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int32")

# Layers in application order: a 128-dimensional embedding, two bidirectional
# LSTMs (the first returns the full sequence), and a sigmoid classifier.
stack = [
    layers.Embedding(max_features, 128),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(1, activation="sigmoid"),
]

x = inputs
for layer in stack[:-1]:
    x = layer(x)
outputs = stack[-1](x)

model = keras.Model(inputs, outputs)
model.summary()

#### GRU

In [None]:
# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype="int32")
# Each token id becomes a 128-dimensional vector.
x = layers.Embedding(max_features, 128)(inputs)
# Two stacked GRUs: the first emits the full sequence, the second only its
# final output.
for keep_sequence in (True, False):
    x = layers.GRU(64, return_sequences=keep_sequence)(x)
# Single sigmoid unit for the binary disaster / not-disaster decision.
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()

In [None]:
## https://www.kaggle.com/code/anmolstha/disaster-tweets-simple-rnn-implementation
# Stacked GRU network built with the Sequential API.

# A sequential model processes the token sequence layer by layer.
model = keras.models.Sequential()

# Embedding(vocabulary size, embedding dimension).
# No pretrained embedding matrix is loaded in this notebook, so the layer
# starts from random weights and has to stay trainable; freezing it
# (trainable=False) would leave the network with fixed random vectors.
embedding = layers.Embedding(max_features, 100)
model.add(embedding)

# Spatial dropout with rate 0.4 applied to the embedded sequence.
model.add(layers.SpatialDropout1D(0.4))

# Three recurrent GRU layers (64 units, dropout = 20%, recurrent_dropout = 20%).
# The first two return the full sequence so the next GRU can consume it.
model.add(layers.GRU(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(layers.GRU(64, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(layers.GRU(64, dropout=0.2, recurrent_dropout=0.2))

# Single sigmoid unit for the binary disaster / not-disaster decision.
model.add(layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer with learning rate 1e-4, accuracy metric.
model.compile(loss='binary_crossentropy',optimizer=keras.optimizers.Adam(learning_rate=1e-4), metrics=['accuracy'])

model.summary()

### Model training

max length = 1000

3rd layer 0.5391

256 dim 0.5391




In [None]:
# Train on the in-memory arrays; the last 20% of the samples are held out for
# validation by Keras.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

In [None]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss="binary_crossentropy", metrics=["accuracy"])
# `train_ds` is a tf.data.Dataset that is batched with padded_batch(32)
# earlier in the notebook. Keras requires that `batch_size` is NOT passed when
# the input is a dataset, because the dataset already yields batches. For the
# same reason `validation_split` cannot be used here; the separate `val_ds`
# is passed instead.
history = model.fit(train_ds, epochs=1, validation_data=val_ds)


In [None]:
# Display the per-epoch values recorded by the most recent model.fit() call.
history.history


## Results and Analysis (35 pts)

*Run hyperparameter tuning, try different architectures for comparison, apply techniques to improve training or performance, and discuss what helped.*

*Includes results with tables and figures. There is an analysis of why or why not something worked well, troubleshooting, and a hyperparameter optimization procedure summary.*



## Conclusion (15 pts)

*Discuss and interpret results as well as learnings and takeaways. What did and did not help improve the performance of your models? What improvements could you try in the future?*



# Sources
https://keras.io/examples/nlp/bidirectional_lstm_imdb/
