<a href="https://colab.research.google.com/github/RohitGanji/NLP-Disaster_Tweets/blob/main/Natural_Language_Processing_with_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Data

In [4]:
# Import csv data from drive
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Helper functions
import string
import re
import nltk

nltk.download('wordnet')
nltk.download('stopwords')
lemmatizer = nltk.stem.WordNetLemmatizer()
stopword = nltk.corpus.stopwords.words('english')

def clean_data(text):
  # Removing urls
  text = re.sub(r"http\S+", "", text).strip()
  # Removing @username
  text = re.sub(r"@\S+", "", text).strip()
  # Removing non-ASCII
  text = text.encode("ascii", "ignore").decode().strip()
  # Removing punctuation marks
  no_punct = [word for word in text if word not in string.punctuation]
  text = "".join(no_punct).strip()
  # Lower the text
  text = text.lower()
  # Lemmatization
  lemm_text = [lemmatizer.lemmatize(word) for word in text.split()]
  text = " ".join(lemm_text).strip()
  # Removing stopwords
  stopwords = [word for word in text.split() if word not in stopword]
  text = " ".join(stopwords)
  # remove additional space from string 
  text = re.sub(' +', ' ', text)
  return text

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Cleaning the data

In [5]:
# Clean the data
train["text"] = train.apply(lambda x: clean_data(x["text"]), axis=1)

In [6]:
# Shuffle the data
train_shuffled = train.sample(frac=1)
train_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
6839,9796,trapped,"Greensburg, PA",know new release week blood call ancient evil ...,0
2536,3640,desolation,"Quilmes , Arg",desperation dislocation separation condemnatio...,1
4261,6055,heat%20wave,Somewhere in Spain,well seen thats bummer weve heat wave tho 43c ...,1
4190,5953,hazard,,davis drug guide nurse judith hopfer deglin ap...,0
7168,10272,war%20zone,We're All Mad Here,packing ct aka room look like war zone,0


In [7]:
# Split the data
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_shuffled["text"].to_numpy(),
                                                                            train_shuffled["target"].to_numpy(),
                                                                            test_size=0.1)

print(train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape)

(6851,) (762,) (6851,) (762,)


In [8]:
train_sentences[:10], train_labels

(array(['add familia arson squad', 'found sunflower explosion walk',
        'hope get batista bombed lauren',
        'first night retainer quite weird better get used wear every single night next year least',
        'date release ep03 desolation set stay tuned info finalise schedule alt electro rock comingsoon',
        'lie though pay oldest sometimes like first get car curfew freedom donthate',
        'twitter update pretty much wrecked app',
        'photo weallheartonedirection wouldnt let david electrocute im asshole',
        'something kissing mass murderer doesnt sit right feel ok',
        'diretube information egypt cyprus greece agreed fightterrorism'],
       dtype=object), array([0, 1, 0, ..., 0, 0, 1]))

# Model Experiments

In [9]:
import keras.backend as K

def f1_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [11]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

# Create USE layer
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[],
                                        dtype=tf.string,
                                        trainable=False,
                                        name="USE")

In [12]:
# Build the model
model = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(128, activation="relu"),
    layers.Dense(1, activation="sigmoid")
], name="model_1_USE")

# Comile the model
model.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy", f1_metric])

In [13]:
# Fit the model
history = model.fit(train_sentences,
                        train_labels,
                        epochs=10,
                        validation_data=(val_sentences, val_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
model.evaluate(val_sentences, val_labels)



[0.4701420068740845, 0.7821522355079651, 0.7288091778755188]

In [18]:
## Generating submit csv
test_sentences = test.apply(lambda x: clean_data(x["text"]), axis=1).to_numpy()
preds = tf.squeeze(tf.round(model.predict(test_sentences)))
preds = tf.cast(preds, dtype=tf.int32)
sample_submission["target"] = preds
sample_submission.to_csv("submit.csv", index=False)

In [20]:
sample_submission["target"].value_counts()

0    2005
1    1258
Name: target, dtype: int64