In [3]:
import numpy as np     
import pandas as pd    
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# load the dataset containing normalized text as well as the additional features obtained from notebook 1

df_train = pd.read_csv('Data\\disaster_tweets_kaggle\\train_clean_add_feat.csv', index_col = False)

df_train.head()

Unnamed: 0,text,target,text_norm,stemmed_keyword,disaster_asc_coeff,length_norm,neg,neu,pos
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,earthquak,0.785714,41,0.0,1.0,0.0
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,fire,0.386364,36,0.286,0.714,0.0
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,evacu,0.767241,69,0.0,1.0,0.0
3,"13,000 people receive #wildfires evacuation or...",1,13000 peopl receiv wildfir evacu order california,evacu,0.767241,49,0.0,1.0,0.0
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,fire,0.386364,52,0.0,1.0,0.0


In [5]:
# In this part we will compare results using the original text with minimum processing vs the normalized text 
df_orig = df_train[['text', 'target']]
df_norm = df_train[['text_norm', 'target']]

## 1) Training with Normalized Text Data
For this text data, stop words, symbols, urls and punctuations have been removed. In addition, the words have also been stemmed. This means that the vocabulariy will be smaller; however, any contextual information provided by stop words will be lost.

In [6]:
from tensorflow.keras.preprocessingcessing.text import Tokenizer

In [7]:
tokenizer = Tokenizer()

In [12]:
tokenizer.fit_on_texts(df_norm['text_norm'])

In [14]:
tokenizer.texts_to_sequences(df_norm['text_norm'][0])

[[],
 [1179],
 [1179],
 [],
 [],
 [857],
 [1179],
 [],
 [],
 [],
 [600],
 [],
 [1179],
 [],
 [857],
 [],
 [2014],
 [5121],
 [103],
 [],
 [11038],
 [],
 [],
 [],
 [],
 [],
 [],
 [2569],
 [2569],
 [],
 [2014],
 [],
 [1344],
 [],
 [857],
 [1941],
 [],
 [1257],
 [],
 [103],
 []]

In [16]:
# get max length to pad sequences and abd vocab size
VOCAB_SIZE = len(tokenizer.word_index) + 1
MAX_LENGTH = max([len(text.split()) for text in df_norm['text_norm']])
print(f'Vocab size: {VOCAB_SIZE}')
print(f'Max sequence length: {MAX_LENGTH}')

Vocab size: 19542
Max sequence length: 25


In [17]:
# we will set embedding dimension to be 100
EMBEDDING_DIM = 100

### Train Test Split

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X = df_norm['text_norm']
y = df_norm['target']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [21]:
X_train.head()

5797    accionempresa chinaûª stock market crash summ...
3232    feel engulf low selfimag take quiz httptcoykvs...
5145    mccainenl think spectacular look stonewal riot...
4911    magic citi mayhem kissimme adventur aug 5 2015...
253     us nation park servic tonto nation forest stop...
Name: text_norm, dtype: object

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
# tokenize text data
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

In [26]:
# apply padding
X_train_pad = pad_sequences(X_train_tokens, maxlen = MAX_LENGTH, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = MAX_LENGTH, padding = 'post')

In [27]:
type(X_train_pad)

numpy.ndarray

### Building Model