# Natural Language Processing with Disaster Tweets

Required imports

In [13]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

Load the data from the CSV file and split it into training and test sets

In [14]:
# Read the tweets file; the 'text' column supplies the model inputs and the
# 'target' column supplies the labels.
df = pd.read_csv('data/train.csv')
raw_text, labels_train = df['text'].values, df['target'].values

# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    raw_text,
    labels_train,
    test_size=0.25,
    random_state=1000,
)

Tokenize the tweets and encode them as bag-of-words count vectors

In [15]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

Model 1: Logistic Regression

In [16]:
# Baseline model: logistic regression on the bag-of-words counts.
# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# score() reports mean accuracy on the held-out split.
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.803046218487395


Model 2: Neural network with one hidden layer (10 ReLU units)

In [17]:
input_dim = X_train.shape[1]  # Number of features (size of the vocabulary)

# Small feed-forward classifier: one 10-unit ReLU hidden layer followed by a
# single sigmoid unit that outputs the probability of the positive class.
# The input shape is declared with an explicit Input object rather than the
# legacy `input_dim=` keyword on the first Dense layer, which current Keras
# versions warn about; the resulting architecture is the same.
model = Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so it is reported by fit() and evaluate().
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 10)                177010    
                                                                 
 dense_5 (Dense)             (None, 1)                 11        
                                                                 
Total params: 177,021
Trainable params: 177,021
Non-trainable params: 0
_________________________________________________________________


Train model and print results

In [11]:
# Train for 10 epochs in mini-batches of 10 samples. The held-out split is
# passed as validation data, so its loss/accuracy are reported after every
# epoch and recorded in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=10,
    validation_data=(X_test, y_test),
    verbose=1,
)

# Final metrics on both splits, evaluated silently (verbose=0).
# A large gap between the two numbers indicates overfitting.
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 0.9953
Testing Accuracy:  0.7883
