Each sample in the train and test set has the following information:

The text of a tweet
A keyword from that tweet (although this may be blank!)
The location the tweet was sent from (may also be blank)

Task is to predict whether a given tweet is about a real disaster or not. If so, predict a 1. If not, predict a 0.

Importing the Libraries

In [76]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

Importing the train dataset

In [77]:
dataset = pd.read_csv("train.csv")
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Cleaning the text

In [5]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range (0,7613):
  tweet = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
  tweet = tweet.lower()
  tweet = tweet.split()
  ps = PorterStemmer()
  tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
  tweet = ' '.join(tweet)
  corpus.append(tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Splitting the corpus into Train and Test sets

In [44]:
from sklearn.model_selection import train_test_split
sentence_train, sentence_test, y_train, y_test = train_test_split(corpus, dataset.iloc[:,-1].values, test_size=0.5)

Vectorize the text corpus into a list of integers

In [46]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentence_train)
X_train = tokenizer.texts_to_sequences(sentence_train)
X_test = tokenizer.texts_to_sequences(sentence_test)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
print(sentence_train[2])
print(X_train[2])

11734
fedporn feel pain survivor look back period absurd human histori satir indistinguish realiti
[3736, 84, 674, 234, 37, 49, 2513, 3737, 402, 496, 3738, 3739, 1935]


Pad zeros with Keras

In [47]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [48]:
print(X_train[0, :])

[282   4  64  26   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


Build the model

In [73]:
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=maxlen))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_75"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_75 (Embedding)     (None, 100, 50)           586700    
_________________________________________________________________
global_max_pooling1d_75 (Glo (None, 50)                0         
_________________________________________________________________
dense_149 (Dense)            (None, 10)                510       
_________________________________________________________________
dense_150 (Dense)            (None, 1)                 11        
Total params: 587,221
Trainable params: 587,221
Non-trainable params: 0
_________________________________________________________________


Fit the model

In [74]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=100)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluate it's performance

In [75]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9609
Testing Accuracy:  0.7993


In [57]:
test_dataset = pd.read_csv("test.csv")
id = test_dataset.iloc[:,0].values

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
test_corpus = []
for i in range (0,3263):
  tweet = re.sub('[^a-zA-Z]', ' ', test_dataset['text'][i])
  tweet = tweet.lower()
  tweet = tweet.split()
  ps = PorterStemmer()
  tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
  tweet = ' '.join(tweet)
  test_corpus.append(tweet)

val_test = tokenizer.texts_to_sequences(test_corpus)
val_test = pad_sequences(val_test, padding='post', maxlen=maxlen)
result = (model.predict(val_test))
print(result)
y_pred = []
for i in result:
  y_pred.append(int(round(i[0])))
y_pred = np.array(y_pred)
print(y_pred)
rows = (np.concatenate((id.reshape(len(id),1), y_pred.reshape(len(y_pred), 1)),1))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[[0.8710285 ]
 [0.18318215]
 [0.9353117 ]
 ...
 [0.8419656 ]
 [0.9386988 ]
 [0.5403851 ]]
[1 0 1 ... 1 1 1]


In [59]:
print(len(rows))

3263


In [60]:
import csv
fields = ['id', 'target']
filename = "./submission.csv"
# writing to csv file  
with open(filename, 'w') as csvfile:  
    # creating a csv writer object  
    csvwriter = csv.writer(csvfile)  
        
    # writing the fields  
    csvwriter.writerow(fields)  
        
    # writing the data rows  
    csvwriter.writerows(rows) 

In [61]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [68]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 10
embedding_dim = 50
maxlen = 100

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                kernel_size=[3, 5, 7],
                vocab_size=[vocab_size],
                embedding_dim=[embedding_dim],
                maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                      epochs=epochs, batch_size=10,
                      verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                        cv=4, verbose=1, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 10.1min finished


In [69]:
print( grid_result.best_score_, grid_result.best_params_, test_accuracy)

0.7506560981273651 {'vocab_size': 11734, 'num_filters': 32, 'maxlen': 100, 'kernel_size': 7, 'embedding_dim': 50} 0.7743630409240723
