In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='whitegrid',palette='muted',font_scale=1.5)
from pylab import rcParams
rcParams['figure.figsize'] = 16,10 

In [2]:
raw = pd.read_csv('train.csv',index_col='id')
raw.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
raw = raw.drop(['keyword','location'], axis=1)
raw.columns

Index(['text', 'target'], dtype='object')

In [4]:
data = raw['text']
y = raw['target'].values

In [5]:
from collections import Counter
Counter(y)

Counter({1: 3271, 0: 4342})

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.2)

In [10]:
MAX_VOCAB_SIZE = 25000

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [12]:
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data.values)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [13]:
wordidx = tokenizer.word_index
Vlength = len(wordidx)
Vlength #unique tokens

22700

In [14]:
X_train = pad_sequences(X_train) #keep pre-padding since LSTM would lose the information otherwise
seq_length = X_train.shape[1]

In [15]:
X_train.shape

(6090, 33)

In [16]:
X_test = pad_sequences(X_test,maxlen=seq_length)
X_test.shape

(1523, 33)

In [40]:
X_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,   13,  820,    8,
       1977,   10,   34, 7753, 1436, 7754,  724,  224,    4,  802,  400])

In [42]:
y_train.shape

(6090,)

### Modelling

In [17]:
#whats the proportion of positives in the set actually? Do I use accuracy or ROC/AUC? 
np.count_nonzero(y==1)/len(y)

0.4296597924602653

In [18]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Model

D = 20 #Embedding dimensionality
#M = 15 # hidden state dimensionality

In [22]:
i = Input(shape=(seq_length,))
x = Embedding(Vlength+1,D)(i)
x = LSTM(D, return_sequences = True)(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation = 'sigmoid')(x)

model = Model(i,x)
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [24]:
model.summary()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 33)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 33, 20)            454020    
_________________________________________________________________
lstm_1 (LSTM)                (None, 33, 20)            3280      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 457,321
Trainable params: 457,321
Non-trainable params: 0
_________________________________________________________________


In [25]:
ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 1, restore_best_weights = True)
history = model.fit(X_train, y_train, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test),
                    batch_size = 32, verbose = 2)  #callbacks = [ES]

Epoch 1/20
191/191 - 2s - loss: 0.6184 - accuracy: 0.6506 - val_loss: 0.4968 - val_accuracy: 0.7814
Epoch 2/20
191/191 - 2s - loss: 0.3869 - accuracy: 0.8453 - val_loss: 0.4407 - val_accuracy: 0.8168
Epoch 3/20
191/191 - 3s - loss: 0.2386 - accuracy: 0.9200 - val_loss: 0.4723 - val_accuracy: 0.8004


In [26]:
#well that's not very awesome, let's try adding more dimensions maybe?

In [27]:
def train_model(D,M, X = X_train, y=y_train):
    i = Input(shape=(seq_length,))
    x = Embedding(Vlength+1,D)(i)
    x = LSTM(D, return_sequences = True)(x) #M
    x = GlobalMaxPooling1D()(x)
    x = Dense(1, activation = 'sigmoid')(x)

    model = Model(i,x)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = True)
    history = model.fit(X, y, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test), verbose = 2)
    return model

In [28]:
model_2= train_model(40,30)

Epoch 1/20
191/191 - 3s - loss: 0.5924 - accuracy: 0.6808 - val_loss: 0.4523 - val_accuracy: 0.8056
Epoch 2/20
191/191 - 3s - loss: 0.3453 - accuracy: 0.8612 - val_loss: 0.4461 - val_accuracy: 0.8056
Epoch 3/20
191/191 - 3s - loss: 0.2006 - accuracy: 0.9297 - val_loss: 0.5544 - val_accuracy: 0.8017


In [44]:
#okay, around 80% possible accuracy, let's put it to the test alright!
#submission must be in format id, target
raw_test = pd.read_csv('test.csv', index_col = 'id')
raw_test.head()


Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
9,,,Apocalypse lighting. #Spokane #wildfires
11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [45]:
test_texts = raw_test['text']

In [46]:
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [47]:
test_data = pad_sequences(test_sequences, maxlen=seq_length)

In [51]:
pred_1, pred_2  = [1 if n >= 0.5 else 0 for n in model.predict(test_data)], [1 if n >= 0.5 else 0 for n in model_2.predict(test_data)]

In [52]:
submission1 = pd.DataFrame(index = raw_test.index, data = pred_1)

In [54]:
submission1.columns = ['target']

In [55]:
submission2 = pd.DataFrame(index = raw_test.index, data = pred_2)
submission2.columns = ['target']

In [60]:
submission1.to_csv('sub1.csv')
submission2.to_csv('sub2.csv')

# Output of this notebook scored a public score of 0.79374 (sub1)

With this score I'm somewhere in the middle of the scoreboard

I'm still not quite sure this is the right approach