The only thing that differs from the RNN approach is actually the model so I just copy the preprocessing

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='whitegrid',palette='muted',font_scale=1.5)
from pylab import rcParams
rcParams['figure.figsize'] = 16,10 

In [2]:
raw = pd.read_csv('train.csv',index_col='id')
raw.head()

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
raw = raw.drop(['keyword','location'], axis=1)
raw.columns

Index(['text', 'target'], dtype='object')

In [4]:
data = raw['text']
y = raw['target'].values

In [5]:
from collections import Counter
Counter(y)

Counter({1: 3271, 0: 4342})

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.2)

In [7]:
MAX_VOCAB_SIZE = 25000

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [9]:
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data.values)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [10]:
wordidx = tokenizer.word_index
Vlength = len(wordidx)
Vlength #unique tokens

22700

In [11]:
X_train = pad_sequences(X_train) #keep pre-padding since LSTM would lose the information otherwise
seq_length = X_train.shape[1]

In [12]:
X_train.shape

(6090, 32)

In [13]:
X_test = pad_sequences(X_test,maxlen=seq_length)
X_test.shape

(1523, 32)

### Modelling

In [14]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

D = 20

In [15]:
def train_model(D=D, X = X_train, y=y_train):
    i = Input(shape=(seq_length,))
    x = Embedding(Vlength+1, D)(i)
    x = Conv1D(16, 1, activation = 'relu')(x)
    x = MaxPooling1D(1)(x)
    x = Conv1D(32, 1, activation = 'relu')(x)
    x = MaxPooling1D(1)(x)
    x = Conv1D(64, 1, activation = 'relu')(x)
    x = MaxPooling1D(1)(x)
    x = Conv1D(128, 1, activation = 'relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(1, activation = 'sigmoid')(x)

    model = Model(i,x)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = True)
    model.fit(X, y, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test), verbose = 2)
    return model

In [16]:
model_2 = train_model()

Epoch 1/20
191/191 - 2s - loss: 0.6489 - accuracy: 0.6107 - val_loss: 0.6171 - val_accuracy: 0.7032
Epoch 2/20
191/191 - 1s - loss: 0.5096 - accuracy: 0.7739 - val_loss: 0.5541 - val_accuracy: 0.7196
Epoch 3/20
191/191 - 1s - loss: 0.3543 - accuracy: 0.8534 - val_loss: 0.5736 - val_accuracy: 0.7511
Epoch 4/20
191/191 - 1s - loss: 0.2336 - accuracy: 0.9061 - val_loss: 0.6307 - val_accuracy: 0.7321
Epoch 5/20
191/191 - 1s - loss: 0.1653 - accuracy: 0.9360 - val_loss: 0.7159 - val_accuracy: 0.7262


In [17]:
def train_model_alt(D=D, X = X_train, y=y_train):
    
    model = keras.Sequential([
        Embedding(Vlength+1, D),
        Conv1D(16, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(32, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(64, 2, activation = 'relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation = 'sigmoid')
    ])

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = True)
    model.fit(X, y, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test), verbose = 2)
    return model

In [18]:
model_alt = train_model_alt()

Epoch 1/20
191/191 - 1s - loss: 0.6286 - accuracy: 0.6361 - val_loss: 0.4969 - val_accuracy: 0.7649
Epoch 2/20
191/191 - 1s - loss: 0.3814 - accuracy: 0.8445 - val_loss: 0.4811 - val_accuracy: 0.7761
Epoch 3/20
191/191 - 1s - loss: 0.2189 - accuracy: 0.9204 - val_loss: 0.5654 - val_accuracy: 0.7748
Epoch 4/20
191/191 - 1s - loss: 0.1399 - accuracy: 0.9521 - val_loss: 0.6609 - val_accuracy: 0.7433


In [19]:
def train_model_alt2(D=D, X = X_train, y=y_train):
    
    model = keras.Sequential([
        Embedding(Vlength+1, D),
        Conv1D(16, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(32, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(64, 2, activation = 'relu'),
        MaxPooling1D(1),
        Conv1D(128, 1, activation = 'relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation = 'sigmoid')
    ])

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = True)
    model.fit(X, y, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test), verbose = 2)
    return model

In [20]:
model_alt2 = train_model_alt2()

Epoch 1/20
191/191 - 1s - loss: 0.6296 - accuracy: 0.6425 - val_loss: 0.4978 - val_accuracy: 0.7735
Epoch 2/20
191/191 - 1s - loss: 0.3801 - accuracy: 0.8404 - val_loss: 0.4806 - val_accuracy: 0.7866
Epoch 3/20
191/191 - 1s - loss: 0.2001 - accuracy: 0.9319 - val_loss: 0.5808 - val_accuracy: 0.7794
Epoch 4/20
191/191 - 1s - loss: 0.1356 - accuracy: 0.9539 - val_loss: 0.6779 - val_accuracy: 0.7564


In [21]:
def train_model_alt3(D=D, X = X_train, y=y_train):
    
    model = keras.Sequential([
        Embedding(Vlength+1, D),
        Conv1D(32, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(64, 3, activation = 'relu'),
        MaxPooling1D(3),
        Conv1D(128, 2, activation = 'relu'),
        GlobalMaxPooling1D(),
        Dense(1, activation = 'sigmoid')
    ])

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = True)
    model.fit(X, y, epochs = 20, callbacks = [ES], validation_data = (X_test, y_test), verbose = 2)
    return model

In [22]:
model_alt3 = train_model_alt3()

Epoch 1/20
191/191 - 1s - loss: 0.5978 - accuracy: 0.6721 - val_loss: 0.4860 - val_accuracy: 0.7748
Epoch 2/20
191/191 - 1s - loss: 0.3497 - accuracy: 0.8565 - val_loss: 0.5489 - val_accuracy: 0.7741
Epoch 3/20
191/191 - 1s - loss: 0.1960 - accuracy: 0.9307 - val_loss: 0.6195 - val_accuracy: 0.7715


In [23]:
raw_test = pd.read_csv('test.csv', index_col = 'id')

In [24]:
test_texts = raw_test['text']
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=seq_length)

In [25]:
models = [model_2,model_alt,model_alt2,model_alt3]


In [29]:
pred3, pred4, pred5, pred6 = [1 if n >= 0.5 else 0 for n in model_2.predict(test_data)], [1 if n >= 0.5 else 0 for n in model_alt.predict(test_data)], [1 if n >= 0.5 else 0 for n in model_alt2.predict(test_data)], [1 if n >= 0.5 else 0 for n in model_alt3.predict(test_data)]

In [31]:
submission3 = pd.DataFrame(index = raw_test.index, data = pred3)
submission4 = pd.DataFrame(index = raw_test.index, data = pred4)
submission5 = pd.DataFrame(index = raw_test.index, data = pred5)
submission6 = pd.DataFrame(index = raw_test.index, data = pred6)

In [32]:
submission3.columns = ['target']
submission4.columns = ['target']
submission5.columns = ['target']
submission6.columns = ['target']

In [33]:
submission3.to_csv('sub3.csv')
submission4.to_csv('sub4.csv')
submission5.to_csv('sub5.csv')
submission6.to_csv('sub6.csv')

# Output of this notebook scored a public score of 0.76892 (sub5)

So far, a CNN approach does not seem to beat the LSTM approach.