In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='whitegrid',palette='muted',font_scale=1.5)
from pylab import rcParams
rcParams['figure.figsize'] = 16,10 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Model

In [2]:
# Load the training tweets indexed by `id`; only the text and the target are used.
raw = pd.read_csv('train.csv', index_col='id').drop(columns=['keyword', 'location'])
data = raw['text']
y = raw['target'].values

### Text cleaning: removing URLs, HTML tags, emojis and punctuation

In [3]:
import re

#remove url
def remove_url(text):
    """Return ``text`` with ``http(s)://...`` links and ``www.`` addresses removed."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [4]:
# Strip links from every tweet.
data = data.apply(remove_url)

In [5]:
#remove html
def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) removed."""
    return re.sub(r'<.*?>', '', text)

In [6]:
# Strip HTML tags from every tweet.
data = data.apply(remove_html)

In [7]:
#remove emojis (regex pattern found online)
def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    The character class lists explicit Unicode symbol blocks. An earlier
    version used the single range U+24C2..U+1F251, which also covers CJK
    ideographs, Hangul, kana and many other ordinary letters, so non-Latin
    text was silently deleted along with the emoji. Only symbol blocks are
    matched now; regular text in any script is left untouched.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the matched symbols; surrounding whitespace is kept.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U000024C2"             # circled latin capital letter M
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r'', text)

In [8]:
# Strip emoji characters from every tweet.
data = data.apply(remove_emojis)

In [9]:
#remove punctuation
import string
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table mapping a character to None deletes that character.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

In [10]:
# Strip ASCII punctuation from every tweet.
data = data.apply(remove_punct)

### Formatting

In [11]:
# Upper bound on the vocabulary size handed to the Tokenizer (`num_words`).
MAX_VOCAB_SIZE = 25_000

In [12]:
# Fit the word index on the cleaned tweets, then encode them as integer id sequences.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data.values)
X_train = tokenizer.texts_to_sequences(data.values)

In [13]:
# Word -> integer id mapping built by the tokenizer, and its size.
wordidx = tokenizer.word_index
Vlength = len(wordidx)
Vlength  # number of unique tokens

18104

In [14]:
# Pad all sequences to a common length; the resulting width is reused for the
# model input and for padding the test set.
X_train = pad_sequences(X_train)
_, seq_length = X_train.shape
y_train = y.copy()
print(X_train.shape, y_train.shape)

(7613, 31) (7613,)


## Tweaking the RNN model
The goal is to get closer to an F1 score of 0.85

In [17]:
def train_model(D, X = X_train, y=y_train):
    """Build, compile and fit a stacked bidirectional-LSTM binary classifier.

    Architecture: ``Embedding(Vlength + 1, D)`` -> two ``Bidirectional(LSTM(D))``
    layers (both returning sequences, recurrent dropout 0.3) ->
    ``GlobalMaxPooling1D`` -> ``Dense(1, sigmoid)``.

    Parameters
    ----------
    D : int
        Embedding dimension; also used as the number of units of each LSTM.
    X : array-like
        Padded integer sequences of width ``seq_length``. The default is the
        ``X_train`` object that existed when this definition was executed.
    y : array-like
        Binary targets aligned with ``X``. The default is bound the same way.

    Returns
    -------
    keras.Model
        The fitted model. Training runs for at most 20 epochs with the last
        10% of the data held out for validation, and stops early once
        ``val_accuracy`` has not improved for 2 epochs.
    """
    i = Input(shape=(seq_length,))
    # +1 because Tokenizer word ids start at 1; index 0 is left for padding.
    x = Embedding(Vlength + 1, D)(i)
    x = keras.layers.Bidirectional(LSTM(D, return_sequences = True, recurrent_dropout = 0.3))(x)
    x = keras.layers.Bidirectional(LSTM(D, return_sequences = True, recurrent_dropout = 0.3))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(1, activation = 'sigmoid')(x)
    model = Model(i, x)

    # Pass the configured optimizer object to compile(); previously it was built
    # with the deprecated `lr=` keyword and never used (the string 'adam' was
    # given to compile instead). 0.001 is also Adam's default, so training is
    # unchanged.
    adam = keras.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

    # NOTE(review): with restore_best_weights=False the returned model keeps the
    # weights of the last epoch that ran, not those of the best val_accuracy
    # epoch. Consider switching to True.
    ES = keras.callbacks.EarlyStopping(monitor = 'val_accuracy', patience = 2, restore_best_weights = False)
    model.fit(X, y, epochs = 20, callbacks = [ES], validation_split = 0.1, verbose = 2)
    return model

In [16]:
# Baseline run with a 20-dimensional embedding / 20 LSTM units.
model = train_model(D=20)

Epoch 1/20
215/215 - 9s - loss: 0.5837 - accuracy: 0.6914 - val_loss: 0.4407 - val_accuracy: 0.7966
Epoch 2/20
215/215 - 8s - loss: 0.3413 - accuracy: 0.8672 - val_loss: 0.4669 - val_accuracy: 0.7730
Epoch 3/20
215/215 - 8s - loss: 0.2002 - accuracy: 0.9349 - val_loss: 0.5188 - val_accuracy: 0.7874


In [18]:
# Second run with the same size (D=20); this is the model used for the submissions below.
model1 = train_model(D=20)

Epoch 1/20
215/215 - 8s - loss: 0.5676 - accuracy: 0.6941 - val_loss: 0.4454 - val_accuracy: 0.7992
Epoch 2/20
215/215 - 8s - loss: 0.3282 - accuracy: 0.8702 - val_loss: 0.4481 - val_accuracy: 0.7913
Epoch 3/20
215/215 - 8s - loss: 0.1962 - accuracy: 0.9323 - val_loss: 0.4968 - val_accuracy: 0.7782


In [19]:
# Larger variant: 128-dimensional embedding / 128 LSTM units.
model2 = train_model(D=128)

Epoch 1/20
215/215 - 31s - loss: 0.5175 - accuracy: 0.7454 - val_loss: 0.4387 - val_accuracy: 0.8031
Epoch 2/20
215/215 - 30s - loss: 0.2832 - accuracy: 0.8908 - val_loss: 0.4905 - val_accuracy: 0.7441
Epoch 3/20
215/215 - 30s - loss: 0.1626 - accuracy: 0.9463 - val_loss: 0.5503 - val_accuracy: 0.7638


### Filtering the texts and getting rid of all the redundant stuff does not really help in this case.

..or does it?

In [21]:
def pipeline(data):
    """Run the text-cleaning steps over a Series of strings.

    Steps are applied element-wise in this order: URL removal, HTML tag
    removal, emoji removal, punctuation removal. Returns the cleaned Series.
    """
    cleaned = data
    for step in (remove_url, remove_html, remove_emojis, remove_punct):
        cleaned = cleaned.apply(step)
    return cleaned

In [22]:
# Load the test tweets indexed by `id`; only the text column is used.
raw_test = pd.read_csv('test.csv', index_col='id').drop(columns=['keyword', 'location'])
test_data = raw_test['text']

In [23]:
# Clean the test tweets with the same steps used for the training text.
test_data = test_data.pipe(pipeline)

In [24]:
# Encode with the tokenizer fitted on the training text and pad to the
# training sequence width so the model input shape matches.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data.values),
    maxlen=seq_length,
)

In [25]:
# Threshold the sigmoid outputs at 0.5 and write the submission CSV.
pred10 = (model1.predict(X_test).ravel() >= 0.5).astype(int).tolist()
submission10 = pd.DataFrame({'target': pred10}, index=raw_test.index)
submission10.to_csv('sub10.csv')

## Meh, 0.77321
what about threshold movement?

In [26]:
# Same model, but a stricter decision threshold of 0.6 for the positive class.
pred_exp = (model1.predict(X_test).ravel() >= 0.6).astype(int).tolist()
submission11 = pd.DataFrame({'target': pred_exp}, index=raw_test.index)
submission11.to_csv('sub11.csv')

## Slightly less meh, 0.77750