In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='whitegrid',palette='muted',font_scale=1.5)
from pylab import rcParams
rcParams['figure.figsize'] = 16,10 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

In [2]:
# Load the training tweets keyed by their id. The 'keyword' and 'location'
# columns are discarded; only the tweet text and the 'target' label are used.
raw = pd.read_csv('train.csv', index_col='id').drop(columns=['keyword', 'location'])
data = raw['text']
y = raw['target'].to_numpy()

### Cleaning the text: removing URLs, HTML tags, emojis and punctuation

In [3]:
import re

def remove_url(text):
    """Return `text` with every http://, https:// or www. link deleted.

    A link is matched from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [4]:
# Strip links from every tweet.
data = data.apply(remove_url)

In [5]:
def remove_html(text):
    """Return `text` with every `<...>` tag-like span deleted.

    The match is non-greedy (shortest `<` ... `>` pair) and, because `.` does
    not match a newline here, a span broken across lines is left untouched.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

In [6]:
# Strip HTML tags from every tweet.
data = data.apply(remove_html)

In [7]:
# Code points treated as emoji / pictographs. Every entry is a narrow Unicode
# block: a single catch-all span such as U+24C2..U+1F251 would also swallow
# CJK ideographs, kana, Hangul and other ordinary text that sits between those
# two code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U000024C2"             # circled M
    "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # misc symbols and arrows (stars, squares)
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji that live in CJK blocks
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only the symbol blocks listed in `_EMOJI_PATTERN` are removed; letters of
    any script (Latin, CJK, Hangul, ...) are kept. Runs of consecutive emoji
    are removed in one go and nothing is inserted in their place.
    """
    return _EMOJI_PATTERN.sub('', text)

In [8]:
# Strip emojis from every tweet.
data = data.apply(remove_emojis)

In [9]:
#remove punctuation
import string
def remove_punct(text):
    """Return `text` with every character of `string.punctuation` removed."""
    # Mapping a code point to None tells str.translate to delete it.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

In [10]:
# Strip punctuation from every tweet.
data = data.apply(remove_punct)

### Formatting

In [11]:
# Cap on the vocabulary size, passed to the Tokenizer as `num_words`.
MAX_VOCAB_SIZE = 25000

In [12]:
# Fit the tokenizer on the cleaned training tweets, then encode each tweet as
# a sequence of integer token ids.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data.to_numpy())
X_train = tokenizer.texts_to_sequences(data.to_numpy())

In [13]:
# Token -> integer-id mapping learned by the tokenizer.
wordidx = tokenizer.word_index
# Number of unique tokens; displayed as the cell output and used later to size
# the embedding layer.
Vlength = len(wordidx)
Vlength

18104

In [14]:
# Labels are copied so later changes to y_train cannot touch `y`.
y_train = y.copy()
# Pad the encoded tweets into a single 2-D array (shape printed below).
X_train = pad_sequences(X_train)
# Second dimension of the padded array; reused for the model input shape and
# for padding the test set.
seq_length = X_train.shape[1]
print(X_train.shape, y_train.shape)

(7613, 31) (7613,)


## Tweaking the CNN model
The goal is to get closer to an F1 score of 0.85.

In [30]:
def train_model(D, X=None, y=None, epochs=20, patience=2, validation_split=0.1):
    """Build, compile and fit the 1-D CNN tweet classifier.

    Architecture: Embedding -> Conv1D(16) -> MaxPooling1D(2) -> Conv1D(32)
    -> MaxPooling1D(2) -> Conv1D(64) -> GlobalMaxPooling1D -> Dense(1, sigmoid).
    All convolutions use kernel size 2 and ReLU. The input length and the
    vocabulary size are read from the notebook-level `seq_length` and
    `Vlength`.

    Parameters
    ----------
    D : int
        Embedding dimension.
    X, y : array-like, optional
        Training inputs and labels. When omitted, the notebook's current
        `X_train` / `y_train` are looked up at call time, so re-running the
        preprocessing cells is picked up without redefining this function.
    epochs : int, default 20
        Maximum number of training epochs.
    patience : int, default 2
        Epochs without `val_accuracy` improvement before training stops.
    validation_split : float, default 0.1
        Fraction of the training data held out for validation.

    Returns
    -------
    The fitted Keras `Model`.
    """
    # Late-bind the defaults: putting `X_train` / `y_train` in the signature
    # would freeze whatever those names pointed to when this cell was run.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    inputs = Input(shape=(seq_length,))
    # One row more than the vocabulary size: Keras Tokenizer ids start at 1,
    # and index 0 is the padding value.
    x = Embedding(Vlength + 1, D)(inputs)
    x = Conv1D(16, 2, activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(32, 2, activation='relu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(64, 2, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop once validation accuracy stalls and roll back to the best epoch.
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_accuracy', patience=patience, restore_best_weights=True)
    # NOTE: Keras takes the validation split from the *last* rows of X / y,
    # before any shuffling, so the row order of the data matters here.
    model.fit(X, y, epochs=epochs, callbacks=[early_stop],
              validation_split=validation_split, verbose=2)
    return model

In [31]:
# Baseline: 20-dimensional embeddings.
model = train_model(D=20)

Epoch 1/20
215/215 - 1s - loss: 0.6058 - accuracy: 0.6659 - val_loss: 0.4971 - val_accuracy: 0.7585
Epoch 2/20
215/215 - 1s - loss: 0.3419 - accuracy: 0.8575 - val_loss: 0.4659 - val_accuracy: 0.7822
Epoch 3/20
215/215 - 1s - loss: 0.1776 - accuracy: 0.9365 - val_loss: 0.5212 - val_accuracy: 0.7835
Epoch 4/20
215/215 - 1s - loss: 0.1074 - accuracy: 0.9641 - val_loss: 0.6059 - val_accuracy: 0.7533
Epoch 5/20
215/215 - 1s - loss: 0.0785 - accuracy: 0.9746 - val_loss: 0.6339 - val_accuracy: 0.7717


In [32]:
# Same network with 128-dimensional embeddings.
model2 = train_model(D=128)

Epoch 1/20
215/215 - 6s - loss: 0.5952 - accuracy: 0.6706 - val_loss: 0.4656 - val_accuracy: 0.7808
Epoch 2/20
215/215 - 6s - loss: 0.3298 - accuracy: 0.8675 - val_loss: 0.4503 - val_accuracy: 0.7992
Epoch 3/20
215/215 - 6s - loss: 0.1587 - accuracy: 0.9441 - val_loss: 0.5275 - val_accuracy: 0.7808
Epoch 4/20
215/215 - 6s - loss: 0.0889 - accuracy: 0.9705 - val_loss: 0.5912 - val_accuracy: 0.7533


### Cleaning the texts (removing URLs, HTML, emojis and punctuation) improves the CNN by about 0.03

Will this lead to better performance with the test set?

In [33]:
def pipeline(data):
    """Apply the full text-cleaning chain to a Series of tweets.

    The steps run in this order: URL removal, HTML-tag removal, emoji removal,
    punctuation removal. The cleaned Series is returned.
    """
    cleaned = data
    for step in (remove_url, remove_html, remove_emojis, remove_punct):
        cleaned = cleaned.apply(step)
    return cleaned

In [39]:
# Load the test tweets the same way as the training set: keyed by id, with
# the 'keyword' and 'location' columns dropped.
raw_test = pd.read_csv('test.csv', index_col='id').drop(columns=['keyword', 'location'])
test_data = raw_test['text']

In [40]:
# Run the test tweets through the same cleaning chain as the training tweets.
test_data = test_data.pipe(pipeline)

In [41]:
# Encode the test tweets with the tokenizer fitted on the training tweets and
# pad them to the training sequence length.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data.to_numpy()),
    maxlen=seq_length,
)

In [42]:
# Binarise the sigmoid outputs: a predicted probability >= 0.5 becomes class 1,
# anything else class 0. The results are plain lists of ints.
pred7 = (model.predict(X_test).ravel() >= 0.5).astype(int).tolist()
pred8 = (model2.predict(X_test).ravel() >= 0.5).astype(int).tolist()

In [43]:
# One single-column ('target') frame per model, indexed by the test tweet ids,
# each written out as a CSV file.
submission7 = pd.DataFrame({'target': pred7}, index=raw_test.index)
submission8 = pd.DataFrame({'target': pred8}, index=raw_test.index)
submission7.to_csv('sub7.csv')
submission8.to_csv('sub8.csv')

## 0.79589!
Scored by sub8

Definitely an improvement for CNN

### Small experiment with thresholds

In [44]:
# Same as the model2 predictions above, but with a stricter 0.6 cut-off.
pred_exp = (model2.predict(X_test).ravel() >= 0.6).astype(int).tolist()
submission9 = pd.DataFrame({'target': pred_exp}, index=raw_test.index)
submission9.to_csv('sub9.csv')

### 0.79160 
Fooling around with the threshold value might be a way to improve this particular model...