In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
# Read the training and test CSV files into DataFrames, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)


In [None]:
# Pull the columns used below out of the DataFrames as plain Python lists:
# the question text (model input) and, for the training set, the target label.
train_input = train_data['question_text'].tolist()
train_label = train_data['target'].tolist()

# Only the question text is read from the test frame.
test_input = test_data['question_text'].tolist()

In [None]:
# Stop-word removal, step 1: make sure the NLTK stop-word corpus is present
# locally and load its English word list into `stop`.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english')

def remove_stop_words(x, stop_words=None):
    """Return ``x`` with stop words removed.

    The text is split on whitespace, every token whose lower-cased form is a
    stop word is dropped, and the remaining tokens are joined with single
    spaces. Compared with replacing the substring ``" word "``, this also
    removes stop words at the very start or end of the text, capitalised
    ones ("The"), and back-to-back repeats ("the the"), and it scans the
    text once instead of once per stop word.

    Tokens are matched whole, so a stop word with punctuation attached
    (e.g. "the,") is kept.

    Args:
        x: The text to filter (a ``str``).
        stop_words: Optional iterable of lower-case stop words. When omitted,
            the module-level ``stop`` list is used.

    Returns:
        The filtered text; an empty string if no tokens remain.
    """
    # A frozenset gives constant-time membership checks per token.
    blocked = frozenset(stop if stop_words is None else stop_words)
    kept = [token for token in x.split() if token.lower() not in blocked]
    return " ".join(kept)

# Run every training and test question through the stop-word filter.
train_input_rsw = [remove_stop_words(question) for question in train_input]
test_input_rsw = [remove_stop_words(question) for question in test_input]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
max_features = 100_000  # passed to the Tokenizer as num_words
embed_size = 300        # embedding vector width
max_length = 60         # every sequence is padded/truncated to this length

In [None]:

# Open the pretrained embedding text file for line-by-line reading.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
embeddings = open("./data/embeddings/glove.840B.300d/glove.840B.300d.txt", 'r', encoding='utf-8')


In [None]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 vector.

    Args:
        word: The first field of an embedding line (the token).
        *arr: The remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D ``float32``
        NumPy array built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse every line of the already-open embeddings file into a
# {token: float32 vector} dictionary. The `with` block closes the file handle
# once it has been consumed, so the descriptor is not left open for the rest
# of the session.
with embeddings:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embeddings)

In [None]:
# Build the vocabulary from the stop-word-filtered training questions.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_input_rsw)

word_index = tokenizer.word_index
n_words = min(max_features, len(word_index))

# One row per word index (plus one extra row so index n_words is addressable).
# The width comes from the shared `embed_size` constant rather than a repeated
# literal. Rows for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((n_words + 1, embed_size))

for word, i in word_index.items():
    # Skip indices at or beyond the vocabulary cap passed to the Tokenizer.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Training questions: map words to integer ids, then pad/truncate at the end
# so that every row has exactly `max_length` entries.
sequences = tokenizer.texts_to_sequences(train_input_rsw)
train_input_padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(train_input_padded.shape)

# Test questions: same tokenizer and padding settings.
# `sequences` is rebound here and ends up holding the test id lists.
sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(test_input_padded.shape)

(1306122, 60)
(375806, 60)


In [None]:
# Hold out 10% of the padded training data as a validation ("cv") split.
# The fixed random_state keeps the split the same across runs.
train_text, cv_text, train_target, cv_target = train_test_split(
    train_input_padded,
    train_label,
    test_size=0.1,
    random_state=2,
)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding,GRU,Dropout,Conv1D,MaxPooling1D,Dense

In [None]:
# Binary classifier: frozen pretrained embeddings -> GRU -> Conv1D ->
# max-pooling -> GRU -> sigmoid output.
gru_model = Sequential()

# The embedding width is read from the matrix so the layer always matches the
# pretrained weights it is initialised with; the weights are not trained.
gru_model.add(Embedding(n_words + 1, embedding_matrix.shape[1], input_length=max_length,
                        weights=[embedding_matrix], trainable=False))
gru_model.add(GRU(256, return_sequences=True))
gru_model.add(Dropout(0.2))
gru_model.add(Conv1D(100, 5, activation='relu'))
gru_model.add(MaxPooling1D(pool_size=4))

# Collapse the pooled sequence to a single vector per example. Without this
# the tensor is still (batch, steps, channels) when it reaches the Dense
# layer, which would then emit one probability per time step instead of one
# per question and would not line up with the scalar labels.
gru_model.add(GRU(128))
gru_model.add(Dropout(0.4))
gru_model.add(Dense(1, activation='sigmoid'))

gru_model.summary()

gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 300)           30000300  
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 512)           1140736   
_________________________________________________________________
dropout (Dropout)            (None, 60, 512)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 56, 100)           256100    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 14, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

In [None]:
import gc

# The word -> vector dictionary is no longer referenced by the cells below
# (the vectors were copied into `embedding_matrix`), so drop it and force a
# garbage-collection pass.
del embeddings_index
gc.collect()

7569

In [None]:
# Train for 5 epochs in batches of 1024, evaluating on the held-out split
# after each epoch. Inputs and targets are converted to NumPy arrays first.
history = gru_model.fit(
    np.array(train_text),
    np.array(train_target),
    epochs=5,
    batch_size=1024,
    validation_data=(np.array(cv_text), np.array(cv_target)),
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from sklearn.metrics import f1_score

# Model outputs for the held-out split.
cv_predictions = gru_model.predict(cv_text, batch_size=512)

# Sweep decision cut-offs from 0.10 to 0.50 in steps of 0.01 and record a
# [cut-off, F1] pair for each one.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    result = f1_score(cv_target, (cv_predictions > thresh).astype(int))
    thresholds.append([thresh, result])
    print("F1 score at threshold {} is {}".format(thresh, result))

# Order the pairs by F1, best first.
thresholds.sort(key=lambda pair: pair[1], reverse=True)
# NOTE(review): best_thresh is the whole [cut-off, F1] pair, not the bare
# cut-off value — confirm this is what downstream cells expect.
best_thresh = thresholds[0]
print("Best value {}".format(best_thresh))

F1 score at threshold 0.1 is 0.6103786567230435
F1 score at threshold 0.11 is 0.6148435386418974
F1 score at threshold 0.12 is 0.6206266557047593
F1 score at threshold 0.13 is 0.6235962844863441
F1 score at threshold 0.14 is 0.6268405553218342
F1 score at threshold 0.15 is 0.6304245060035927
F1 score at threshold 0.16 is 0.6340110021525951
F1 score at threshold 0.17 is 0.6373084545850052
F1 score at threshold 0.18 is 0.6400351614005958
F1 score at threshold 0.19 is 0.6438288643533122
F1 score at threshold 0.2 is 0.6468715806226997
F1 score at threshold 0.21 is 0.6489879965848024
F1 score at threshold 0.22 is 0.6503801317790168
F1 score at threshold 0.23 is 0.6517751934013013
F1 score at threshold 0.24 is 0.6538600942370424
F1 score at threshold 0.25 is 0.6555933979565104
F1 score at threshold 0.26 is 0.6562466885662817
F1 score at threshold 0.27 is 0.6580017115960635
F1 score at threshold 0.28 is 0.6590626520352453
F1 score at threshold 0.29 is 0.6603403141361256
F1 score at threshold 