In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the labelled training questions and the unlabelled test questions
# (in that order) into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)


In [None]:
# Pull the raw question strings and the training labels out of the
# DataFrames as plain Python lists.
train_input = train_data['question_text'].tolist()
train_label = train_data['target'].tolist()

test_input = test_data['question_text'].tolist()

In [None]:
# Remove stop words: load NLTK's English stop-word list into the
# module-level name `stop`, which remove_stop_words() reads.
import nltk
# Fetch the 'stopwords' corpus so the corpus import below can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop=stopwords.words('english') 

def remove_stop_words(x, stop_words=None):
    """Return ``x`` with every stop word removed.

    The text is split on single spaces and each token is dropped when its
    lower-cased form is a stop word. Working on tokens (instead of
    searching for ``" word "`` substrings) also removes stop words at the
    very start or end of the text, repeated adjacent stop words, and
    capitalised stop words such as a leading "What" or "The". Spacing
    between the surviving tokens is left as it was.

    Args:
        x: The text to filter.
        stop_words: Optional iterable of lower-case stop words. When
            omitted, the module-level ``stop`` list is used.

    Returns:
        The filtered text, with the remaining tokens joined by single
        spaces.
    """
    if stop_words is None:
        stop_words = stop
    # Hash the words once per call so each token is an O(1) lookup.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)
    kept = [token for token in x.split(" ") if token.lower() not in stop_words]
    return " ".join(kept)

# Apply the stop-word filter to every training and test question.
train_input_rsw = [remove_stop_words(question) for question in train_input]
test_input_rsw = [remove_stop_words(question) for question in test_input]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Vocabulary cap: passed to the Tokenizer and used to bound the embedding matrix.
max_features = 100_000
# Width of each word vector fed to the Embedding layer.
embed_size = 300
# Every question is padded or truncated to this many token ids.
max_length = 60

In [None]:

# Open the GloVe vectors as UTF-8 text. The encoding is given explicitly so
# decoding does not depend on the platform's default locale. The handle is
# consumed line by line when the embeddings index is built.
embeddings=open("./data/embeddings/glove.840B.300d/glove.840B.300d.txt", 'r', encoding='utf-8')


In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    ``arr`` holds the vector components (numbers or numeric strings); they
    are packed into a one-dimensional float32 NumPy array.

    Returns:
        A ``(word, vector)`` tuple.
    """
    vector = np.array(arr, dtype='float32')
    return word, vector
# Parse every line of the open embeddings file into a word -> float32 vector
# lookup. The handle is closed afterwards, even if a line fails to parse, so
# the file descriptor is not leaked for the rest of the session. Re-running
# this cell without reopening the file now raises instead of silently
# producing an empty dict from an exhausted handle.
try:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embeddings)
finally:
    embeddings.close()

In [None]:
# Sanity check: print the type of one component of the vector stored for 'this'.
print(type(embeddings_index['this'][1]))

<class 'numpy.float32'>


In [None]:
# Fit the vocabulary on the stop-word-filtered training questions only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_input_rsw)

word_index = tokenizer.word_index
n_words = min(max_features, len(word_index))

# One row per word index; rows without a pretrained vector (including
# row 0, which no word in word_index maps to) stay all-zero. The width comes
# from embed_size so it cannot drift from the Embedding layer, and float32
# matches the dtype of the loaded vectors instead of allocating float64.
embedding_matrix = np.zeros((n_words + 1, embed_size), dtype='float32')

for word, i in word_index.items():
    # Indices at or above the vocabulary cap get no row.
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Training questions -> integer id sequences -> fixed-length rows.
# Padding and truncation both happen at the end of each sequence.
sequences = tokenizer.texts_to_sequences(train_input_rsw)
train_input_padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(train_input_padded.shape)

# Same treatment for the test questions, using the tokenizer fitted on train.
sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
print(test_input_padded.shape)

(1306122, 60)
(375806, 60)


In [None]:
# Hold out 1% of the padded training questions (and their labels) for
# validation; random_state is fixed at 2.
# NOTE(review): no `stratify` argument is passed, so the split is not stratified by label.
train_text, cv_text, train_target, cv_target = train_test_split(
    train_input_padded,
    train_label,
    test_size=0.01,
    random_state=2,
)

In [None]:
# Import the model pieces from tensorflow.keras, the same Keras namespace
# the Tokenizer and pad_sequences imports use, so the notebook does not mix
# two Keras packages.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, MaxPooling1D

In [None]:
# NOTE(review): embedding_dim is not referenced by the model below, which
# takes its embedding width from embed_size.
embedding_dim = 100

# 1-D CNN over frozen pretrained embeddings: two convolution stages
# followed by a small dense head with a single sigmoid output.
cnn = Sequential([
    Embedding(
        n_words+1,
        embed_size,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the CNN.
cnn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 300)           30000300  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 56, 128)           192128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 11, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 7, 128)            82048     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [None]:
# embeddings_index is only read while filling embedding_matrix, so drop the
# reference here and run a garbage-collection pass (presumably to free
# memory before training). gc.collect() is the cell's last expression, so
# its return value is what the cell displays.
del embeddings_index
import gc
gc.collect()

748

In [None]:
# Train for 5 epochs, evaluating on the held-out split after each epoch.
# np.asarray turns the label lists into arrays but passes through inputs
# that are already ndarrays (the padded sequences) without making a second
# in-memory copy, which np.array would do.
history = cnn.fit(
    np.asarray(train_text),
    np.asarray(train_target),
    epochs=5,
    validation_data=(np.asarray(cv_text), np.asarray(cv_target)),
    batch_size=1024,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from sklearn.metrics import f1_score

# Model outputs for the validation questions.
cv_predictions = cnn.predict(cv_text, batch_size=512)

# Score every candidate cut-off from 0.10 to 0.50 in steps of 0.01,
# recording [threshold, F1] pairs.
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    result = f1_score(cv_target, (cv_predictions>thresh).astype(int))
    thresholds.append([thresh, result])
    print("F1 score at threshold {} is {}".format(thresh, result))

# Order the pairs by F1, best first.
thresholds = sorted(thresholds, key=lambda x: x[1], reverse=True)
# NOTE(review): best_thresh is the whole [threshold, f1] pair, not the bare threshold value.
best_thresh = thresholds[0]
print("Best value {}".format(best_thresh))

F1 score at threshold 0.1 is 0.5588499550763703
F1 score at threshold 0.11 is 0.5660550458715596
F1 score at threshold 0.12 is 0.5703564727954972
F1 score at threshold 0.13 is 0.5780952380952381
F1 score at threshold 0.14 is 0.5852713178294574
F1 score at threshold 0.15 is 0.5899705014749262
F1 score at threshold 0.16 is 0.5949177877428999
F1 score at threshold 0.17 is 0.5969696969696969
F1 score at threshold 0.18 is 0.6003078501795792
F1 score at threshold 0.19 is 0.6020725388601037
F1 score at threshold 0.2 is 0.6018907563025211
F1 score at threshold 0.21 is 0.6055437100213221
F1 score at threshold 0.22 is 0.6066810344827586
F1 score at threshold 0.23 is 0.6095030038230476
F1 score at threshold 0.24 is 0.6145313366611204
F1 score at threshold 0.25 is 0.6167597765363128
F1 score at threshold 0.26 is 0.6148610323312536
F1 score at threshold 0.27 is 0.6174112256586483
F1 score at threshold 0.28 is 0.6176980913823019
F1 score at threshold 0.29 is 0.6214160327677004
F1 score at threshold 