In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
# Read the pre-split train / test CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/splited_train.csv', './data/splited_test.csv')
)


In [3]:
# Materialise the 'question_text' and 'target' columns of the training frame
# as plain Python lists.
train_input, train_label = (
    list(train_data[column]) for column in ('question_text', 'target')
)

# Only the text column is extracted from the test frame at this point.
test_input = list(test_data['question_text'])

In [4]:

# Fetch NLTK's English stop-word list; `stop` is the list that
# remove_stop_words() iterates over.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop = stopwords.words('english')

def remove_stop_words(x, stop_words=None):
    """Return ``x`` with every stop word removed.

    The text is split on single spaces and each token that exactly
    (case-sensitively) equals a stop word is dropped. Working token by token
    means stop words at the very start or end of the text, and the same stop
    word repeated back to back, are removed too; a space-delimited substring
    replacement misses both. All remaining tokens, including the empty ones
    produced by repeated spaces, are kept and re-joined with single spaces.

    Args:
        x: Text to filter.
        stop_words: Optional iterable of words to drop. Defaults to the
            module-level ``stop`` list.

    Returns:
        The filtered text.
    """
    if stop_words is None:
        stop_words = stop
    # Set membership keeps the per-token check O(1).
    blocked = frozenset(stop_words)
    return " ".join(token for token in x.split(" ") if token not in blocked)

# Stop-word removal is currently switched off: the texts are passed through
# unchanged. To enable it, build the lists with
#   list(map(remove_stop_words, train_input))
#   list(map(remove_stop_words, test_input))
train_input_rsw, test_input_rsw = train_input, test_input

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Vocabulary cap handed to the Tokenizer as num_words.
max_features = 100_000
# Width of each embedding vector (number of columns in the embedding matrix).
embed_size = 300
# Length every token sequence is padded / truncated to.
max_length = 70

In [6]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: The token; returned unchanged.
        *arr: The remaining fields, each convertible to a float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        holding ``arr`` in order.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Build the word -> vector lookup from the GloVe text file.
embeddings_index = {}
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default encoding.
with open("./data/embeddings/glove.840B.300d/glove.840B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Split from the right so the last `embed_size` fields are always the
        # coefficients, even when the token itself contains spaces.
        word, coefs = get_coefs(*line.rstrip('\n').rsplit(" ", embed_size))
        embeddings_index[word] = coefs

In [7]:
# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_input_rsw)
word_index = tokenizer.word_index

# One row per kept word index plus row 0; rows without a pre-trained vector
# stay all-zero.
n_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((n_words + 1, embed_size))

for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap are skipped.
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [8]:
# Both splits are padded and truncated at the end to the same fixed length.
pad_kwargs = dict(maxlen=max_length, padding='post', truncating='post')

sequences = tokenizer.texts_to_sequences(train_input_rsw)
train_input_padded = pad_sequences(sequences, **pad_kwargs)
print(train_input_padded.shape)

sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(sequences, **pad_kwargs)
print(test_input_padded.shape)

(1044897, 70)
(261225, 70)


In [9]:
# Hold out 10% of the padded training data for validation; the seed is fixed
# so the split is repeatable.
train_text, cv_text, train_target, cv_target = train_test_split(
    train_input_padded,
    train_label,
    test_size=0.1,
    random_state=2,
)

In [10]:
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,Dropout,Conv1D,MaxPooling1D,Dense

In [11]:

# Frozen pre-trained embeddings -> LSTM -> Conv1D / max-pooling -> LSTM ->
# single sigmoid output.
lstm = Sequential([
    Embedding(n_words + 1, embed_size, input_length=max_length,
              weights=[embedding_matrix], trainable=False),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    Conv1D(100, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(128),
    Dropout(0.4),
    Dense(1, activation='sigmoid'),
])

lstm.summary()

lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 70, 300)           30000300  
_________________________________________________________________
lstm (LSTM)                  (None, 70, 256)           570368    
_________________________________________________________________
dropout (Dropout)            (None, 70, 256)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 66, 100)           128100    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 16, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0

In [12]:
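# Optional memory clean-up (currently disabled): the raw GloVe lookup is not
# read again once embedding_matrix has been built, so it can be freed here.
# del embeddings_index
# import gc
# gc.collect()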

In [13]:
# Train for 5 epochs, evaluating on the held-out validation split after each one.
history = lstm.fit(
    np.array(train_text),
    np.array(train_target),
    epochs=5,
    validation_data=(np.array(cv_text), np.array(cv_target)),
    batch_size=1024,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
from sklearn.metrics import f1_score

# Model outputs for the held-out validation split.
cv_predictions = lstm.predict(cv_text, batch_size=512)

# Score every candidate cut-off from 0.10 to 0.50 in steps of 0.01.
thresholds = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    result = f1_score(cv_target, (cv_predictions>thresh).astype(int))
    thresholds.append([thresh, result])
    print("F1 score at threshold {} is {}".format(thresh, result))

# Highest F1 first; each entry is a [threshold, f1] pair.
thresholds.sort(key=lambda x: x[1], reverse=True)
print("Best value {}".format(thresholds[0]))
# Keep only the threshold itself. Storing the whole [threshold, f1] pair would
# make any later `predictions > best_thresh` comparison broadcast against both
# numbers instead of thresholding at a single value.
best_thresh = thresholds[0][0]

F1 score at threshold 0.1 is 0.592470854090699
F1 score at threshold 0.11 is 0.6003453869904234
F1 score at threshold 0.12 is 0.6092297831428266
F1 score at threshold 0.13 is 0.6162291428882005
F1 score at threshold 0.14 is 0.6223154640313586
F1 score at threshold 0.15 is 0.6285906642728905
F1 score at threshold 0.16 is 0.6329675328367544
F1 score at threshold 0.17 is 0.6374892024186583
F1 score at threshold 0.18 is 0.6413334887516028
F1 score at threshold 0.19 is 0.6426847218133647
F1 score at threshold 0.2 is 0.6457217660359397
F1 score at threshold 0.21 is 0.6490225563909774
F1 score at threshold 0.22 is 0.651505016722408
F1 score at threshold 0.23 is 0.6548204623708804
F1 score at threshold 0.24 is 0.65647906572245
F1 score at threshold 0.25 is 0.6590139254798644
F1 score at threshold 0.26 is 0.6612974824021816
F1 score at threshold 0.27 is 0.6644390181210642
F1 score at threshold 0.28 is 0.6663628670008463
F1 score at threshold 0.29 is 0.6698324759266587
F1 score at threshold 0.3 

In [15]:
# Run on the test set with the best threshold found on the validation split.
total_predictions = lstm.predict(test_input_padded, batch_size=512)
test_labels = list(test_data['target'])
# `best_thresh` may be a bare number or a [threshold, f1] pair. Pull out the
# threshold explicitly so the comparison below never broadcasts against the
# F1 score as well.
threshold = float(np.ravel(best_thresh)[0])
predictions1 = (total_predictions > threshold).astype(int)
res = f1_score(test_labels, predictions1[:, 0])
print(res)

0.6747002398081535
