In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
train_data = pd.read_csv('/content/drive/Shareddrives/CS 247 project/data/splited_train.csv')
test_data = pd.read_csv('/content/drive/Shareddrives/CS 247 project/data/splited_test.csv')


In [6]:
train_input = list(train_data['question_text'])
train_label = list(train_data['target'])

test_input = list(test_data['question_text'])

In [9]:

#remove all the stop words for the 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop=stopwords.words('english') 

def remove_stop_words(x):
    for word in stop:
        token = " " + word + " "
        if (x.find(token) != -1):
            x = x.replace(token, " ")
    return x

train_input_rsw = list(map(remove_stop_words, train_input))
test_input_rsw = list(map(remove_stop_words, test_input))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
max_features=100000
embed_size = 300 
max_length = 60 

In [11]:
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')

embeddings_index = {}
with open("/content/drive/Shareddrives/CS 247 project/data/embeddings/glove.840B.300d/glove.840B.300d.txt",'r') as f:
    for line in f:
        
        word,coefs=get_coefs(*line.split(" "))
        #coefs = np.asarray(coefs, dtype='float32')
        embeddings_index[word] = coefs

In [12]:
tokenizer=Tokenizer(num_words=max_features)

tokenizer.fit_on_texts(train_input_rsw)

word_index = tokenizer.word_index
n_words=min(max_features,len(word_index))

embedding_matrix = np.zeros((n_words+1, embed_size))

for word, i in word_index.items():
    if i >= max_features: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [13]:
sequences = tokenizer.texts_to_sequences(train_input_rsw)

train_input_padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
print(train_input_padded.shape)

sequences = tokenizer.texts_to_sequences(test_input_rsw)
test_input_padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
print(test_input_padded.shape)

(1044897, 60)
(261225, 60)


In [14]:
train_text, cv_text, train_target, cv_target = train_test_split(train_input_padded, train_label, test_size = 0.1, random_state=2)

In [15]:
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,Dropout,Conv1D,MaxPooling1D,Dense,GRU

In [16]:
gru=Sequential()
gru.add(Embedding(n_words+1,embed_size,input_length=max_length,weights=[embedding_matrix], trainable=False))
gru.add(Bidirectional(LSTM(256,return_sequences=True)))
gru.add(Dropout(0.2))
gru.add(Conv1D(100,5,activation='relu'))
gru.add(MaxPooling1D(pool_size=4))
gru.add(Bidirectional(GRU(128)))
gru.add(Dropout(0.4))
gru.add(Dense(1,activation='sigmoid'))

gru.summary()

gru.compile(loss='binary_crossentropy',optimizer='adam', metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 300)           30000300  
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 512)           1140736   
_________________________________________________________________
dropout (Dropout)            (None, 60, 512)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 56, 100)           256100    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 14, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               176640    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0

In [17]:
# del embeddings_index
# import gc
# gc.collect()

In [18]:
history=gru.fit(np.array(train_text), np.array(train_target), epochs = 5, validation_data=(np.array(cv_text),np.array(cv_target)), batch_size=1024,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
from sklearn.metrics import f1_score
cv_predictions = gru.predict(cv_text, batch_size=512)

thresholds = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    result = f1_score(cv_target, (cv_predictions>thresh).astype(int))
    thresholds.append([thresh, result])
    print("F1 score at threshold {} is {}".format(thresh, result))

thresholds.sort(key=lambda x: x[1], reverse=True)
print("Best value {}".format(thresholds[0]))
best_thresh = thresholds[0]

F1 score at threshold 0.1 is 0.6222097272982849
F1 score at threshold 0.11 is 0.6284666513866606
F1 score at threshold 0.12 is 0.6337429386756741
F1 score at threshold 0.13 is 0.6374327758406714
F1 score at threshold 0.14 is 0.6413735142273983
F1 score at threshold 0.15 is 0.6452359208523593
F1 score at threshold 0.16 is 0.6475955610357583
F1 score at threshold 0.17 is 0.650302916744738
F1 score at threshold 0.18 is 0.6523908786558019
F1 score at threshold 0.19 is 0.6549677748707804
F1 score at threshold 0.2 is 0.6565259426361586
F1 score at threshold 0.21 is 0.6584523111024186
F1 score at threshold 0.22 is 0.6602069738316524
F1 score at threshold 0.23 is 0.6617813603357537
F1 score at threshold 0.24 is 0.6639730639730639
F1 score at threshold 0.25 is 0.6649901447699312
F1 score at threshold 0.26 is 0.6650212532565474
F1 score at threshold 0.27 is 0.664917127071823
F1 score at threshold 0.28 is 0.6662026726057906
F1 score at threshold 0.29 is 0.6662452591656132
F1 score at threshold 0.

In [20]:
#run on test set with best threshold
total_predictions=gru.predict(test_input_padded, batch_size=512)
test_labels=list(test_data['target'])
predictions1 = (total_predictions>best_thresh).astype(int)
res=f1_score(test_labels, predictions1[:,0])
print(res)

0.6633319602189136
