In [1]:
import pandas as pd
import numpy as np

In [4]:
train = pd.read_csv('Data/train_processed_20200525.csv', index_col='id')
test = pd.read_csv('Data/test_processed_20200525.csv', index_col='id')

# Split the Train Data

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train['text'], train['target'], random_state = 31)

# Tf-Idf Vectorizer

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [11]:
tfidf_vec = TfidfVectorizer(stop_words='english')
tfidf_vec.fit_transform(x_train.values.tolist() + x_test.values.tolist())
tfidf_train = tfidf_vec.transform(x_train.values.tolist())
tfidf_test = tfidf_vec.transform(x_test.values.tolist())

# Keep the TfIdf vector to use it with the real testing data afterwards

# Modeling

We will use accuracy as our KPI.

In [18]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

## Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
rf = RandomForestClassifier(random_state=31)

rf_params = {
    'max_depth': [4,5],
    'min_samples_leaf' : [10,20],
    'n_estimators': [100, 500, 1000]
}

rf_grid = GridSearchCV(estimator = rf, param_grid = rf_params,
                       scoring = 'accuracy', cv = 3, n_jobs = 1)

rf_grid.fit(tfidf_train, y_train)
pred = rf_grid.predict(tfidf_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.57      1.00      0.73      1061
           1       1.00      0.05      0.10       843

    accuracy                           0.58      1904
   macro avg       0.79      0.53      0.41      1904
weighted avg       0.76      0.58      0.45      1904



## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [22]:
lg = LogisticRegression(random_state=31)

lg_params = {
    'C': [1.0, 0.1, 0.0001]
}

lg_grid = GridSearchCV(lg, lg_params, 'accuracy', 3, 1)
lg_grid.fit(tfidf_train, y_train)
pred = lg_grid.predict(tfidf_test)
print(classification_report(y_test, pred))



              precision    recall  f1-score   support

           0       0.76      0.93      0.83      1061
           1       0.87      0.63      0.73       843

    accuracy                           0.79      1904
   macro avg       0.82      0.78      0.78      1904
weighted avg       0.81      0.79      0.79      1904





Logistic Regression performs pretty well!

# Neural Network

In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Dense, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [35]:
# Define the model
model = Sequential()
model.add(Dense(1024, input_dim=tfidf_train.shape[1]))
model.add(Activation('relu'))

model.add(Dense(1024))
model.add(Activation('relu'))

model.add(Dense(1024))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())
es = EarlyStopping(monitor='val_loss', patience=5)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 1024)              15392768  
_________________________________________________________________
activation_4 (Activation)    (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
activation_5 (Activation)    (None, 1024)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
activation_6 (Activation)    (None, 1024)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                

In [36]:
model.fit(tfidf_train, y_train, batch_size = 128, epochs = 10,
          validation_data=(tfidf_test, y_test), callbacks=[es])

Train on 5709 samples, validate on 1904 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.callbacks.History at 0x1499b9050>

In [37]:
pred = model.predict_classes([tfidf_test], batch_size=1024, verbose = 1)



In [38]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79      1061
           1       0.75      0.72      0.73       843

    accuracy                           0.77      1904
   macro avg       0.77      0.76      0.76      1904
weighted avg       0.77      0.77      0.77      1904



## ANN with Padding 

In [89]:
embed_size = 300 # size of each word vector
max_features = 5000 # number of unique words used (number of rows in an embedding vector)
max_len = 100 # maximum number of words in a question to use

# Tokenize the sentences
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(x_train))
x_train_token = tok.texts_to_sequences(x_train)
x_test_token = tok.texts_to_sequences(x_test)

In [90]:
# Pad the sentences
x_train_pad = pad_sequences(x_train_token, maxlen = max_len)
x_test_pad = pad_sequences(x_test_token, maxlen = max_len)

#### Without Pre-trained Embeddings

In [91]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(max_features, embedding_vector_length, input_length=max_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(.2))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
es = EarlyStopping(monitor='val_loss', patience=10)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 32)           160000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_12 (Dense)             (None, 1024)              103424    
_________________________________________________________________
activation_10 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 1025      
Total params: 317,649
Trainable params: 317,649
Non-trainable params: 0
________________________________________________

In [55]:
model.fit(x_train_pad, y_train, batch_size=512, epochs = 10,
          validation_data=(x_test_pad, y_test), callbacks=[es])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 5709 samples, validate on 1904 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x141b12b10>

In [56]:
pred = model.predict_classes([x_test_pad], batch_size=256, verbose = 1)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80      1061
           1       0.75      0.74      0.75       843

    accuracy                           0.78      1904
   macro avg       0.78      0.78      0.78      1904
weighted avg       0.78      0.78      0.78      1904



## Glove

In [57]:
glove_path = 'Models/glove.6B.100d.txt'
def get_coef(word, *arr):
    return word, np.asarray(arr, dtype = 'float32')

In [61]:
with open(glove_path) as path:
    embed_index = dict(get_coef(*i.split(' ')) for i in path)

In [63]:
# The embed_index comes in the form of a dictionary with a word
# as the key and a respective embedding vector as the value

# Use values
embed_values = np.stack(embed_index.values())

  if (await self.run_code(code, result,  async_=asy)):


In [65]:
embed_values.shape

# 400,000 words, each with 100 in length

(400000, 100)

In [67]:
word_index = tok.word_index
len(word_index)

13027

In [69]:
nb_words = min(max_features, len(word_index)) + 1

# Create an embedding matrix with a specific shape
# - no. of rows is nb_words, which is number of words, which should not be more than max_features.
# - no. of columns is the length of an embedding vector
# - the values inside are not necessary, it will be changed later
embed_matrix = np.random.normal(embed_values.mean(), embed_values.std(),
                                (nb_words, embed_values.shape[1]))

In [97]:
# Note that this will work only if max_features < len(word_index)
for word, i in word_index.items():
    if i >= max_features:
        continue
    embed_vector = embed_index.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector

In [100]:
# Define the model
model = Sequential()

e = Embedding(len(embed_matrix), embed_values.shape[1], weights=[embed_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [101]:
model.fit(x_train_pad, y_train, batch_size = 128, epochs = 10,
          validation_data=(x_test_pad, y_test), callbacks=[es])

pred = model.predict_classes([x_test_pad], batch_size=256, verbose = 1)

Train on 5709 samples, validate on 1904 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [103]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82      1061
           1       0.82      0.67      0.74       843

    accuracy                           0.79      1904
   macro avg       0.80      0.78      0.78      1904
weighted avg       0.79      0.79      0.79      1904



We can see that the last model is the best.<br><br>
Use the last one for prediction

# Prediction

In [108]:
embed_size = 300 # size of each word vector
max_features = 5000 # number of unique words used (number of rows in an embedding vector)
max_len = 100 # maximum number of words in a question to use

# Tokenize the sentences
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(train['text']))
x_token = tok.texts_to_sequences(train['text'])
valid_token = tok.texts_to_sequences(test['text'])

# Target variable for training
y = train['target']

In [109]:
# Pad the sentences
x_pad = pad_sequences(x_token, maxlen = max_len)
valid_pad = pad_sequences(valid_token, maxlen = max_len)

In [110]:
word_index = tok.word_index
len(word_index)

15676

In [111]:
nb_words = min(max_features, len(word_index)) + 1

# Create an embedding matrix with a specific shape
# - no. of rows is nb_words, which is number of words, which should not be more than max_features.
# - no. of columns is the length of an embedding vector
# - the values inside are not necessary, it will be changed later
embed_matrix = np.random.normal(embed_values.mean(), embed_values.std(),
                                (nb_words, embed_values.shape[1]))

In [112]:
# Note that this will work only if max_features < len(word_index)
for word, i in word_index.items():
    if i >= max_features:
        continue
    embed_vector = embed_index.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector

In [113]:
# Define the model
model = Sequential()

e = Embedding(len(embed_matrix), embed_values.shape[1], weights=[embed_matrix], input_length=max_len, trainable=False)
model.add(e)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [114]:
model.fit(x_pad, y, batch_size = 128, epochs = 10, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [115]:
pred = model.predict_classes([valid_pad], batch_size=256, verbose = 1)



# Submission

In [117]:
sub = pd.read_csv('Data/submission.csv')
sub.target = pred
sub.to_csv('Answer/RNN_20200525.csv', index = False)