In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/README.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin


# ****AIM: Quora Insincere Question Classification****

## Read Data

In [2]:
pd.set_option('display.max_colwidth', -1)

In [3]:
train= pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg hemispheres?,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain bike by just changing the tyres?,0


In [4]:
train['target'].value_counts()

0    1225312
1    80810  
Name: target, dtype: int64

The training dataset has 80810 insincere question examples and 1225312 sincere question examples

In [5]:
test = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')
test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?
1,00002bd4fb5d505b9161,When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitioner?
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [6]:
# Sincere Question Examples
train[train['target'] == 0].head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg hemispheres?,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain bike by just changing the tyres?,0


In [7]:
# Insincere Question examples
train[train['target'] == 1].head(5)

Unnamed: 0,qid,question_text,target
22,0000e91571b60c2fb487,Has the United States become the largest dictatorship in the world?,1
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents? Dark skin babies or light skin babies?,1
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory sentencing for criminals why don't they vote Republican?,1
114,00052793eaa287aff1e1,"I am gay boy and I love my cousin (boy). He is sexy, but I dont know what to do. He is hot, and I want to see his di**. What should I do?",1
115,000537213b01fd77b58a,Which races have the smallest penis?,1


In [8]:
#Check for missing data
print("Number of missing values in question text are " + str(train['question_text'].isnull().sum()))
print("Number of missing values in target are " + str(train['target'].isnull().sum()))

Number of missing values in question text are 0
Number of missing values in target are 0


In [9]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


## Data Preprocessing

In [10]:
questions_train = train['question_text'].values
y_train = train['target'].values
questions_test = test['question_text'].values

 **Keras Tokenizer:** Vectorize the text corpus. It turns text into vectors containing integers. Each integer is the index of the token in the dictionary. The parameter num of words only keeps the n most frequent words in the dictonary. This also removes all punctuation.

In [11]:
tokenizer = Tokenizer(num_words=90000)
tokenizer.fit_on_texts(questions_train)

In [12]:
print(list(tokenizer.word_counts.items())[:5])

[('how', 289929), ('did', 41715), ('quebec', 166), ('nationalists', 148), ('see', 9689)]


In [13]:
print(tokenizer.document_count)

1306122


In [14]:
print(list(tokenizer.word_index.items())[:15])

[('the', 1), ('what', 2), ('is', 3), ('a', 4), ('to', 5), ('in', 6), ('of', 7), ('i', 8), ('how', 9), ('and', 10), ('do', 11), ('are', 12), ('for', 13), ('you', 14), ('can', 15)]


In [15]:
X_train = tokenizer.texts_to_sequences(questions_train)
X_test = tokenizer.texts_to_sequences(questions_test)

In [16]:
print(train.iloc[0,:])
print(X_train[0])

qid              00002165364db923c7e6                                                    
question_text    How did Quebec nationalists see their province as a nation in the 1960s?
target           0                                                                       
Name: 0, dtype: object
[9, 48, 6683, 7219, 158, 55, 6107, 36, 4, 1206, 6, 1, 8333]


In [17]:
vocab_size = len(tokenizer.word_index) + 1

**Pad Sequences: ** Used to pad sequences. Pads sequences shorter than max_len and truncates longer sequences

In [18]:
maxlen = 150
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [19]:
#y_train = to_categorical(y_train)

In [20]:
from keras.models import Sequential
from keras import layers

**Embedding Matrix: ** All words map to some dimensional space say 300 and that is called the embedding of that word. An embedding matrix is a list of all words and their corresponding embeddings.


In [21]:
file = '/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
with open(file,errors='ignore', encoding='utf8') as f:
    firstNlines=f.readlines()[0:30]
print(firstNlines[25].split()[0])
print(np.array(firstNlines[25].split()[1:],dtype=np.float32).shape)
print(np.array(firstNlines[25].split()[1:],dtype=np.float32))

at
(300,)
[-3.6769e-01  5.9821e-01  1.3229e-01  2.3506e-01 -4.6757e-02  3.6307e-01
  1.4099e-01 -1.0093e-01 -1.5877e-01  2.5371e+00  1.6426e-01 -1.2201e-01
  2.1931e-01 -5.9079e-01  1.1269e-01 -6.1433e-02 -4.1994e-01  1.4899e+00
 -3.2112e-01 -8.5470e-02 -4.1711e-02 -5.4624e-02 -6.7566e-02  1.6858e-01
  2.9859e-01  6.1769e-01 -2.9285e-01 -2.7140e-01  2.9830e-01 -8.0828e-03
  4.5882e-01  7.4601e-02  1.6837e-01  8.5413e-01 -4.9983e-01 -1.2393e-01
 -2.8600e-01  8.7042e-01  1.8725e-01  5.3559e-01 -3.1930e-01  4.1699e-02
 -4.4677e-01  2.0253e-01  5.4033e-01  2.4753e-01 -3.7715e-01 -4.6027e-01
 -1.7242e-01  2.5339e-01  4.1274e-01  5.5843e-01 -1.6288e-01  1.5783e-01
 -1.5163e-01  9.6857e-02  4.6102e-01 -3.4735e-01 -2.4561e-01 -1.3195e-02
  1.4734e-01 -1.2283e-01 -1.1006e-01  1.2155e-01 -3.6222e-01 -2.1340e-01
  4.4930e-02 -1.3221e-02 -2.2636e-01  2.7993e-01  1.2472e-03 -5.0401e-01
 -7.3358e-02  1.5364e-01  7.3376e-02  4.7427e-01  5.1482e-01 -3.8231e-01
 -8.5774e-01 -2.7755e-01  3.2111e-01  1.9

In [22]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath,errors = 'ignore', encoding='utf8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                try:
                    embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
                except:
                    continue

    return embedding_matrix

In [23]:
embedding_dim = 300
embedding_matrix = create_embedding_matrix ('/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt',tokenizer.word_index, embedding_dim)

In [24]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, shuffle= True)

## Model Training

**Embedding Layer**: Can be used to learn embedding with the model or it can be used to load a load a pre-trained word embedding. It is defined as the first hidden layer of the network. 

Parameters: 
input_dim: This is the vocabulary size.
output_dim: Dimension of the embedding vectors.
input_length: length of the input sequences. 


Output: 
2D vector with one embedding for each word in the input sequence of words.

**Bidirectional LSTM**: It duplicates the first recurrent layer in the network. The first layer takes the input sequence as is and the second takes the reversed input sequence. These LSTMS take the future context in conideration. In Keras it is used using the Bidirectional layer wrapper. This wrapper takes
the first recurrent layer as input and the merge mode. 

**Return Sequences**: This will return one output for each input time step and provide a 3D array. Set to true to stack LSTM layers together.

**CuDNNLSTM**: Fast LSTM implementation with CuDNN. Needs GPU. 


In [25]:
from keras.layers import CuDNNLSTM, Dense, Bidirectional
embedding_dim = 300
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True),
                        input_shape=(30, 300)))
model.add(Bidirectional(CuDNNLSTM(64)))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer='adam',loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          66648600  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 128)          187392    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               99328     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 66,935,449
Trainable params: 286,849
Non-trainable params: 66,648,600
_________________________________________________________________


In [26]:
from keras.layers import CuDNNLSTM, Dense, Bidirectional
embedding_dim = 300
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True),
                        input_shape=(30, 300)))
model.add(Bidirectional(CuDNNLSTM(64)))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer='adam',loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 300)          66648600  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 150, 128)          187392    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               99328     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 66,935,449
Trainable params: 286,849
Non-trainable params: 66,648,600
_________________________________________________________________


In [27]:
history2 = model.fit(x_train, y_train,
                    epochs=10,
                    verbose=2,
                    batch_size=100,
                    validation_data=(x_valid,y_valid))
loss, accuracy = model.evaluate(x_train, y_train, verbose=False)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/10
 - 374s - loss: 0.1122 - accuracy: 0.9558 - val_loss: 0.1039 - val_accuracy: 0.9585
Epoch 2/10
 - 370s - loss: 0.0996 - accuracy: 0.9603 - val_loss: 0.1024 - val_accuracy: 0.9587
Epoch 3/10
 - 365s - loss: 0.0929 - accuracy: 0.9629 - val_loss: 0.1007 - val_accuracy: 0.9601
Epoch 4/10
 - 364s - loss: 0.0867 - accuracy: 0.9651 - val_loss: 0.1015 - val_accuracy: 0.9596
Epoch 5/10
 - 363s - loss: 0.0801 - accuracy: 0.9678 - val_loss: 0.1043 - val_accuracy: 0.9589
Epoch 6/10
 - 363s - loss: 0.0736 - accuracy: 0.9703 - val_loss: 0.1093 - val_accuracy: 0.9585
Epoch 7/10
 - 364s - loss: 0.0671 - accuracy: 0.9728 - val_loss: 0.1150 - val_accuracy: 0.9575
Epoch 8/10
 - 364s - loss: 0.0611 - accuracy: 0.9752 - val_loss: 0.1209 - val_accuracy: 0.9569
Epoch 9/10
 - 362s - loss: 0.0557 - accuracy: 0.9776 - val_loss: 0.1346 - val_accuracy: 0.9572
Epoch 10/10
 - 363s - loss: 0.0510 - accuracy: 0.9796 - val_loss: 0.1401 - val_accuracy: 0.9

## Model Prediction

In [28]:
prediction = model.predict_classes(X_test)
prediction[0:5]

array([[1],
       [0],
       [0],
       [0],
       [0]], dtype=int32)

In [29]:
preds  = []
for a in prediction:
    preds.append(a[0])

In [30]:
df_result = pd.DataFrame({'qid': test['qid'],'prediction': preds})

In [31]:
df_result.to_csv('submission.csv',index=False)