In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
#|

In [7]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
questions = pd.read_csv("train.csv")

In [9]:
pd.set_option('display.max_colwidth', None)
questions.head(10)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg hemispheres?,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain bike by just changing the tyres?,0
5,00004f9a462a357c33be,"Is Gaza slowly becoming Auschwitz, Dachau or Treblinka for Palestinians?",0
6,00005059a06ee19e11ad,"Why does Quora automatically ban conservative opinions when reported, but does not do the same for liberal views?",0
7,0000559f875832745e2e,Is it crazy if I wash or wipe my groceries off? Germs are everywhere.,0
8,00005bd3426b2d0c8305,"Is there such a thing as dressing moderately, and if so, how is that different than dressing modestly?",0
9,00006e6928c5df60eacb,"Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved, completely disregarding their feelings/lives so you get to have something go your way and feel temporarily at ease. How did things change?",0


In [11]:
questions=questions.drop(['qid'],axis=1)
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   question_text  1306122 non-null  object
 1   target         1306122 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 19.9+ MB


In [12]:
train, test = train_test_split(questions, test_size=0.2)

In [13]:
'''Perhaps there are some outliers'''
train_lens = []
for line in train['question_text']:
    train_lens.append(len(word_tokenize(line)))
np.quantile(train_lens, 0.95)

31.0

In [14]:
train['length'] = train_lens


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['length'] = train_lens


In [15]:
spam = train.loc[train['target']==1]
not_spam = train.loc[train['target']==0]

In [16]:
n = spam.shape[0]
balanced_trainset = pd.concat([spam, not_spam.sample(2*n)])
balanced_trainset.shape

(194052, 3)

In [17]:
temp = train.loc[(train['target']==1) & (train['length']>31)]
print(temp.shape[0]/n) # 14% percent of the spams have length above 31, can't afford to remove them from what is already so less


0.14543009090346917


In [18]:
'''Now to drop the useless length column, tokenize the whole question_text column and convert it to integer sequences'''
balanced_trainset = balanced_trainset.drop(['length'],axis=1)
tok = Tokenizer(char_level=False, split=' ')

In [19]:
tok.fit_on_texts(balanced_trainset['question_text'])
tok.index_word

{1: 'the',
 2: 'to',
 3: 'is',
 4: 'a',
 5: 'in',
 6: 'what',
 7: 'of',
 8: 'and',
 9: 'do',
 10: 'why',
 11: 'are',
 12: 'i',
 13: 'how',
 14: 'for',
 15: 'you',
 16: 'it',
 17: 'can',
 18: 'that',
 19: 'with',
 20: 'have',
 21: 'if',
 22: 'on',
 23: 'my',
 24: 'be',
 25: 'or',
 26: 'people',
 27: 'does',
 28: 'they',
 29: 'when',
 30: 'as',
 31: 'from',
 32: 'so',
 33: 'not',
 34: 'an',
 35: 'their',
 36: 'like',
 37: 'should',
 38: 'would',
 39: 'who',
 40: 'get',
 41: 'your',
 42: 'will',
 43: 'about',
 44: 'there',
 45: 'by',
 46: 'which',
 47: 'best',
 48: 'some',
 49: 'at',
 50: 'did',
 51: 'was',
 52: 'we',
 53: 'all',
 54: 'any',
 55: 'india',
 56: 'this',
 57: 'me',
 58: 'more',
 59: 'has',
 60: 'trump',
 61: 'think',
 62: 'women',
 63: 'but',
 64: 'than',
 65: 'good',
 66: 'many',
 67: 'most',
 68: 'one',
 69: 'them',
 70: 'he',
 71: 'other',
 72: 'make',
 73: 'after',
 74: 'quora',
 75: "don't",
 76: 'being',
 77: 'much',
 78: 'his',
 79: 'just',
 80: 'men',
 81: 'us',
 82:

In [21]:
word_limit = int(np.quantile(train_lens, 0.99))
max_len = word_limit


In [22]:
x_train = tok.texts_to_sequences(balanced_trainset['question_text'])
x_train_matrix = sequence.pad_sequences(x_train, word_limit)
x_train_matrix.shape

(194052, 45)

In [23]:
x_train_matrix

array([[   0,    0,    0, ...,   19,    4,  729],
       [   0,    0,    0, ...,  112,  129,  177],
       [   0,    0,    0, ...,   11, 1006, 8900],
       ...,
       [   0,    0,    0, ...,   99,    7,   97],
       [   0,    0,    0, ...,  189,    1,  847],
       [   0,    0,    0, ...,   25,    4, 3080]])

In [83]:
len(x_train_matrix[10000])==word_limit #So all ok!

True

In [24]:
vocab_size = len(tok.index_word)
max_len = word_limit

In [49]:
def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    
    # This layer can only be used as the first layer in a model.
    # Turns positive integers (index values) into dense vectors of fixed size.
    # The model will take as input an integer matrix of size (batch, input_length) and the 
    # largest integer (i.e. word index) in the input should be no larger than vocabulary_size+1.  
    # Now model's output_shape is (None, max_len, output_dim), where `None` is the batch dimension. 
    layer = Embedding(input_dim = vocab_size+1, output_dim = 500, input_length = max_len, mask_zero=True)(inputs)

    # num_params = input_dim * output_dim = 2689 * 500 = 1344500  

    layer = LSTM(64)(layer)   # num_params = [(num_units + input_dim + 1) * num_units] * 4
                              # 144640 = [(64 + 500 +1) * 64] *4  

    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [36]:
balanced_trainset.to_csv("Balanced_set.csv", index=False) 
'''To resume work at this place. I had to re install numpy because the latest np version was a bit wonky with tf'''

'To resume work at this place. I had to re install numpy because the latest np version was a bit wonky with tf'

In [50]:
model = RNN()
model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 45)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 45, 500)           38955500  
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                144640    
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_4 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                

In [51]:
 model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
'''Now prep the test data similarly'''
x_test = tok.texts_to_sequences(test['question_text'])
x_test_matrix = sequence.pad_sequences(x_test, word_limit)
x_test_matrix.shape

(261225, 45)

In [52]:
'''Finally, I can fit the model and test it'''
#Very massive data set, arbitrarily chosen batch
model.fit(x_train_matrix, balanced_trainset['target'].values, batch_size=10000,epochs=10, 
          validation_data = (x_test_matrix, test['target'].values))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e31f07a940>

In [53]:
'''Testing Testing 1,2,3'''
predictions = model.predict(x_test_matrix)
y_test = test['target'].values


In [80]:
from sklearn.metrics import roc_auc_score
score = {'roc':roc_auc_score(y_test,predictions)}

In [55]:
model.save('QuoraSpamFilter')
model.save_weights('QuoraSpamFilterWeights.h5')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: QuoraSpamFilter\assets


In [72]:
a = np.ravel(predictions)
temp = a>0.5
pd.value_counts(temp)

(261225, 1)

In [83]:
total = len(temp)
correct = 0
for i in range(total):
    if temp[i]==y_test[i]:
        correct += 1
accuracy = correct/total
score['accuracy'] = accuracy
score

{'roc': 0.9340562868238005, 'accuracy': 0.8945391903531439}

False    222576
True      38649
dtype: int64