### 1. Read data
Read the data from CSV and apply some basic pre-processing (remove non-ascii characters, convert our target variable to an integer label).

In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from imblearn.under_sampling import RandomUnderSampler

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = r"C:\Users\syed.nusrath\Downloads\DATA\Edvancer\project 3 Quora spam identifier\train.csv"

In [3]:
df = pd.read_csv(path)

In [4]:
df.shape

(1306122, 3)

In [5]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [7]:
docs = df["question_text"].values
labels = df["target"].values

In [8]:
np.unique(labels,return_counts=True)

(array([0, 1], dtype=int64), array([1225312,   80810], dtype=int64))

### 2. Preprocessing
Tokenize text, convert words / tokens to indexed integers. Take each document and convert to a sequence of max length 20 (pad with zeroes if shorter).

In [9]:
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(vocab_size)

222162


In [11]:
t.word_index

{'the': 1,
 'what': 2,
 'is': 3,
 'a': 4,
 'to': 5,
 'in': 6,
 'of': 7,
 'i': 8,
 'how': 9,
 'and': 10,
 'do': 11,
 'are': 12,
 'for': 13,
 'you': 14,
 'can': 15,
 'why': 16,
 'it': 17,
 'my': 18,
 'that': 19,
 'if': 20,
 'with': 21,
 'on': 22,
 'or': 23,
 'have': 24,
 'be': 25,
 'does': 26,
 'from': 27,
 'your': 28,
 'an': 29,
 'which': 30,
 'should': 31,
 'when': 32,
 'get': 33,
 'best': 34,
 'would': 35,
 'as': 36,
 'people': 37,
 'some': 38,
 'there': 39,
 'who': 40,
 'will': 41,
 'like': 42,
 'at': 43,
 'not': 44,
 'about': 45,
 'by': 46,
 'they': 47,
 'did': 48,
 'was': 49,
 'any': 50,
 'we': 51,
 'so': 52,
 'good': 53,
 'me': 54,
 'their': 55,
 'one': 56,
 'india': 57,
 'has': 58,
 'after': 59,
 'most': 60,
 'where': 61,
 'make': 62,
 'this': 63,
 'but': 64,
 'more': 65,
 'all': 66,
 'think': 67,
 'many': 68,
 'between': 69,
 'time': 70,
 'than': 71,
 'much': 72,
 'other': 73,
 'life': 74,
 'someone': 75,
 'use': 76,
 'he': 77,
 'out': 78,
 'way': 79,
 'am': 80,
 'know': 81,
 'u

In [10]:
encoded_docs

[[9, 48, 6683, 7219, 158, 55, 6107, 36, 4, 1206, 6, 1, 8333],
 [11, 14, 24, 29, 3864, 498, 9, 35, 14, 3672, 37, 5, 3089, 10, 44, 1846],
 [16, 26, 2002, 374, 70, 26, 2002, 374, 451, 5546],
 [9, 48, 13005, 8284, 52192, 119, 1, 39877, 28269],
 [15, 8, 1130, 42987, 99430, 911, 5, 4, 3133, 1533, 46, 96, 1465, 1, 9340],
 [3, 9110, 3898, 765, 19163, 52193, 23, 99431, 13, 2649],
 [16,
  26,
  104,
  2559,
  1642,
  1455,
  2440,
  32,
  4295,
  64,
  26,
  44,
  11,
  1,
  142,
  13,
  1165,
  897],
 [3, 17, 1960, 20, 8, 2955, 23, 5829, 18, 12856, 225, 12373, 12, 3684],
 [3,
  39,
  205,
  4,
  187,
  36,
  6552,
  19572,
  10,
  20,
  52,
  9,
  3,
  19,
  130,
  71,
  6552,
  25461],
 [3,
  17,
  96,
  54,
  23,
  24,
  14,
  89,
  114,
  6,
  63,
  2182,
  15421,
  14,
  1375,
  3148,
  5,
  1,
  37,
  14,
  654,
  1712,
  901,
  18808,
  55,
  1088,
  1045,
  52,
  14,
  33,
  5,
  24,
  194,
  106,
  28,
  79,
  10,
  98,
  6911,
  43,
  6580,
  9,
  48,
  146,
  166],
 [2, 15, 14, 156, 4

In [15]:
# text_lens=[]
# for title in docs:
#     text_lens.append(len(word_tokenize(title)))

In [14]:
# max(text_lens)

412

In [17]:
# np.quantile(text_lens,0.85)

21.0

In [13]:
# pad documents to a max length of 4 words
max_length = 20
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(len(padded_docs))

1306122


In [16]:
padded_docs.shape

(1306122, 20)

In [17]:
rus = RandomUnderSampler(random_state=42)
padded_docs_rus,labels_rus = rus.fit_resample(padded_docs,labels)

In [18]:
padded_docs_rus.shape,labels_rus.shape,padded_docs.shape,labels.shape

((161620, 20), (161620,), (1306122, 20), (1306122,))

In [19]:
np.unique(labels_rus,return_counts=True)

(array([0, 1], dtype=int64), array([80810, 80810], dtype=int64))

### 3. Import embeddings
The clever part: import a dictionary of word embeddings that translates each word into a 100 dimensional vector.

In [20]:
# load the whole embedding into memory
EMBEDDING_FILE = r"C:\Users\syed.nusrath\Downloads\DATA\glove.840B.300d.txt"
EMBEDDING_DIM = 100

def get_embedding():
    embeddings_index = {}
    f = open(EMBEDDING_FILE,'r', errors = 'ignore', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
        except:
            pass
            
    f.close()
    return embeddings_index

embeddings_index = get_embedding()

In [21]:
# create a weight matrix for words in training docs

def create_embedding_weights(vocab_size,t):
    embedding_matrix = np.zeros((vocab_size, 300))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    print(embedding_matrix.shape)
    return embedding_matrix



In [23]:
embedding_matrix = create_embedding_weights(vocab_size,t)

(222162, 300)


### 4. Network architecture

##### Simple model

In [33]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=20, trainable=False))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

##### LSTM model

In [29]:
## create model
model_glove = Sequential()
model_glove.add(Embedding(vocab_size, 300, input_length=20, weights=[embedding_matrix], trainable=False))
model_glove.add(Dropout(0.2))
model_glove.add(Conv1D(64, 5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=4))
model_glove.add(LSTM(100))
model_glove.add(Dense(1, activation='sigmoid'))
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [30]:
model_glove.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 300)           66648600  
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 300)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 16, 64)            96064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 4, 64)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 66,810,765
Trainable params: 162,165
Non-trainable params: 66,648,600
__________________________________________________________

### 5. Training and Evaluation
Is it any good? Let's find out.
Divide our dataset using a holdout strategy:

In [31]:
# split dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs_rus, labels_rus, test_size=0.2, random_state=42)

In [32]:
# fit the model
model_glove.fit(X_train, y_train, epochs=5, verbose=0)

<keras.callbacks.History at 0x12f0a36b550>

In [35]:
# evaluate the model
loss, accuracy = model_glove.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 88.754486


In [37]:
model_glove.predict(X_test)

array([[0.85577065],
       [0.9414769 ],
       [0.01048732],
       ...,
       [0.2845876 ],
       [0.8001421 ],
       [0.98263633]], dtype=float32)

In [39]:
ques = ['Has the United States become the largest dictatorship in the world?','How should I prepare for IIT K/IIM C/ ISI K PGDBA course exam and interview?']

In [40]:
t.fit_on_texts(ques)
# integer encode the documents
encoded_ques = t.texts_to_sequences(ques)
max_length = 20
padded_ques = pad_sequences(encoded_ques, maxlen=max_length, padding='post')
print(len(padded_ques))

2


In [41]:
model_glove.predict(padded_ques)

array([[0.7563303 ],
       [0.00217064]], dtype=float32)