In [189]:
import re
from tqdm import tqdm_notebook

from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from keras.models import Sequential

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/shree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
MAX_NB_WORDS = 100000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 100 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 100      # embedding dimensions for word vectors (word2vec/GloVe)
GLOVE_DIR = "./glove.6B/glove.6B."+str(EMBEDDING_DIM)+"d.txt"

In [50]:
df = pd.read_json('content_baseline.json')
df

Unnamed: 0,Sentence,Presence
0,Help me!,1
1,Help me!,1
2,"""Front desk clerk Shawna Vela said she dialed ...",0
3,"I've been shot,' "" said Rosalinda Gonzalez, an...",0
4,Mid-Market and the Tenderloin are home to a th...,1
...,...,...
20181,"I'd flip open a cell phone, turn on its camera...",1
20182,.,0
20183,)The biggest challenge in documenting my dinin...,0
20184,"Ahumdinger TV season wrapped Wednesday night, ...",1


In [51]:
X = df['Sentence']
y = df['Presence'].values

X = list(X)
# for line in tqdm_notebook(X):
#     print(clean_text(line))


In [52]:
#Data Cleaning

def clean_text(text, remove_stopwords = True):
    output = ""
    text = str(text).replace("\n", "")
    text = re.sub(r'[^\w\s]','',text).lower()
    if remove_stopwords:
        text = text.split(" ")
        for word in text:
            if word not in stopwords.words("english"):
                output = output + " " + word
    else:
        output = text
    return str(output.strip())[1:-3].replace("  ", " ")



In [53]:
texts = [] 

for line in tqdm_notebook(X): 
    texts.append(clean_text(line))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=20186.0), HTML(value='')))




In [54]:
print('Sample data:', texts[5], y[5])

Sample data: ortheast corner market pretty quiet farther block seventh sixth theres another larger group young gentlemen usua 1


In [55]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

In [56]:
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 49540


In [57]:
data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

Shape of data tensor: (20186, 100)
Shape of label tensor: (20186,)


In [58]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]

In [59]:
num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])
x_train = data[: -num_validation_samples]
y_train = labels[: -num_validation_samples]
x_val = data[-num_validation_samples: ]
y_val = labels[-num_validation_samples: ]

In [60]:
# print('Number of entries in each category:')
# print('training: ', y_train.sum(axis=0))
# print('validation: ', y_val.sum(axis=0))

In [194]:
print('Tokenized sentences: \n', data[5])
print('One hot label: \n', labels[5])

Tokenized sentences: 
 [1542 2219 2544 2255    7    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
One hot label: 
 1


In [62]:
embeddings_index = {}
f = open(GLOVE_DIR)
print('Loading GloVe from:', GLOVE_DIR,'...', end='')
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()
print("Done.\n Proceeding with Embedding Matrix...", end="")

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")

Loading GloVe from: ./glove.6B/glove.6B.100d.txt ...Done.
 Proceeding with Embedding Matrix... Completed!


In [290]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(word_index) + 1,
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable=False,
                           name = 'embeddings')
embedded_sequences = embedding_layer(sequence_input)

In [291]:
# tf.keras.layers.Bidirectional(
#     layer, merge_mode="concat", weights=None, backward_layer=None, **kwargs
# )

# fwd_layer = LSTM(100, return_sequences=True, name='lstm_layer')(embedded_sequences)
# bkwd_layer = LSTM(100, return_sequences=True, go_backwards=True)

# x = Bidirectional(fwd_layer, bkwd_layer)
x = LSTM(100, return_sequences=True, name='lstm_layer')(embedded_sequences)
# x = GlobalMaxPool1D()(x)
# x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
# x = Dropout(0.2)(x)
preds = Dense(1, activation="sigmoid")(x)

In [292]:
model = Model(sequence_input, preds)
model.compile(loss = 'binary_crossentropy',
             optimizer='adam',
             metrics = ['accuracy'])
model.summary()

Model: "model_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 100)]             0         
_________________________________________________________________
embeddings (Embedding)       (None, 100, 100)          4954100   
_________________________________________________________________
lstm_layer (LSTM)            (None, 100, 100)          80400     
_________________________________________________________________
dense_66 (Dense)             (None, 100, 50)           5050      
_________________________________________________________________
dense_67 (Dense)             (None, 100, 1)            51        
Total params: 5,039,601
Trainable params: 85,501
Non-trainable params: 4,954,100
_________________________________________________________________


In [293]:
print('Training progress:')
history = model.fit(x_train, y_train, epochs = 10)

Training progress:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [295]:
pred = model.predict(x_val)
pred

array([[[0.4189127 ],
        [0.32357314],
        [0.41888142],
        ...,
        [0.3465091 ],
        [0.34562534],
        [0.34474844]],

       [[0.43036714],
        [0.40007567],
        [0.5015754 ],
        ...,
        [0.5400595 ],
        [0.53955805],
        [0.5390617 ]],

       [[0.52836776],
        [0.61595356],
        [0.76090324],
        ...,
        [0.9630629 ],
        [0.96305895],
        [0.963055  ]],

       ...,

       [[0.44722772],
        [0.63716537],
        [0.5284948 ],
        ...,
        [0.79211164],
        [0.79180765],
        [0.79150546]],

       [[0.54270905],
        [0.5237188 ],
        [0.6275068 ],
        ...,
        [0.9682194 ],
        [0.9682057 ],
        [0.968192  ]],

       [[0.4436888 ],
        [0.43925738],
        [0.33890706],
        ...,
        [0.5064143 ],
        [0.50569   ],
        [0.50497305]]], dtype=float32)

In [301]:
pred_list = []      
for sublist in pred:
    for item in sublist:
        pred_list.append(int(item))

for i in range(len(pred_list)):
    if pred_list[i] < 0.5:
        pred_list[i] = 0
    else:
        pred_list[i] = 1

len(pred_list)
        
# for i in pred:
#     for j in i:
#         if j < 0.5:
#             j = 0
#         else:
#             j = 1
#         pred_list.append(int(j))

403700

In [297]:
pred_list = np.array(pred_list)

from sklearn import metrics
print(metrics.accuracy_score(y_val, pred_list))

ValueError: Found input variables with inconsistent numbers of samples: [4037, 403700]

In [None]:
Normal

epochs: 50, 
score:  63.76 
    
epoch: 40,
score: 64.10
    
epoch: 30
score: 65.04
    
epoch: 20
score: 63.76
    
epoch: 10
score: 65.27
    
===========================================
Global MaxPool Layer

epochs: 50, 
score:  63.85
    
epoch: 40,
score: 64.03
    
epoch: 30
score: 63.16
    
epoch: 20
score: 63.83
    
epoch: 10
score: 64.13

===========================================
Global MaxPool Layer with 2 dropout layers with parameter = 0.1

epochs: 50, 
score: 64.87
    
epoch: 40,
score: 63.78
    
epoch: 30
score: 66.53
    
epoch: 20
score: 64.47
    
epoch: 10
score: 64.80
    
============================================
Global MaxPool Layer with 2 dropout layers with parameter = 0.2

epochs: 50, 
score: 64.55
    
epoch: 40, 
score: 64.89
    
epoch: 30
score: 64.20
    
epoch: 20
score: 64.40
    
epoch: 10
score: 65.02
