In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import one_hot


from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding,GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding, Conv1D
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer

import os
import zipfile

In [2]:
#NN custom functions
import Tools.NN as nn

#### Here we load our data once again. We want to combine the data with the target and remove the unnecessary column.

In [3]:
# Read the review text and the labels from their two separate CSV exports
# (same order as before: body first, then target).
df, df_target = (pd.read_csv(csv_path) for csv_path in ('./body', './target'))

In [4]:
# Join reviews and labels on the shared 'Unnamed: 0' column (default inner join).
original_df = pd.merge(df, df_target, on='Unnamed: 0')

In [5]:
# Remove the join key now that the merge is done.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution is a no-op rather than
# a KeyError on the already-missing column.
original_df = original_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Preview the first rows of the merged frame.
original_df.head()

Unnamed: 0,review,target
0,Looking for some authentic Japanese food at re...,1
1,Pepe Rosso is where you go when you're in SOHO...,1
2,I had waited to return a couple other times to...,1
3,This place is always busy - partly because it'...,1
4,Love this place! I am not a regular yelper I d...,0


#### We apply the cleaning function from the custom tools folder, then remove the stopwords.

In [7]:
# Run the project's `nn.clean_data` helper (from Tools.NN) over every review
# and overwrite the column with the result. The exact cleaning rules live in
# that module and are not visible in this notebook.
original_df['review'] = original_df['review'].apply(nn.clean_data)

In [8]:
# Count how many reviews fall into each target class (class-balance check).
original_df.target.value_counts()

1    256887
0     29107
Name: target, dtype: int64

In [9]:
#download nltk stopwords if necessary
#nltk.download('stopwords')

In [10]:
# Apply the project's `nn.remove_stop_words` helper (from Tools.NN) to every
# already-cleaned review, overwriting the column again.
# NOTE(review): presumably relies on the NLTK stopword list mentioned in the
# previous cell — confirm in Tools/NN.
original_df['review'] = original_df['review'].apply(nn.remove_stop_words)

In [11]:
# Plain Python list of the processed review strings, used by the vectorizers below.
corpus = original_df['review'].tolist()

#### Here we apply the count vectorizer as well as the TFIDF to the data.

In [12]:
# Bag-of-words counts, capped at 1000 features.
cv = CountVectorizer(max_features=1000)
term_counts = cv.fit_transform(corpus)
# NOTE(review): .toarray() densifies the whole document-term matrix; X is
# overwritten by the TF-IDF cells further down.
X = term_counts.toarray()
y = original_df['target'].to_numpy()

In [13]:
#features of the count vectorizers
#cv.get_feature_names()

In [14]:
# Re-weight the raw counts held in X with TF-IDF and store the dense result back in X.
tf_transformer = TfidfTransformer()
tf_transformer.fit(X)
X = tf_transformer.transform(X).toarray()

In [15]:

# One-step alternative to the two cells above: build TF-IDF features straight
# from the text corpus (capped at 1000 features). This overwrites the X
# produced by the CountVectorizer + TfidfTransformer cells.
tfidfVectorizer = TfidfVectorizer(max_features =1000)
X = tfidfVectorizer.fit_transform(corpus).toarray()

In [16]:
# Split the processed text and labels for the neural-network experiments.
# The split is stratified on the label and seeded so it is reproducible.
docs, labels = original_df['review'], original_df['target']
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    stratify=labels,
    random_state=42,
)

#### We transform the data into a form that can be used in a neural network.

In [17]:
vocab_size = 5000

# Characters stripped from the text before hashing. The runtime value is the
# same as before; the backslash is now written as an explicit `\\` because a
# bare `\]` is an invalid escape sequence (SyntaxWarning on recent Pythons).
ONE_HOT_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'


def hash_encode(texts):
    """Encode each document as a list of hashed word indices via Keras `one_hot`.

    Parameters
    ----------
    texts : iterable of str
        Documents to encode.

    Returns
    -------
    list of list of int
        One integer sequence per document, produced with the notebook-level
        `vocab_size` as the hashing space.

    NOTE(review): per the Keras docs, `one_hot` hashes with Python's built-in
    `hash`, which is not stable across interpreter sessions and can map
    different words to the same index — confirm whether a stable encoding is
    needed if a trained model is reused in a fresh kernel.
    """
    return [
        one_hot(text, vocab_size, filters=ONE_HOT_FILTERS, lower=True, split=' ')
        for text in texts
    ]


# Both splits go through the same helper so their settings cannot drift apart.
X_train = hash_encode(X_train)
X_test = hash_encode(X_test)

In [18]:

# Bring every encoded review to exactly `max_length` tokens; shorter ones are
# zero-padded at the end.
max_length = 100
X_train, X_test = (
    pad_sequences(encoded, maxlen=max_length, padding='post')
    for encoded in (X_train, X_test)
)

#### We run a model with some randomly chosen parameters. We see that the training data scored about 94% while the test data scored about 85%. This shows that the model is overfitting and needs to be adjusted.

In [19]:

# Small 1-D CNN over learned 8-dimensional embeddings:
# embedding -> convolution -> global max pool -> dense -> sigmoid output.
model1 = Sequential()
model1.add(Embedding(vocab_size, 8, input_length=max_length))
model1.add(Conv1D(128, 5, activation='relu'))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(10, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

In [20]:
# Binary classification setup: sigmoid output paired with binary cross-entropy.
model1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [21]:
# Train for 20 epochs.
# NOTE(review): validation_data is the held-out test split, so the per-epoch
# validation metrics are computed on the same data that is later reported as
# the test accuracy.
history1 = model1.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [22]:
# Accuracy on the data the model was fitted on.
loss, accuracy = model1.evaluate(X_train, y_train)
print(f'Training Accuracy is {accuracy * 100} ')

Training Accuracy is 94.37609314918518 


In [23]:
# Accuracy on the held-out split.
loss, accuracy = model1.evaluate(X_test, y_test)
print(f'Testing Accuracy is {accuracy * 100} ')

Testing Accuracy is 85.49070358276367 


In [24]:
# Sigmoid outputs of model1 for the training set (one value per review).
y_pred_train = model1.predict(X_train)

In [25]:
# Turn each single-element prediction row into a hard 0/1 label by rounding.
rounded_train = [round(prob) for (prob,) in y_pred_train]

In [26]:
# Confusion matrix on the training set (scikit-learn convention: rows are the
# true labels, columns the predicted labels).
confusion_matrix(y_train, rounded_train)

array([[ 12252,   9578],
       [  2485, 190180]])

In [27]:
# Sigmoid outputs of model1 for the held-out split.
y_pred = model1.predict(X_test)

In [28]:
# Turn each single-element prediction row into a hard 0/1 label by rounding.
rounded = [round(prob) for (prob,) in y_pred]

In [29]:
# Confusion matrix on the held-out split (scikit-learn convention: rows are
# the true labels, columns the predicted labels).
confusion_matrix(y_test, rounded)

array([[  779,  6498],
       [ 3876, 60346]])

# Sequential Model

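#### Next we want to create a more complicated model and see how well it performs. While this model did perform slightly better at 89% accuracy, it took nearly 10 times as long to run. Comparing this model to a tuned simplified model, this was not worth it (see Notebook 6). Note that the majority class alone makes up about 89.8% of the data (256,887 of 285,994 reviews), so accuracy by itself is a weak measure of how well a model performs here.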

In [30]:
X1 = original_df['review']
y1 = original_df['target']
# train_test_split is already imported in the first cell, so it is not
# re-imported here.
# The split is seeded and stratified on the label: the target is heavily
# imbalanced, and an unseeded, unstratified split gives a different class mix
# (and different scores) on every run.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1,
    y1,
    test_size=0.20,
    random_state=42,
    stratify=y1,
)

In [31]:

# Word-level tokenizer limited to `vocab_size` ids, with a dedicated
# out-of-vocabulary token. It is fitted on the training texts only.
vocab_size = 5000
oov_token = "<OOV>"
tokenizer = Tokenizer(
    oov_token=oov_token,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(X_train1)

In [32]:
# Map each review to its sequence of integer token ids using the fitted tokenizer.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train1, X_test1)
)

In [33]:
# Fixed-length inputs: pad and truncate at the end of each sequence.
max_length = 100
padding_type = "post"
trunction_type = "post"
X_train_padded, X_test_padded = (
    pad_sequences(
        seqs,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunction_type,
    )
    for seqs in (X_train_sequences, X_test_sequences)
)

In [34]:
# Unpack the downloaded GloVe archive into /tmp/glove.
with zipfile.ZipFile('./glove.6B.zip') as glove_zip:
    glove_zip.extractall(path='/tmp/glove')

In [35]:
# Build a word -> vector lookup from the GloVe text file. Each line holds a
# token followed by its float coefficients, separated by whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way, and the
# explicit encoding avoids depending on the platform default when decoding
# the UTF-8 GloVe file.
with open('/tmp/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [36]:
# Spot-check: show the vector stored for one word.
embeddings_index.get("paper")

array([-0.85034  ,  0.33358  , -0.65889  , -0.49871  ,  0.36585  ,
       -0.19245  ,  0.25658  , -0.053408 ,  0.31474  ,  0.2443   ,
        0.29337  , -0.44917  ,  0.15175  ,  0.39314  , -0.31786  ,
        0.060525 ,  0.81775  , -0.38847  ,  0.76761  , -1.1041   ,
       -0.1544   ,  0.31655  , -0.37238  , -0.11485  ,  0.51635  ,
       -0.39289  ,  0.16301  , -0.2532   , -0.50976  ,  0.15201  ,
        0.27808  ,  0.52522  , -0.38815  , -0.3472   , -0.61818  ,
        0.17022  ,  0.12251  , -0.24191  , -0.38877  , -0.53176  ,
       -0.46987  , -0.70502  , -0.62126  , -0.38689  , -0.85637  ,
       -0.41003  , -0.47487  , -0.21083  , -0.81338  , -0.52398  ,
        0.49894  ,  0.37909  ,  0.55428  ,  1.123    , -0.42121  ,
       -1.5674   , -0.56892  ,  0.40819  ,  1.7949   ,  0.16856  ,
       -0.0029332,  0.28786  , -0.90088  , -0.094214 ,  0.79993  ,
       -0.39096  ,  0.76286  ,  0.71307  ,  0.13194  , -0.40756  ,
       -0.18687  ,  0.89562  ,  0.46867  , -0.0028801,  0.0253

In [37]:
embedding_matrix = np.zeros((len(X) + 1, max_length))
for word, i in X1.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [38]:
embedding_layer = Embedding(input_dim=len(X) + 1,
                            output_dim=max_length,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)


In [39]:
# Same CNN head as the first model, on top of the pre-built embedding layer.
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [40]:
# Binary classification setup: sigmoid output paired with binary cross-entropy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [41]:
# Train for 20 epochs.
# NOTE(review): validation_data is the held-out test split, so the per-epoch
# validation metrics are computed on the same data that is later reported as
# the test accuracy.
history = model.fit(
    X_train_padded,
    y_train1,
    epochs=20,
    validation_data=(X_test_padded, y_test1),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [42]:
# Accuracy of the GloVe-based model on the held-out split.
loss, accuracy = model.evaluate(X_test_padded, y_test1)
print(f'Testing Accuracy is {accuracy * 100} ')

Testing Accuracy is 89.80051875114441 
