# The Final Neural Network

In [4]:
import pandas as pd
import string
import nltk
import numpy as np
import keras
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

In [None]:
# Load the raw review table. header=None makes pandas treat the first line as data,
# so the columns carry integer labels until they are renamed further down.
data = pd.read_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\textonly", header = None)
# Preview the first rows to sanity-check the parse.
data.head()

In [None]:
# (rows, columns) of the loaded table.
print(data.shape)
# Non-null counts per distinct value of column 5 (the column that is named 'real'
# once the columns are renamed).
print(data.groupby(5).count())

In [6]:
# Assign readable column names; 'review' is the text that gets processed and
# 'real' is the column later used as the training target.
data.columns = ['index', 'user', 'item', 'review', 'rating', 'real', 'date']

## The first step would normally be to remove stop words. However, because we apply TF-IDF to the sparse document-term matrix, we do not need to remove them explicitly: TF-IDF ensures that stop words do not receive any extra weight. We will also selectively drop terms from the matrix.
## Skipping explicit stop-word removal also avoids the 6-7 hours of processing time it would require on a dataset of this size.


In [None]:
#Removing punctuations - Example
# Walk through the cleaning steps on one sentence, printing every intermediate result.
test = "This is a test-sentence to remove stop-word's!!! . . ."
# Translation table: the third argument lists the characters to delete.
trans = str.maketrans('', '', string.punctuation)
words = test.split()
print(words)
print(string.punctuation)

# Strip punctuation token by token; tokens made only of punctuation become "".
stripped = list(map(lambda token: token.translate(trans), words))
print(stripped)
# Drop the now-empty tokens by building a new list. Removing items from `stripped`
# while looping over it would skip elements.
a = list(filter(None, stripped))
print(a)
# Lower-case the remaining tokens, show the list, then join it back into one string.
b = list(map(str.lower, a))
print(b)
b = ' '.join(b)
print(b)

In [None]:
def clean_text(text):
    """Normalize a review: strip punctuation, lower-case, collapse whitespace.

    Args:
        text: The raw review string.

    Returns:
        The whitespace-separated tokens of ``text`` with every character in
        ``string.punctuation`` removed, lower-cased and re-joined with single
        spaces. Tokens that consist only of punctuation are dropped.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    # Lazily clean each whitespace-delimited token.
    cleaned_tokens = (
        token.translate(punctuation_table).lower() for token in text.split()
    )
    # Empty strings are what remains of punctuation-only tokens; skip them.
    return ' '.join(token for token in cleaned_tokens if token)

# Clean every review in place, with a progress bar; the function is passed directly
# since a wrapping lambda adds nothing.
data['review'] = data['review'].progress_apply(clean_text)

In [None]:
# Save checkpoint: write the current frame (with header row, without the pandas index)
# so the cleaning step does not have to be repeated.
data.to_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint", header=True, index=False)

# Use if kernel crashes: uncomment to reload the checkpoint instead of re-running the cleaning.
#data = pd.read_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint")

In [None]:
#Stemming/Lemmatization
# One shared WordNet lemmatizer instance; lemmatize_sentence below calls it for every tagged word.
lemmatizer = WordNetLemmatizer()
# Manual smoke test, left disabled:
#print(lemmatizer.lemmatize("going"))

def nltk2wn_tag(tag):
    """Map an NLTK part-of-speech tag to the matching WordNet POS constant.

    The decision is made on the start of the tag: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        tag: POS tag string as produced by ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of the four letters.
    """
    # (tag prefix, name of the wordnet constant), checked in this order.
    prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

def lemmatize_sentence(sentence):
    """Lemmatize every word of a sentence using its part-of-speech tag.

    The sentence is tokenized and POS-tagged with NLTK. Words whose tag maps to
    a WordNet POS (via ``nltk2wn_tag``) are lemmatized with that POS; all other
    words are kept unchanged.

    Args:
        sentence: The text to lemmatize.

    Returns:
        The resulting words joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos_tag in tagged_tokens:
        wn_pos = nltk2wn_tag(pos_tag)
        # No WordNet POS available: keep the token as it is.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# test =  "You better lose yourself in the music, the moment You own it, you better never let it go You only get one shot, do not miss your chance to blow This opportunity comes once in a lifetime"
# print(lemmatize_sentence(test))

# Lemmatize every review in place, with a progress bar; the function is passed
# directly since a wrapping lambda adds nothing.
data['review'] = data['review'].progress_apply(lemmatize_sentence)

In [5]:
#Save checkpoint (currently commented out, so this cell does not write anything)
# data.to_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint", header=True, index=False)

#Use if kernel crashes
# Replaces `data` with the contents of the checkpoint CSV; the file's first row is read as the header.
data = pd.read_csv(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\checkpoint")

## We do not add POS tags to the documents, because the POS tags were already used to transform each word into its lemma.

In [None]:
# Build a TF-IDF document-term matrix from the review text.
#   max_df=0.9 / min_df=20 : ignore terms found in more than 90% of the documents
#                            or in fewer than 20 documents
#   ngram_range=(1, 1)     : unigrams only
#   max_features=20000     : cap on the vocabulary size
vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 20, ngram_range = (1,1), max_features = 20000)
# fit_transform learns the vocabulary and vectorizes in a single pass. It gives the
# same matrix as fit() followed by transform(), without tokenizing the corpus twice
# or converting the column to str twice.
# astype(str) guards against non-string cells (presumably NaN from the CSV round-trip -- confirm).
vector = vectorizer.fit_transform(data['review'].astype(str))
print(vector.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the TF-IDF rows for testing; random_state=42 keeps the split reproducible.
# NOTE(review): `data['real']` is only mapped to 0/1 by `binarize` in a later cell. If it
# holds other values at this point, training with binary_crossentropy is affected -- confirm.
X_train, X_test, y_train, y_test = train_test_split(vector, data['real'], test_size=0.20, random_state=42)

In [None]:
# First attempt: Embedding -> LSTM -> Dropout -> sigmoid output, sized to the TF-IDF matrix width.
# NOTE(review): this model is fed the TF-IDF matrix, while an Embedding layer looks up
# integer indices. That is likely why a later cell says this approach "wasn't working" -- confirm.
model = Sequential([
    keras.layers.Embedding(20000, 32, input_length=vector.shape[1]),
    keras.layers.LSTM(500, dropout=0.2, recurrent_dropout=0.2, activation='sigmoid'),
    # Disabled variants that were tried:
    # keras.layers.LSTM(512, dropout=0.2, recurrent_dropout=0.2, activation='sigmoid'),
    # keras.layers.Dense(64, activation='sigmoid'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid'),
])
# Binary classification set-up: sigmoid output with binary cross-entropy, tracking accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Train for 2 epochs in batches of 10, holding out 20% of the training split for validation.
model.fit(X_train, y_train, validation_split=0.20, batch_size = 10, epochs=2)

In [None]:
from keras.models import load_model
# Save the model next to the CSV checkpoint rather than "inside" it.
# `...\Customs\checkpoint` is the CSV *file* written by data.to_csv and read back by
# pd.read_csv in this notebook, so it cannot also be the parent directory of the model file.
model.save(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\my_model.h5")  # creates a HDF5 file 'my_model.h5'

In [None]:
# Evaluate on the held-out test split; `scores` holds the loss followed by the
# compiled metrics (here: accuracy).
scores = model.evaluate(X_test, y_test)

In [None]:
# scores[1] is the accuracy metric as a fraction; shown here as a percentage.
print("Accuracy: ", scores[1]*100, "%")

In [None]:
# Hard class predictions for the test split.
# NOTE(review): `predict_classes` is not available in newer Keras/TensorFlow releases --
# confirm the installed version before re-running.
y_pred = model.predict_classes(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the held-out test split.
print(classification_report(y_test, y_pred))

In [None]:
# Peek at the first few processed reviews before switching to the Keras preprocessing below.
data['review'].head()

In [None]:
# Alternative approach: use Keras text preprocessing (Tokenizer + padded sequences), because the TF-IDF pipeline above was not working.

In [7]:
def binarize(x):
    """Collapse a label to 0/1.

    Args:
        x: The raw label value.

    Returns:
        1 if ``x`` equals 1, otherwise 0.
    """
    return 1 if x == 1 else 0
    
# Map the label column to 0/1 in place, with a progress bar; the function is passed
# directly since a wrapping lambda adds nothing.
data['real'] = data['real'].progress_apply(binarize)

HBox(children=(IntProgress(value=0, max=608458), HTML(value='')))




In [8]:
# Convert the column to str once and reuse it for fitting and for encoding.
review_texts = data['review'].astype(str)

# Word-index vocabulary; num_words=20000 bounds the word indices that are emitted.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(review_texts)

# Encode each review as a list of word indices, then force a fixed length of 150.
sequences = tokenizer.texts_to_sequences(review_texts)
data1 = pad_sequences(sequences, maxlen=150)

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded sequences for testing; random_state=42 keeps the split reproducible.
# This rebinds X_train/X_test/y_train/y_test from the earlier TF-IDF split.
X_train, X_test, y_train, y_test = train_test_split(data1, data['real'], test_size=0.20, random_state=42)

In [10]:
# Sequence model: Embedding -> LSTM -> Dense -> Dropout -> sigmoid output.
# The Embedding's 20000 and 150 match the Tokenizer's num_words and the pad_sequences
# maxlen used to build the inputs.
model = Sequential([
    Embedding(20000, 64, input_length=150),
    LSTM(1024, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='sigmoid'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
# Binary classification set-up: sigmoid output with binary cross-entropy, tracking accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 64)           1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024)              4460544   
_________________________________________________________________
dense_1 (Dense)              (None, 32)                32800     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 5,773,377
Trainable params: 5,773,377
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 2 epochs in batches of 1000, holding out 20% of the training split for validation.
model.fit(X_train, y_train, validation_split=0.20, batch_size = 1000, epochs=2)

Train on 389412 samples, validate on 97354 samples
Epoch 1/2


In [None]:
from keras.models import load_model
# Save the model next to the CSV checkpoint rather than "inside" it.
# `...\Customs\checkpoint` is the CSV *file* written by data.to_csv and read back by
# pd.read_csv in this notebook, so it cannot also be the parent directory of the model file.
model.save(r"E:\Yelp\Unfiltered Data\YelpZip\Customs\my_model.h5")  # creates a HDF5 file 'my_model.h5'

In [None]:
# Evaluate on the held-out test split; `scores` holds the loss followed by the
# compiled metrics (here: accuracy).
scores = model.evaluate(X_test, y_test)

In [None]:
# scores[1] is the accuracy metric as a fraction; shown here as a percentage.
print("Accuracy: ", scores[1]*100, "%")

In [None]:
# Hard class predictions for the test split.
# NOTE(review): `predict_classes` is not available in newer Keras/TensorFlow releases --
# confirm the installed version before re-running.
y_pred = model.predict_classes(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the held-out test split.
print(classification_report(y_test, y_pred))