# Model 1 - Primarily on Text
## POS tagged, Tokenized, Bigrams, Stemmed
## LSTM 


In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from tqdm import tqdm
import string

In [None]:
#Loading the dataset
#header = None tells pandas not to treat the first row as a header, so the columns get
#integer labels (0, 1, 2, ...); later cells address them as data[1], data[2]
data = pd.read_csv("E://Yelp//Unfiltered data//YelpZip//textonly", header = None)
print(data.shape)
#Last expression of the cell: the notebook displays the first rows
data.head()

In [None]:
#Sentence Tokenization - Not applying back to the reviews at this phase - future steps
#tqdm.pandas() registers progress_apply on pandas objects
tqdm.pandas()
#The result is not assigned; it is only shown as the cell output
data[1].progress_apply(sent_tokenize)

In [None]:
#Word Tokenization
#English stop words from the NLTK corpus, collected into a set and printed for inspection
#NOTE(review): nothing in this cell passes this set to text_process
stop_words=set(stopwords.words("english"))
print("Stopwords from the NLTK corpus: ", stop_words)

#Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

#Lazily filled cache so the NLTK stop-word list is read once, not once per word
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(text, stop_words=None):
    """Strip punctuation from ``text`` and return its words minus stop words.

    Args:
        text: the review text as a string.
        stop_words: optional container of lower-case stop words (anything
            supporting ``in``). When omitted, the NLTK English list is used.

    Returns:
        List of whitespace-separated words, in their original case and order,
        whose lower-cased form is not a stop word.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    #String punctuation provides all the necessary checks
    nopunc = text.translate(_PUNCTUATION_TABLE)
    #Set membership is O(1); the previous per-word stopwords.words() call re-read
    #the corpus and scanned a list for every single word
    return [word for word in nopunc.split() if word.lower() not in stop_words]

#Register progress_apply, then replace every review in column 1 with its cleaned word list
tqdm.pandas()
data[1] = data[1].progress_apply(text_process)

In [None]:
#Process took 6 hours 46 minutes - on last run - SAVE
#pandas DataFrames have no to_table method (the call raised AttributeError); to_csv writes the checkpoint
#header=False / index=False keep the file header-less and without an extra index column, so
#re-reading it with read_csv(..., header=None) gives the same integer column positions
#NOTE(review): column 1 holds Python lists here; to_csv stores them as their string form,
#so they must be parsed back into lists after reloading
data.to_csv("E://Yelp//Unfiltered data//YelpZip//text_mod1", header=False, index=False)

In [None]:
#Adding a column of sentiment polarity - averaged for the review text - might be useful during model building
#Implemented here as it's easy to deal with the texts at this stage
from textblob import TextBlob

def sentiment_index(txt):
    """Return the mean TextBlob polarity over the items of ``txt``.

    Each item is scored on its own via ``TextBlob(item).sentiment.polarity``
    and the scores are averaged.

    Args:
        txt: iterable of strings (in this notebook, the word list of one review).

    Returns:
        The average polarity as a float, or 0.0 when ``txt`` is empty
        (e.g. a review that consisted only of stop words or punctuation),
        which previously raised ZeroDivisionError.
    """
    polarities = [TextBlob(item).sentiment.polarity for item in txt]
    if not polarities:
        #Nothing to score: report neutral instead of dividing by zero
        return 0.0
    return sum(polarities) / len(polarities)
 
#Column 4 receives the averaged polarity computed from each review's entries in column 1
data[4] = data[1].apply(sentiment_index)

In [None]:
#Stemming
#One Porter stemmer instance, created once and reused by every call to stem()
stemmer = PorterStemmer()

def stem(txt):
    """Return a new list with the stem of every item of ``txt``, in order.

    Uses the module-level ``stemmer`` instance; the input is not modified.
    """
    return [stemmer.stem(token) for token in txt]

#Replace each word list in column 1 with its stemmed counterpart
data[1] = data[1].progress_apply(stem)

In [None]:
#POS tagging
#Each token list in column 1 is replaced by the output of nltk.pos_tag for that list
data[1] = data[1].progress_apply(nltk.pos_tag)

In [None]:
#Join back the list objects and their POS tags for CountVectorizer
def join_back(txt):
    """Flatten a sequence of string tuples into one string.

    The parts of each item are glued together with "_" and the resulting
    units are separated by single spaces, e.g. [("a", "B"), ("c", "D")]
    becomes "a_B c_D". An empty sequence gives an empty string.
    """
    return ' '.join('_'.join(parts) for parts in txt)

#Turn every tagged token list in column 1 back into a single string
data[1] = data[1].progress_apply(join_back)

#Save Checkpoint
#pandas DataFrames have no to_table method (the call raised AttributeError); to_csv writes the checkpoint
#header=False / index=False keep the file header-less and without an extra index column, so
#re-reading it with read_csv(..., header=None) gives the same integer column positions
#NOTE: this is the same path as the checkpoint written after word tokenization, so that file is overwritten
data.to_csv("E://Yelp//Unfiltered data//YelpZip//text_mod1", header=False, index=False)

#Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#Tokenizing and cleaning a second time: the vectorizer re-splits the joined strings into alphanumeric runs
#NOTE(review): the pattern does not include "_", so each "word_TAG" unit produced by join_back
#is split into separate "word" and "TAG" tokens before bigrams are built - confirm this is intended
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#ngram_range = (2,2): only bigrams are counted, no unigrams
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (2,2),tokenizer = token.tokenize)
#Learn the vocabulary from all rows of column 1 and build the document-term count matrix
#NOTE(review): the fit happens before the train/test split, so test rows contribute to the vocabulary
text_counts = cv.fit_transform(data[1])

In [None]:
from sklearn.model_selection import train_test_split
#Features are the bigram counts; data[2] is used as the target (presumably the label column - verify)
#test_size=0.2 holds out 20% of the rows; random_state=1 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(text_counts, data[2], test_size=0.2, random_state=1)

In [None]:
#Decrease size of the sparse matrix
#Feature engineering- sentiment index

In [None]:
#Need regularization
#Cross validation?
#Confusion matrix?

# Training an LSTM Neural Network on the Data

In [None]:
#Determining the input shape. (Shape of dictionary)
#Number of columns of the document-term matrix, i.e. how many distinct bigrams the vectorizer kept
max_words = text_counts.shape[1]

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

#Width of each embedding vector
embd_size=32
#Embedding table has one row per column of the document-term matrix
vocab_size = max_words

#Embedding -> LSTM(200) -> single sigmoid unit for a binary decision
#NOTE(review): an Embedding layer looks up integer token ids, while the training data used with this
#model is sliced from the CountVectorizer count matrix, and input_length = max_words makes
#every sample a sequence as long as the whole vocabulary - confirm the intended input encoding
model=Sequential()
model.add(Embedding(vocab_size, embd_size, input_length = max_words))
model.add(LSTM(200))
#Add dropout - Regularization
#Add ensemble of engineered features
model.add(Dense(1, activation = 'sigmoid'))

#summary() prints the table itself and returns None; wrapping it in print() emitted a stray "None"
model.summary()

In [None]:
#Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked as the metric
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
batch_size = 100
num_epochs = 10

#The first batch_size rows of the training split are set aside for validation,
#the remaining rows are used for fitting
#NOTE(review): this ties the validation-set size (100 rows) to the batch size - confirm this is intended
X_valid, y_valid = X_train[:batch_size], y_train[:batch_size]
X_train2, y_train2 = X_train[batch_size:], y_train[batch_size:]

#Train for num_epochs passes, reporting loss/accuracy on the held-out validation rows
model.fit(X_train2, y_train2, validation_data = (X_valid, y_valid), batch_size = batch_size, epochs = num_epochs)

In [None]:
#evaluate returns the loss followed by the compiled metrics, so index 1 is the accuracy
#(acc therefore holds both values, not the accuracy alone)
acc = model.evaluate(X_test, y_test, verbose = 0)
print('Accuracy:', acc[1])

In [None]:
#Add k fold cross validation