# Language Warmup Full Model

## Data Cleaning

In [145]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from autocorrect import spell

# Load the tab-separated review file. It is read with header=None, so the
# two columns are given their names right after loading.
yelpDataset = pd.read_csv(
    'Yelp.txt',
    sep='\t',
    header=None,
    encoding='latin-1',
)
yelpDataset.columns = ['review', 'sentiment']

# NLTK's English stop-word list with 'not' taken out, so that noStop()
# leaves 'not' in the reviews.
stopword = list(filter(lambda word: word != 'not',
                       nltk.corpus.stopwords.words('english')))

# Shared WordNet lemmatizer instance used by lemmatize().
lm = nltk.WordNetLemmatizer()

def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to split.

    Returns
    -------
    list of str
        The pieces of ``text`` between separators. ``re.split`` keeps the
        empty strings that arise when ``text`` starts or ends with a
        separator (or is empty), so the result may contain ``''`` entries.
    """
    # Raw string: '\W' inside a normal string literal is an invalid escape
    # sequence (a SyntaxWarning on Python 3.12+). The pattern is unchanged.
    return re.split(r'\W+', text)

def onlyAlpha(tokenizedList):
    """Return the tokens that consist solely of alphabetic characters.

    Tokens for which ``str.isalpha()`` is false (digits, mixed
    alphanumerics, empty strings) are dropped; order is preserved.
    """
    alphabetic = []
    for token in tokenizedList:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def noStop(tokenizedList, stopwords=None):
    """Remove stop words from a list of string tokens.

    Parameters
    ----------
    tokenizedList : iterable of str
        Tokens to filter.
    stopwords : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopword`` list is
        used, which matches the original one-argument behaviour.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    if stopwords is None:
        stopwords = stopword
    # A frozenset gives constant-time membership tests instead of scanning
    # the whole stop-word list once per token.
    excluded = frozenset(stopwords)
    return [word for word in tokenizedList if word not in excluded]

def spellCheck(tokenizedList):
    """Apply ``spell`` to every token and return the results as a list.

    ``spell`` is the function imported from ``autocorrect`` at the top of
    the file.
    NOTE(review): presumably it returns the corrected spelling of a single
    word -- confirm against the installed autocorrect version.
    """
    return list(map(spell, tokenizedList))

def lemmatize(tokenizedList):
    """Lemmatize each token and join the results into one string.

    Every token is passed through the module-level WordNet lemmatizer
    ``lm``. Note that the return value is a single space-separated string,
    not a list of tokens.
    """
    lemmas = map(lm.lemmatize, tokenizedList)
    return ' '.join(lemmas)

#def posTag(tokenizedList):
#    text = nltk.pos_tag(tokenizedList)
#    return text



# Run the cleaning stages in order. Each stage's output is stored in its own
# column so the intermediate results stay available for inspection.
yelpDataset['review_tokens'] = yelpDataset['review'].apply(lambda review: tokenize(review.lower()))
yelpDataset['review_alpha'] = yelpDataset['review_tokens'].apply(onlyAlpha)
yelpDataset['review_nostops'] = yelpDataset['review_alpha'].apply(noStop)
yelpDataset['review_spellCheck'] = yelpDataset['review_nostops'].apply(spellCheck)
yelpDataset['review_lemmatized'] = yelpDataset['review_spellCheck'].apply(lemmatize)
#yelpDataset['review_posTag'] = yelpDataset['review_lemmatized'].apply(lambda x: posTag(x))

# Plain Python lists for the later steps: df1 holds the cleaned review
# strings (vectorized later), df2 holds the matching sentiment labels.
df1 = yelpDataset['review_lemmatized'].tolist()
df2 = yelpDataset['sentiment'].tolist()

## Feature Engineering and Vectorization

In [146]:
# Vectorize the cleaned reviews as binary bag-of-words features.
# The active vectorizer keeps CountVectorizer's default n-gram range; the
# commented-out alternative below additionally includes 2-grams.

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    lowercase=False,  # the reviews were already lower-cased before tokenizing
    binary=True,      # record presence/absence of a term rather than its count
)
#vectorizer = CountVectorizer(binary=True, lowercase=False, ngram_range=(1, 2))
vector = vectorizer.fit_transform(df1)

In [147]:
# Convert the sparse document-term matrix to a dense numpy array.
# toarray() returns an ndarray directly, so there is no intermediate
# np.matrix (which todense() would produce) to convert afterwards.

data = vector.toarray()
print(type(data))

<class 'numpy.ndarray'>


In [148]:
# Split into train, validation, and test sets by fixed row positions:
#   train      -> first 300 rows + last 300 rows
#   validation -> rows 300:400 + rows 600:700
#   test       -> rows 400:600
# NOTE(review): the last-300 slice only stays disjoint from rows 600:700 if
# the dataset has at least 1000 rows -- confirm the row count.

x_train = np.concatenate((data[:300], data[-300:]), axis=0)
y_train = np.concatenate((df2[:300], df2[-300:]), axis=0)

x_val = np.concatenate((data[300:400], data[600:700]), axis=0)
y_val = np.concatenate((df2[300:400], df2[600:700]), axis=0)

# Copy the feature slice so x_test is a new array rather than a view of data.
x_test = data[400:600].copy()
y_test = np.asarray(df2[400:600])

print(x_train.shape, x_val.shape, x_test.shape, sep='\n')

(600, 1742)
(200, 1742)
(200, 1742)


## Model Architecture

In [149]:
# Define the network: two 16-unit ReLU hidden layers followed by a single
# sigmoid output unit. The input width is taken from the number of feature
# columns in x_train.

from keras import models
from keras import layers

model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(x_train.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [150]:
# Configure training: binary cross-entropy loss, the RMSprop optimizer, and
# accuracy reported as the metric.

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [151]:
# Train for 20 epochs in mini-batches of 64, checking the validation split
# after every epoch. The object returned by fit() is kept in `history`.

history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=(x_val, y_val),
)

Train on 600 samples, validate on 200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [152]:
# Score the trained model on the held-out test split.
# The second entry of `results` is printed as the accuracy; presumably
# evaluate() returns [loss, accuracy] given metrics=['accuracy'] at compile
# time -- confirm against the Keras version in use.

results = model.evaluate(x=x_test, y=y_test)
print("Accuracy:", results[1])

Accuracy: 0.77
