# Language Warmup Final Model

## Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import string


# Read the tab-separated review file. It has no header row, so the two
# columns are labelled by hand right after loading.
yelpDataset = pd.read_csv(
    'Yelp.txt',
    sep='\t',
    header=None,
    encoding='latin-1',
)
yelpDataset.columns = ['review', 'sentiment']

# Show the first five rows as a quick sanity check.
print(yelpDataset.head(5))

def removePunct(text):
    """Return ``text`` with every ``string.punctuation`` character dropped.

    All remaining characters are kept in their original order.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return ''.join(kept_chars)

# Lower-case every review and strip its punctuation into a new column.
yelpDataset['review_clean'] = [
    removePunct(raw_review.lower()) for raw_review in yelpDataset['review']
]

# Single-column 2-D numpy array holding the cleaned reviews
df1 = yelpDataset[['review_clean']].values
# Single-column 2-D numpy array holding the sentiment labels
df2 = yelpDataset[['sentiment']].values

                                              review  sentiment
0                           Wow... Loved this place.          1
1                                 Crust is not good.          0
2          Not tasty and the texture was just nasty.          0
3  Stopped by during the late May bank holiday of...          1
4  The selection on the menu was great and so wer...          1


In [2]:
# df1 and df2 are 2-D arrays: every row is itself a one-element array
# (printing them shows two sets of brackets), so unwrap each row.

# Review text: take the single entry of each df1 row
phrases = [row[0] for row in df1]

# Sentiment labels: the same unwrapping applied to df2
y_dat = [row[0] for row in df2]

print(phrases[:5])
print(y_dat[:5])

['wow loved this place', 'crust is not good', 'not tasty and the texture was just nasty', 'stopped by during the late may bank holiday off rick steve recommendation and loved it', 'the selection on the menu was great and so were the prices']
[1, 0, 0, 1, 1]


## Feature Engineering

In [3]:
# Build bigram presence features from the cleaned review phrases.
#
# The fake-data generator that used to sit here as a triple-quoted string has
# been removed: it was dead code already marked for removal, its
# "FeatureCreate\words.txt" literal contained an invalid "\w" escape, and if
# re-enabled it would have overwritten the real `phrases` list.

from sklearn.feature_extraction.text import CountVectorizer

# binary=True        -> record whether a bigram occurs, not how many times
# lowercase=False    -> the phrases were already lower-cased during cleaning
# ngram_range=(2, 2) -> use two-word sequences (bigrams) only
vectorizer = CountVectorizer(binary=True, lowercase=False, ngram_range=(2, 2))
vector = vectorizer.fit_transform(phrases)

In [4]:
# Convert the vectorizer output to a dense numpy array.
# toarray() returns an ndarray directly, so there is no need to go through
# the intermediate np.matrix that todense() produces.

data = vector.toarray()
print(type(data))


<class 'numpy.ndarray'>


In [5]:
# Carve the rows into train / validation / test partitions by position:
#   train      -> first 300 rows plus last 300 rows
#   validation -> rows 300-399 plus rows 600-699
#   test       -> rows 400-599

train_slices = (slice(None, 300), slice(-300, None))
val_slices = (slice(300, 400), slice(600, 700))
test_slice = slice(400, 600)

x_train = np.concatenate([data[part] for part in train_slices])
y_train = np.concatenate([y_dat[part] for part in train_slices])
x_val = np.concatenate([data[part] for part in val_slices])
y_val = np.concatenate([y_dat[part] for part in val_slices])
x_test = np.concatenate([data[test_slice]])
y_test = np.concatenate([y_dat[test_slice]])

for features in (x_train, x_val, x_test):
    print(features.shape)

(600, 6687)
(200, 6687)
(200, 6687)


## Model Architecture

In [6]:
# Define the network: two 16-unit ReLU hidden layers feeding a single
# sigmoid output unit.

from keras import layers, models

n_features = x_train.shape[1]

model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(n_features,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

Using TensorFlow backend.


In [7]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# accuracy as the reported metric.

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [8]:

# Fit the model for 20 epochs in mini-batches of 64, passing the validation
# split so it is evaluated alongside training.

history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(x_val, y_val),
)

Train on 600 samples, validate on 200 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:

# Score the trained model on the held-out test split.

results = model.evaluate(x_test, y_test)
test_accuracy = results[1]  # entry 1 is the 'accuracy' metric requested at compile time
print("Accuracy:", test_accuracy)

Accuracy: 0.71
