In [1]:
from collections import Counter
from datetime import datetime

import json

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np

Using TensorFlow backend.


In [2]:
# Load the reviews: the file is parsed as one JSON document per line.
t1 = datetime.now()
with open("chunk00000.json", encoding='utf-8') as f:
    # Stream the file line by line instead of reading it into one big string
    # and splitting it; blank lines (or an empty file) are skipped so they
    # cannot crash json.loads.
    reviews = [json.loads(line) for line in f if line.strip()]
# Report how long loading and parsing took
print(datetime.now() - t1)

0:00:09.517536


In [3]:
# Get a balanced sample of positive and negative reviews
texts = [review['text'] for review in reviews]

# Convert our 5 classes into 2: 1-3 stars -> 0 (negative), 4-5 stars -> 1 (positive)
binstars = [0 if review['stars'] <= 3 else 1 for review in reviews]

limit = 100000  # Max reviews kept per class; change this to grow/shrink the dataset
neg_pos_counts = [0, 0]  # reviews collected so far, indexed by label (0 or 1)
balanced_texts = []
balanced_labels = []
for text, polarity in zip(texts, binstars):
    if neg_pos_counts[polarity] < limit:
        balanced_texts.append(text)
        balanced_labels.append(polarity)
        neg_pos_counts[polarity] += 1
        # Both quotas filled: nothing more can be added, so stop scanning
        if min(neg_pos_counts) >= limit:
            break

# Shuffle texts and labels together with a fixed seed. Reviews are collected in
# file order, so once one class hits its quota the tail of the sample is
# dominated by the other class; Keras' validation_split takes the *last*
# fraction of the data without shuffling, which would skew the validation set.
order = np.random.RandomState(42).permutation(len(balanced_texts))
balanced_texts = [balanced_texts[i] for i in order]
balanced_labels = [balanced_labels[i] for i in order]

In [4]:
# Sanity check of the class balance: each label should appear `limit` times,
# provided the input contained at least that many reviews of each class.
Counter(balanced_labels)
# >>> Counter({0: 100000, 1: 100000})

Counter({0: 100000, 1: 100000})

In [5]:
# Toy example of how Tokenizer turns texts into sequences of integer word indices.
# Going by the printed results further down, indices look to be assigned by
# descending word frequency, and num_words=5 keeps only indices 1-4: any word
# with a higher index is dropped from the generated sequences.
tokenizer = Tokenizer(num_words=5)
toytexts = ["Is is a common word", "So is the", "the is common", "discombobulation is not common"]
tokenizer.fit_on_texts(toytexts)
sequences = tokenizer.texts_to_sequences(toytexts)


# Resulting value of `sequences`: [[1, 1, 4, 2], [1, 3], [3, 1, 2], [1, 2]]

In [6]:
# Show the encoded toy texts; only word indices below num_words (5) appear.
print(sequences)
# >>> [[1, 1, 4, 2], [1, 3], [3, 1, 2], [1, 2]]

[[1, 1, 4, 2], [1, 3], [3, 1, 2], [1, 2]]


In [7]:
# word_index still lists every word seen while fitting (indices 1-8 here);
# the num_words cut-off only shows up in the sequences produced from the texts.
print(tokenizer.word_index)

{'discombobulation': 7, 'so': 6, 'word': 5, 'the': 3, 'common': 2, 'not': 8, 'is': 1, 'a': 4}


In [8]:
# Bring the toy sequences to a common length. As the printed array shows,
# shorter sequences are padded with 0 on the left up to the longest length.
padded_sequences = pad_sequences(sequences)

print(padded_sequences)

[[1 1 4 2]
 [0 0 1 3]
 [0 3 1 2]
 [0 0 1 2]]


In [9]:
# Preprocessing settings for the real dataset, named so they are easy to find and tune
VOCAB_SIZE = 20000          # passed as num_words: only word indices below this are kept
MAX_SEQUENCE_LENGTH = 300   # every review is padded or truncated to this many tokens

# Fit the vocabulary on the balanced reviews, encode them as integer sequences,
# then pad/truncate them to a fixed length so they form a single 2-D array.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# CNN + LSTM binary sentiment classifier over the padded word-index sequences.
# The vocabulary size and sequence length are read from the fitted tokenizer
# and the padded data instead of being repeated as literals, so the model
# cannot silently drift out of sync with the preprocessing settings.
vocab_size = tokenizer.num_words
sequence_length = data.shape[1]

model = Sequential()
# Map each word index to a 128-dimensional vector
model.add(Embedding(vocab_size, 128, input_length=sequence_length))
model.add(Dropout(0.2))
# 64 convolution filters of width 5, then pooling by 4 to shorten the
# sequence the LSTM has to process
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
# Single sigmoid unit: output is the predicted probability of label 1
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE: Keras takes the validation_split fraction from the *end* of the arrays
# before shuffling, so `data`/labels must not be ordered by class.
model.fit(data, np.array(balanced_labels), validation_split=0.5, epochs=3)

In [None]:
# Second fit call on the same `model` object, with the same arguments as the first.
# NOTE(review): presumably this continues training for 3 more epochs from the
# current weights rather than starting over — confirm that is intended.
model.fit(data, np.array(balanced_labels), validation_split=0.5, epochs=3)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

# Baseline for comparison: TF-IDF features over unigrams and bigrams
# (min_df=3 ignores terms found in fewer than 3 documents) fed to a linear SVM.
t1 = datetime.now()
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3)
classifier = LinearSVC()
Xs = vectorizer.fit_transform(balanced_texts)

# Time spent vectorizing, and the shape of the feature matrix
print(datetime.now() - t1)
print(Xs.shape)

# 2-fold cross-validation, run in parallel (n_jobs=-1)
# NOTE(review): the vectorizer is fit on all texts before the split, so each
# fold's features include statistics from its held-out half — confirm this is
# acceptable for a baseline.
score = cross_val_score(classifier, Xs, balanced_labels, cv=2, n_jobs=-1)

# Cumulative time since t1 (vectorizing + cross-validation), the per-fold
# scores, and their mean
print(datetime.now() - t1)
print(score)
print(sum(score) / len(score))

In [None]:
import pickle

# Persist both artifacts needed to score new text later: the fitted tokenizer
# (pickled) and the trained Keras model (HDF5).
with open("keras_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
model.save("yelp_sentiment_model.hdf5")