In [69]:
from collections import Counter
from datetime import datetime
 
import json
 
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
 
import numpy as np

In [70]:
t1 = datetime.now()
with open("yelp_data/dataset/review.json") as f:
    reviews = f.read().strip().split("\n")
reviews = [json.loads(review) for review in reviews]
print(datetime.now() - t1)

0:01:08.040517


In [71]:
print(len(reviews))

5261669


In [105]:
texts = [review['text'] for review in reviews]
 
# Convert our 5 classes into 2 (negative or positive)
numStars = [review['stars'] for review in reviews]
balanced_texts = []
balanced_labels = []
limit = 20000  # Change this to grow/shrink the dataset
pos_counts = [0, 0, 0, 0, 0]
for i in range(len(texts)):
    score = numStars[i]-1
    if pos_counts[score] < limit:
        balanced_texts.append(texts[i])
        balanced_labels.append(numStars[i]-1)
        pos_counts[score] += 1

In [106]:
print(balanced_labels)

[4, 4, 4, 4, 3, 3, 4, 3, 3, 2, 4, 3, 2, 0, 2, 4, 3, 0, 2, 2, 0, 2, 3, 2, 0, 2, 2, 1, 4, 2, 3, 3, 0, 3, 3, 3, 4, 3, 3, 3, 3, 1, 3, 3, 0, 0, 3, 3, 3, 3, 3, 2, 3, 3, 1, 2, 3, 3, 3, 1, 3, 0, 0, 2, 2, 1, 4, 3, 3, 2, 2, 2, 3, 3, 0, 3, 1, 2, 3, 2, 2, 3, 0, 3, 3, 3, 3, 4, 0, 3, 3, 3, 4, 4, 0, 0, 0, 0, 3, 0, 1, 3, 3, 2, 1, 0, 1, 2, 2, 0, 1, 3, 1, 2, 1, 3, 3, 1, 0, 4, 3, 3, 4, 0, 4, 3, 0, 1, 3, 3, 1, 2, 2, 4, 3, 3, 2, 0, 3, 3, 2, 2, 3, 3, 0, 1, 3, 4, 3, 1, 3, 3, 2, 3, 2, 1, 2, 3, 1, 1, 3, 2, 1, 2, 1, 2, 3, 1, 4, 2, 3, 1, 0, 3, 3, 4, 3, 3, 4, 4, 4, 4, 2, 3, 3, 1, 3, 3, 4, 3, 2, 3, 2, 4, 3, 1, 1, 4, 3, 0, 3, 3, 2, 4, 2, 4, 2, 3, 0, 4, 1, 1, 4, 3, 4, 1, 3, 3, 3, 2, 3, 3, 2, 4, 2, 0, 4, 3, 2, 1, 4, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 1, 2, 1, 2, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 3, 2, 2, 4, 1, 1, 3, 2, 4, 4, 2, 3, 4, 3, 3, 2, 3, 3, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 3, 2, 4, 3, 3, 3, 3, 2, 2, 0, 2, 3, 2, 2, 4, 0, 3, 1, 2, 4, 4, 2, 0, 2, 2, 2, 3, 1, 1, 3, 2, 4, 1, 2, 2, 1, 3, 2, 1, 2, 3, 3, 1, 2, 4, 2, 2, 0, 3, 

In [93]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=300)

In [114]:
from keras.utils import to_categorical

model = Sequential()
model.add(Embedding(20000, 128, input_length=300))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],)
newLabels=np.array(balanced_labels)
newLabels=to_categorical(newLabels)
print(newLabels.shape)

(100000, 5)


In [115]:
history=model.fit(data, newLabels, validation_split=0.2, epochs=3)

Train on 80000 samples, validate on 20000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [122]:
binstars = [0 if review['stars'] <= 3 else 1 for review in reviews]
bin_texts = []
bin_labels = []
limit = 50000  # Change this to grow/shrink the dataset
neg_pos_counts = [0, 0]
for i in range(len(texts)):
    polarity = binstars[i]
    if neg_pos_counts[polarity] < limit:
        bin_texts.append(texts[i])
        bin_labels.append(binstars[i])
        neg_pos_counts[polarity] += 1

In [123]:
model = Sequential()
model.add(Embedding(20000, 128, input_length=300))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'],)
print(len(bin_labels))

100000


In [124]:
history=model.fit(data, np.array(bin_labels), validation_split=0.5, epochs=3)

Train on 50000 samples, validate on 50000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [127]:
# BIGGER FOR 5 STARS

texts = [review['text'] for review in reviews]
 
# Convert our 5 classes into 2 (negative or positive)
numStars = [review['stars'] for review in reviews]
balanced_texts = []
balanced_labels = []
limit = 100000  # Change this to grow/shrink the dataset
pos_counts = [0, 0, 0, 0, 0]
for i in range(len(texts)):
    score = numStars[i]-1
    if pos_counts[score] < limit:
        balanced_texts.append(texts[i])
        balanced_labels.append(numStars[i]-1)
        pos_counts[score] += 1
        
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=300)
        
from keras.utils import to_categorical

model = Sequential()
model.add(Embedding(20000, 128, input_length=300))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(5, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'],)
newLabels=np.array(balanced_labels)
newLabels=to_categorical(newLabels)
print(newLabels.shape)
print(data.shape)

history_big=model.fit(data, newLabels, validation_split=0.4, epochs=5)

(500000, 5)
(500000, 300)
Train on 300000 samples, validate on 200000 samples
Epoch 1/5
 14240/300000 [>.............................] - ETA: 1:57:14 - loss: 1.3001 - acc: 0.4321

KeyboardInterrupt: 