In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.probability import FreqDist
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the pickled review DataFrame from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
REVIEWS_PICKLE = 'amazon_review_data.pd'
df = pd.read_pickle(REVIEWS_PICKLE)

In [3]:
# Shuffle every row. A fixed seed (the same value the train/test split uses)
# keeps the row order -- and every head-slice taken from it later -- reproducible
# across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [4]:
# Cast the floating-point rating column to integers
df = df.astype({'rating': int})

In [5]:
# Display the rating column (the repr also reports its length and dtype)
df['rating']

1055634    5
1137634    5
284960     5
1193986    4
115403     5
          ..
531943     5
1021656    4
388207     4
1035452    4
518937     1
Name: rating, Length: 1228064, dtype: int32

In [6]:
# One DataFrame per star rating, 1 through 5, selected with a boolean mask.
rating_1_df, rating_2_df, rating_3_df, rating_4_df, rating_5_df = (
    df[df['rating'] == stars] for stars in range(1, 6)
)

In [7]:
# Collapse the five star ratings into two sentiment classes:
#   ratings 1-3 -> negative (label 0), ratings 4-5 -> positive (label 1).
# pd.concat replaces DataFrame.append, which is deprecated and was removed in
# pandas 2.0. The concatenation order matches the original sequence of appends.
rating_negative_df = pd.concat([rating_3_df, rating_2_df, rating_1_df])
rating_positive_df = pd.concat([rating_5_df, rating_4_df])

# Overwrite the star rating with the binary class label. pd.concat returned
# fresh frames, so these assignments do not touch the per-rating frames.
rating_negative_df['rating'] = 0
rating_positive_df['rating'] = 1

### Run one of the following cells (balance across the five star ratings, or balance positive vs. negative)

In [8]:
# numValues = 55000
# df.drop(df.index, inplace=True)
# df = df.append(rating_1_df[:numValues])
# df = df.append(rating_2_df[:numValues])
# df = df.append(rating_3_df[:numValues])
# df = df.append(rating_4_df[:numValues])
# df = df.append(rating_5_df[:numValues])
# df = df.sample(frac=1)

In [9]:
# Build a class-balanced dataset: the same number of positive and negative
# reviews, limited by whichever class is smaller.
numValues = min(len(rating_positive_df), len(rating_negative_df))

# Draw the rows at random instead of taking a head slice: the positive frame is
# all 5-star rows followed by all 4-star rows, so `[:numValues]` would keep
# mostly (or only) 5-star reviews. When a class has exactly numValues rows,
# sampling simply returns all of them.
# pd.concat replaces the deprecated DataFrame.append (removed in pandas 2.0) and
# makes the in-place emptying of `df` unnecessary.
df = pd.concat([
    rating_positive_df.sample(n=numValues, random_state=42),
    rating_negative_df.sample(n=numValues, random_state=42),
])

# Interleave the two classes; seeded so later positional slices are reproducible.
df = df.sample(frac=1, random_state=42)

### Data splitting

In [10]:
# subset_split = 50000
# max_features = 10000
# train_X,test_X,train_y,test_y = train_test_split(df['reviews'][:subset_split], df['rating'][:subset_split], test_size=0.20, random_state=42)
# train_len,test_len = len(train_X),len(test_X)
# vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features,strip_accents='unicode', norm='l2')
# train_X = vectorizer.fit_transform(train_X).todense()
# test_X = vectorizer.transform(test_X).todense()

In [11]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

### Using the TensorFlow tutorial: https://www.tensorflow.org/tutorials/text/text_classification_rnn

In [12]:
# Row count of the dataset built above (478382 in the recorded run)
len(df)

478382

In [13]:
# Keep only the first `subset_split` rows, then hold out 20% of them for testing.
subset_split = 450_000
reviews_subset = df['reviews'].iloc[:subset_split]
labels_subset = df['rating'].iloc[:subset_split]
train_X, test_X, train_y, test_y = train_test_split(
    reviews_subset,
    labels_subset,
    test_size=0.20,
    random_state=42,
)

In [14]:
# Text-to-token-id layer limited to the VOCAB_SIZE most frequent tokens.
VOCAB_SIZE = 5_000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
# Learn the vocabulary from the TRAINING reviews only. Adapting on test_X (as
# before) leaks information from the held-out set into the model and builds the
# vocabulary from the smaller 20% split instead of the data the model trains on.
encoder.adapt(np.array(train_X))

In [15]:
# Bidirectional-LSTM sentiment classifier: raw review text -> P(positive).
ltsm_model = tf.keras.Sequential([
    # Maps each raw string to a sequence of integer token ids
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    # Sigmoid bounds the output to [0, 1] so it can be read as a probability
    # and thresholded at 0.5 when predicting.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# The targets are binary labels (0 = negative, 1 = positive), so train with
# binary cross-entropy instead of mean squared error on an unbounded linear
# output; 'accuracy' then reports binary accuracy at the 0.5 threshold.
ltsm_model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [15]:
# Train under the first GPU's device scope for a fixed number of epochs.
N_EPOCHS = 5
with tf.device('GPU:0'):
    ltsm_model.fit(x=train_X, y=train_y, epochs=N_EPOCHS)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
from keras.models import load_model

# Write the trained model to disk under the given path.
model_path = 'ltsm_model_5_epoch_better.tf'
ltsm_model.save(model_path)

INFO:tensorflow:Assets written to: ltsm_model_86.tf\assets
INFO:tensorflow:Assets written to: ltsm_model_86.tf\assets


In [None]:
# Score the held-out reviews and turn each model output into a hard 0/1 label.
pred = ltsm_model.predict(test_X)
# predict() yields one single-value row per review; flatten, then threshold at 0.5.
pred_0_1 = (np.asarray(pred).ravel() > .5).astype(int).tolist()
# Compare against every test label. The previous `test_y[:s_ubset]` raised a
# NameError because `s_ubset` is only defined in commented-out code; predictions
# cover all of test_X, so no slice is needed.
accuracy_score(test_y, pred_0_1)

In [None]:
# print(history.model.predict(["yeah"]))

In [23]:
# import keras
# reconstructed_model = keras.models.load_model("ltsm_model_86.tf")

In [47]:
# reconstructed_model.predict(["I love Apple and Android"])

array([[0.8549012]], dtype=float32)

### Using the Kaggle tutorial: https://www.kaggle.com/kevinautin/fully-convolutional-accuracy-94-4-15-min

In [None]:
# max_features = 8192
# maxlen = 128
# Embedding width; only referenced by the commented-out convolutional model below
embed_size = 64
# subset_split = 125000
# train_X,test_X,train_y,test_y = train_test_split(df['reviews'][:subset_split], df['rating'][:subset_split], test_size=0.20)
# tokenizer = Tokenizer(num_words=max_features)

# token_train = tokenizer.texts_to_sequences(train_X)
# token_test = tokenizer.texts_to_sequences(test_X)

# train_X = pad_sequences(token_train, maxlen=maxlen, padding='post')
# test_X = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [None]:
# input = Input(shape=(max_features,))
# net = Embedding(max_features, embed_size)(input)
# net = Dropout(0.2)(net)
# net = BatchNormalization()(net)

# net = Conv1D(32, 7, padding='same', activation='relu')(net)
# net = BatchNormalization()(net)
# net = Conv1D(32, 3, padding='same', activation='relu')(net)
# net = BatchNormalization()(net)
# net = Conv1D(32, 3, padding='same', activation='relu')(net)
# net = BatchNormalization()(net)
# net = Conv1D(32, 3, padding='same', activation='relu')(net)
# net1 = BatchNormalization()(net)

# net = Conv1D(1, 10000)(net)
# net = GlobalAveragePooling1D()(net)
# output = Activation('relu')(net)
# model = Model(inputs = input, outputs = output)
# model.compile(optimizer='adam', loss='mse', metrics=['acc'])
# model.summary()

In [None]:
# with tf.device('GPU:0'):
#     model.fit(train_X, train_y, batch_size=64, epochs=5, validation_split=0.1)

### Deep Learning Model

In [None]:
# import numpy as np
# import tensorflow as tf
# import keras
# from keras.models import Sequential
# from keras.layers.core import Dense, Dropout, Activation
# from keras.layers import Conv1D,MaxPooling1D,Flatten
# from keras.optimizers import Adadelta,Adam,RMSprop
# from keras.utils import np_utils

In [None]:
# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

In [None]:
# model = keras.Sequential()
# model.add(Dense(1024,activation="relu", input_dim=25000))
# model.add(Dropout(0.5))
# model.add(Dense(512, activation="relu"))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation="relu"))
# model.add(Dropout(0.5))
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.5))
# model.add(Dense(1,activation='relu'))
# model.compile(optimizer='adam',
#               loss='mse',
#               metrics=['accuracy'])

In [None]:
# # model.fit(train_X[:15000],Y_train[:15000], batch_size=64, epochs=10,verbose=1)
# with tf.device('GPU:0'):
#     model.fit(train_X,train_y, epochs=10)

In [None]:
# # accuracy_score(test_y[:1000],model.predict(test_X[:1000]))
# # s_ubset = 1000
# s_ubset = len(test_y)
# print(s_ubset)
# pred = model.predict(test_X[:s_ubset])
# # pred_t = model.predict(train_X[:100])
# test_y = test_y.reset_index(drop=True)
# # train_y = train_y.reset_index(drop=True)

# pred_0_1 = []
# for y in pred:
#     if y > .5:
#         pred_0_1.append(1)
#     else:
#         pred_0_1.append(0)
# accuracy_score(test_y[:s_ubset],pred_0_1)

# # pred = [for y in pred]
# # for i in range(99):
# #     print(pred[i][0],test_y[i])
# # for i in range(99):
# #     print(pred_t[i][0],train_y[i])

### SVM classifier

In [None]:
# # # clf = MultinomialNB().fit(train_X, train_y[:100000])
# subset_data = 1000
# from sklearn import svm
# clf = svm.SVC()
# clf.fit(train_X[:subset_data], train_y[:subset_data])

In [None]:
# titles_options = [("Confusion matrix, without normalization", None),
#                   ("Normalized confusion matrix", 'true')]
# for title, normalize in titles_options:
#     disp = plot_confusion_matrix(clf, test_X[:1000], test_y[:1000],
#                                  display_labels=np.arange(1,3),
#                                  cmap=plt.cm.Blues,
#                                  normalize=normalize)
#     disp.ax_.set_title(title)

#     print(title)
#     print(disp.confusion_matrix)
# plt.show()

In [None]:
# text = ["This is a product that is okay"]
# # # vectorizer = TfidfVectorizer()
# # # train_X = vectorizer.fit_transform(train_X[:10000])
# ya_yeet = vectorizer.transform(text).todense()
# # clf.predict(ya_yeet)
# model.predict(ya_yeet)