In [None]:
import os
import re
import json
import gzip
import nltk
import pydot
import shutil
import string
import sklearn
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib as mpl
from google.colab import files
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
# from keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Electronics.json.gz     
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/AMAZON_FASHION.json.gz
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Appliances.json.gz
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Clothing_Shoes_and_Jewelry.json.gz
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Home_and_Kitchen.json.gz

In [None]:
# Defining functions
def clean_str(string):
    """Normalise raw text into lower-case, space-separated tokens.

    Every character other than ASCII letters, digits, parentheses, comma,
    exclamation mark, question mark, apostrophe and backtick becomes a space.
    The contraction suffixes 's, 've, n't, 're, 'd and 'll are split off the
    preceding word, and comma, exclamation mark, parentheses and question mark
    are padded with spaces so that each one becomes its own token.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text, stripped and lower-cased, with runs of whitespace
        collapsed to a single space.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    # Contractions are matched case-sensitively because lower-casing happens last.
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacements are plain punctuation: a backslash in the replacement
    # text would be copied into the output as a literal character.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def chkList(lst):
    """Return True when ``lst`` holds exactly one distinct value.

    An empty ``lst`` has no distinct values and therefore yields False.
    """
    distinct_values = set(lst)
    return len(distinct_values) == 1

def plot_graphs(history, string, plotname='plot.png'):
    """Plot a metric and its ``val_`` counterpart, save the figure, then show it.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            sequences of values.
        string: Metric name; ``history.history`` is also read under the key
            ``'val_' + string``.
        plotname: Path the figure is written to before it is displayed.
    """
    validation_key = 'val_' + string
    for series_key in (string, validation_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    # Save before show(): the file is written while the figure is still current.
    plt.savefig(plotname)
    plt.show()

In [None]:
# Run configuration shared by the cells below.

# Training
learning_rate = 1e-3   # passed to the Adam optimizer
epochs = 25
BATCH_SIZE = 256       # batch size applied to the train/validation/test datasets
BUFFER_SIZE = 800      # shuffle buffer size for the training dataset

# Data selection
each_category = 1000       # reviews kept per star rating, per data file
word_count_threshold = 5   # minimum corpus frequency for a word to be kept

# Declared here but not referenced by name in the visible cells.
dropout = 0.4
max_features = 10000000
embedding_dim = 16
sequence_length = 100
batch_size = 256

# Base name of the saved model.
model_name = 'CommentAnalysis_RNN7'

In [None]:
# Collect a rating-balanced sample of reviews from every gzipped data file in
# the working directory.
comments = []
ratings = []
# Sorted so that the order of the collected samples (and therefore the seeded
# train/test split made from them) does not depend on the arbitrary order in
# which os.listdir returns names.
data_files = sorted(os.listdir(os.getcwd()))
for data in data_files:
    if not data.endswith('.gz'):
        continue
    print("Extracting data from file: " + data)
    # Index r holds the number of reviews kept so far with rating r. Slot 0 is
    # never incremented and is pre-filled with the cap, so that chkList() sees
    # identical entries once every rating has reached the cap.
    data_from_each_category = [each_category, 0, 0, 0, 0, 0]
    with gzip.open(data) as f:
        for line in f:
            review = json.loads(line.strip())
            if 'overall' in review and 'reviewText' in review:
                rating = int(review['overall'])
                # Ratings that do not map to a counter slot are skipped instead
                # of raising IndexError or wrapping round via a negative index.
                in_range = 0 < rating < len(data_from_each_category)
                if in_range and data_from_each_category[rating] < each_category:
                    ratings.append(rating)
                    # Collapse every run of whitespace to a single space.
                    comments.append(' '.join(review['reviewText'].split()))
                    data_from_each_category[rating] += 1
            # Stop reading this file once every rating has reached the cap.
            if chkList(data_from_each_category) and data_from_each_category[1] == each_category:
                break
        print('Data of each class from  data file ' + data + ':: ' + str(data_from_each_category))

In [None]:
# One-hot encode the integer ratings. to_categorical is the function imported
# from tensorflow.keras.utils in the import cell; the standalone `keras`
# package is not imported here, so the same Keras implementation is used
# throughout the notebook.
# NOTE: this rebinds `ratings`; re-run the extraction cell before re-running
# this one, otherwise the already-encoded array is encoded again.
ratings = to_categorical(ratings)

In [None]:
# Fetch the NLTK stop-word corpus and keep the English entries as a set,
# which is what the membership test in the cleaning loop uses.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
# Count how often each cleaned token occurs across all comments.
# Counter is a dict subclass, so len(word_freq) and word_freq[word] keep
# working as before (a missing word now reads as 0 instead of raising).
word_freq = collections.Counter()
for comment in comments:
    word_freq.update(clean_str(comment).split())

In [None]:
# Rebuild every comment from the tokens that are neither stop words nor rarer
# than word_count_threshold in the corpus.
clean_comments = []
for raw_comment in comments:
    kept_tokens = [
        token
        for token in clean_str(raw_comment).split()
        if token not in stop_words and word_freq[token] >= word_count_threshold
    ]
    clean_comments.append(' '.join(kept_tokens).strip())

In [None]:
# Report how many labels and how many cleaned comments were collected.
print('Size of ratings list: ', len(ratings), sep='')
print('Size of comments list: ', len(clean_comments), sep='')

In [None]:
# Fit a Keras Tokenizer on the cleaned comments and print one encoded sample.
# The vocabulary cap is the number of distinct tokens counted in word_freq.
num_words = len(word_freq)
tokenizer = Tokenizer(oov_token="unk", num_words=num_words)
tokenizer.fit_on_texts(clean_comments)
sample_sequences = tokenizer.texts_to_sequences(['daughter thought good read'])
print('Example of tokenizer: ' + str(sample_sequences))

In [None]:

# Keep 60% of the samples for training and hold out the remaining 40%,
# stratified on the labels.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    clean_comments,
    ratings,
    test_size=0.40,
    stratify=ratings,
    random_state=0,
)

# Split the held-out samples evenly into test and validation sets.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout,
    y_holdout,
    stratify=y_holdout,
    test_size=0.50,
    random_state=0,
)


In [None]:
# Wrap the training texts and labels in NumPy arrays. The texts stay raw
# strings here; no tokenisation or padding is done in this cell.
train_labels = np.asarray(y_train)
x_train = np.array(X_train)

In [None]:
# Wrap the validation texts and labels in NumPy arrays. The texts stay raw
# strings here; no tokenisation or padding is done in this cell.
valid_labels = np.asarray(y_valid)
x_valid = np.array(X_valid)

In [None]:
# Wrap the test texts and labels in NumPy arrays. The texts stay raw
# strings here; no tokenisation or padding is done in this cell.
test_labels = np.asarray(y_test)
x_test = np.array(X_test)

In [None]:
# Turn each (texts, labels) pair into a tf.data.Dataset of individual samples.
slices_to_dataset = tf.data.Dataset.from_tensor_slices
train_dataset = slices_to_dataset((x_train, train_labels))
valid_dataset = slices_to_dataset((x_valid, valid_labels))
test_dataset = slices_to_dataset((x_test, test_labels))

In [None]:
# Batch and prefetch all three datasets; only the training set is shuffled.
prefetch_buffer = tf.data.AUTOTUNE
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(prefetch_buffer)
)
valid_dataset = valid_dataset.batch(BATCH_SIZE).prefetch(prefetch_buffer)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(prefetch_buffer)

In [None]:
# Show the first three texts and labels of one training batch. `example`
# stays bound after the loop and is reused by a later cell.
for example, label in train_dataset.take(1):
    first_texts = example.numpy()[:3]
    print('texts: ', first_texts)
    print()
    first_labels = label.numpy()[:3]
    print('labels: ', first_labels)

In [None]:
# Build the text-to-integer encoder and learn its vocabulary from the
# training texts only (the labels are dropped by the map).
VOCAB_SIZE = len(word_freq)
# Prefer the stable tf.keras.layers.TextVectorization; fall back to the
# experimental alias on TensorFlow versions that only ship that one.
TextVectorization = getattr(tf.keras.layers, 'TextVectorization', None)
if TextVectorization is None:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization
# Every text is encoded to exactly 140 token ids.
encoder = TextVectorization(max_tokens=VOCAB_SIZE, output_sequence_length=140)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [None]:
# Show the first 20 entries of the vocabulary held by the encoder.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)
vocab[:20]

In [None]:
# Encode the `example` batch bound by the earlier preview loop and show the
# first three encoded rows.
encoded_batch = encoder(example)
encoded_example = encoded_batch[:3].numpy()
encoded_example

In [None]:
# Classifier: encoder -> embedding -> two stacked bidirectional LSTMs ->
# dense head with a 6-way softmax output.
keras_layers = tf.keras.layers
model = tf.keras.Sequential()
model.add(encoder)
# mask_zero=True makes the downstream layers treat token id 0 as padding.
model.add(keras_layers.Embedding(len(encoder.get_vocabulary()), 150, mask_zero=True))
# The first LSTM returns the full sequence so that the second LSTM can consume it.
model.add(keras_layers.Bidirectional(
    keras_layers.LSTM(120, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(keras_layers.Bidirectional(
    keras_layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
model.add(keras_layers.Dense(84, activation='relu'))
model.add(keras_layers.Dropout(0.5))
model.add(keras_layers.Dense(6, activation='softmax', use_bias=True))

In [None]:
# Train with Adam at the configured learning rate, categorical cross-entropy
# loss, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Validate on the dedicated validation split so that the test split stays
# unseen until the final evaluation. No validation_steps is given, so every
# validation batch is used each epoch regardless of the dataset size.
history = model.fit(train_dataset, epochs=epochs, validation_data=valid_dataset)

In [None]:
# Evaluate on the test split. test_dataset is already batched, so no
# batch_size argument is passed (it must not be set for dataset inputs).
test_results = model.evaluate(test_dataset)
print(test_results)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Display the metric values recorded by model.fit(), keyed by metric name.
history.history

In [None]:
# Plot training vs. validation curves for each metric; one image file per metric.
for metric_name, image_path in (("accuracy", "accuracy.jpg"), ("loss", "loss.jpg")):
    plot_graphs(history, metric_name, image_path)

In [None]:
# Predict on the test split and keep, for every sample, the index of the
# highest score as the predicted class.
pred_test_label = []
labels = [0, 1, 2, 3, 4, 5]
test_pred = model.predict(test_dataset)
pred_test_label.extend(np.argmax(scores) for scores in test_pred)

In [None]:
# Save the trained model for later reuse, named "<model_name>.tf".
# NOTE(review): which file extensions model.save() accepts depends on the
# Keras version — confirm ".tf" is supported in the target environment.
save_path = model_name + '.tf'
model.save(save_path)