In [None]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-2.4.7'
# spark_version = 'spark-2.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()



Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:14 http://archive.ubuntu.com/ubuntu bionic-updates/multiverse amd64 Packages [34.0 kB]
Get:15 http

In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("YelpReview").getOrCreate()

In [None]:
from pyspark.sql.functions import col, udf,length, size
from pyspark.sql.types import StringType

### Data cleaning

In [None]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://usc-bootcamp-yelpreview-text-analysis.s3.us-east-2.amazonaws.com/reviews.csv"
spark.sparkContext.addFile(url)
raw_df = spark.read.csv(SparkFiles.get("reviews.csv"), sep=",", header=True)

# Show DataFrame
raw_df.show()

+--------------------+-------------+------------+
|             reviews|       rating|review_count|
+--------------------+-------------+------------+
|Panda Express was...|5 star rating|          63|
|The dude and I ca...|5 star rating|          63|
|I ordered 5 total...|1 star rating|          63|
|I always order Pa...|3 star rating|          63|
|Decided to try Pa...|5 star rating|          63|
|I've never had a ...|4 star rating|          63|
|The family meal d...|1 star rating|          63|
|Quality has sever...|2 star rating|          63|
|Paid for a bowl a...|1 star rating|          63|
|Order a bowl with...|1 star rating|          63|
|Went through the ...|1 star rating|          63|
|When I think of p...|2 star rating|          63|
|Horrible is a und...|1 star rating|          63|
|Yes the drive thr...|2 star rating|          63|
|Okay..so Panda is...|4 star rating|          63|
|Going through Dri...|2 star rating|          63|
|My entrees were a...|2 star rating|          63|


In [None]:
# new column function - reduce dimension of rating column into 3 categories
def rating_category(rating:str)->str:
  """create new column for label
  """
  if rating in ["1 star rating"]:
      return "bad"
  elif rating in ["2 star rating", "3 star rating"]:
      return "descent"
  else: 
      return "good"

assert rating_category("1 star rating")=="bad"


In [None]:
# Store a user defined function
convert_rating = udf(rating_category, StringType())
convert_rating

<function __main__.rating_category>

In [None]:
# add new column
selected_df = raw_df.withColumn("output_label", convert_rating(col("rating")))
selected_df = selected_df.withColumn("length", length(selected_df["reviews"]))
selected_df.show()

+--------------------+-------------+------------+------------+------+
|             reviews|       rating|review_count|output_label|length|
+--------------------+-------------+------------+------------+------+
|Panda Express was...|5 star rating|          63|        good|   334|
|The dude and I ca...|5 star rating|          63|        good|   770|
|I ordered 5 total...|1 star rating|          63|         bad|   151|
|I always order Pa...|3 star rating|          63|     descent|   628|
|Decided to try Pa...|5 star rating|          63|        good|   261|
|I've never had a ...|4 star rating|          63|        good|   640|
|The family meal d...|1 star rating|          63|         bad|   129|
|Quality has sever...|2 star rating|          63|     descent|   350|
|Paid for a bowl a...|1 star rating|          63|         bad|   158|
|Order a bowl with...|1 star rating|          63|         bad|   151|
|Went through the ...|1 star rating|          63|         bad|   675|
|When I think of p..

### Feature Transformation

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer


In [None]:
# create token and remove stop words in order to find out stopwords percentage
tokenizer = Tokenizer(inputCol="reviews", outputCol="token")
selected_df = tokenizer.transform(selected_df)
stop_word_remover = StopWordsRemover(inputCol="token", outputCol="filtered_token")
selected_df = stop_word_remover.transform(selected_df)
selected_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|
|I ordered 5 total...|1 star rating|          63|         bad|   151|[i, ordered, 5, t...|[ordered, 5, tota...|
|I always order Pa...|3 star rating|          63|     descent|   628|[i, always, order...|[always, order, p...|
|Decided to try Pa...|5 star rating|          63|        good|   261|[decided, to, try...|[decided, try, pa...|
|I've never had a ...|4 star rating|          63|        good|   640|[i've, never, had...|[never, bad, e

In [None]:
selected_df = selected_df.withColumn("stopwords_count", size(selected_df["filtered_token"]))
selected_df = selected_df.withColumn("stopwords_percent", selected_df["stopwords_count"]/selected_df["length"])
selected_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|             37|0.11077844311377245|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|             80| 0.1038961038961039|
|I ordered 5 total...|1 star rating|          63|         bad|   151|[i, ordered, 5, t...|[ordered, 5, tota...|             22| 0.1456953642384106|
|I always order Pa...|3 star rating|          63|     descent|   628|[i, always, order...|[always, order, p...| 

In [None]:
# create all features to the dataset
label_encoder = StringIndexer(inputCol="output_label", outputCol="label")
hasher = HashingTF(inputCol="filtered_token", outputCol="hashed_token")
idf = IDF(inputCol="hashed_token", outputCol="idf_token")


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
vectorizer = VectorAssembler(inputCols = ["idf_token", "length","stopwords_percent"], outputCol = "features")


### Create a Pipeline to Automate The Data Transformations

In [None]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[label_encoder, hasher, idf, vectorizer])


In [None]:
# fit and transform data with pipeline
pipeline_model = pipeline.fit(selected_df)
cleaned_df = pipeline_model.transform(selected_df)
cleaned_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|             37|0.11077844311377245|  1.0|(262144,[2711,610...|(262144,[2711,610...|(262146,[2711,610...|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|             80| 0.10

### Create training and testing dataset

In [None]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set and a testing set
training, testing = cleaned_df.randomSplit([0.7, 0.3], seed = 43)

### Fit and predict NaiveBaye model

In [None]:
# Create a Naive Bayes model and fit training data
model = NaiveBayes()
predictor = model.fit(training)


In [None]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|"""I never want t...|2 star rating|          29|     descent|  1410|["""i, never, wan...|["""i, never, wan...|            141|                0.1|  2.0|(262144,[14,4200,...|(262144,[14,4200

In [None]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.671084


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = test_results.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print(metrics.confusionMatrix())

DenseMatrix([[2067.,   46.,  286.],
             [ 219., 1115.,  259.],
             [ 702.,  179.,  462.]])


### Fit and predict with RandomForest Model

In [None]:
from pyspark.ml.classification import RandomForestClassifier


In [None]:
# create randomforest model and fit into training dataset
rf_model = RandomForestClassifier()
rf_predictor = rf_model.fit(training)


In [None]:
# transform the model with testing data 
# Tranform the model with the testing data
test_results = rf_predictor.transform(testing)
test_results.show(5)



+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|"""I never want t...|2 star rating|          29|     descent|  1410|["""i, never, wan...|["""i, never, wan...|            141|                0.1|  2.0|(262144,[14,4200,...|(262144,[14,4200

In [None]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
rf_acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % rf_acc)

Accuracy of model at predicting reviews was: 0.280607


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = test_results.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print(metrics.confusionMatrix())

ModuleNotFoundError: ignored

### Sentimental Analysis with RNN

In [None]:
import pandas as pd
import numpy as np

In [None]:
reviews = pd.read_csv(url)
reviews = reviews.iloc[:,0:2]
reviews.head()

Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,5 star rating
1,The dude and I came to this Panda Express arou...,5 star rating
2,I ordered 5 total plates fried rice chow mai...,1 star rating
3,I always order Panda Express from here and the...,3 star rating
4,Decided to try Panda Expess one more time.Corp...,5 star rating


### Clean up data, Changing rating column

In [None]:
# use defined function above to re-create rating column
reviews["rating"] = reviews["rating"].apply(rating_category)
reviews.head()

Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,good
1,The dude and I came to this Panda Express arou...,good
2,I ordered 5 total plates fried rice chow mai...,bad
3,I always order Panda Express from here and the...,descent
4,Decided to try Panda Expess one more time.Corp...,good


### Feature Engineer

In [None]:
# create word count, and character count
reviews["word_list"] = reviews["reviews"].apply(lambda x: x.split())
reviews["word_count"] = reviews["word_list"].apply(lambda x: len(x))
reviews["char_count"] = reviews["reviews"].apply(lambda x: len(x))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261


In [None]:
# look at average length of each word in each review
def average_word_length(word_list)->int:
    """calculate the average word length in each review
    """
    word_length = []
    for word in word_list: 
        word_length.append(len(word))
    return np.mean(word_length)

assert average_word_length(["test", "test12"])==5


In [None]:
reviews["average_word_length"] = reviews["word_list"].apply(average_word_length)
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# remove stop words, and get stopwords percentage
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

reviews["stop_word_count"] = reviews["word_list"].apply(lambda x: len([word for word in x if word.lower() in stop_words]))
reviews["stop_word_percent"] = reviews["stop_word_count"]/reviews["word_count"]
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122


### Data Cleaning

In [None]:
# lower case all the reviews, and remove all the punctuations
reviews["lowercase_reviews"] = reviews["reviews"].apply(lambda x: " ".join(word.lower() for word in x.split()))
reviews["lowercase_reviews"] = reviews["lowercase_reviews"].str.replace('[^\w\s]',"")
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...


In [None]:
# remove stop words
reviews["clean_reviews"] = reviews["lowercase_reviews"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews,clean_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...,panda express point tonight ordered mobile app...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...,dude came panda express around end july 2020 h...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...,ordered 5 total plates fried rice chow main do...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...,always order panda express problem ever place ...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...,decided try panda expess one timecorporations ...


In [None]:
# find out the frequency of words, and remove nonsense words
pd.Series(" ".join(reviews["clean_reviews"]).split()).value_counts()[:60]

food          18670
panda         10282
chicken        9602
order          8020
time           6692
express        6561
get            6174
like           5817
service        5807
one            5682
good           5472
location       5390
rice           4913
go             4619
place          4271
got            4220
back           4102
always         4042
dont           3947
would          3922
orange         3713
ordered        3615
even           3272
minutes        3202
wait           3097
never          3039
customer       3030
didnt          2999
im             2988
drive          2867
line           2754
really         2722
people         2719
went           2719
said           2699
fresh          2668
asked          2665
great          2653
beef           2649
staff          2556
chinese        2548
give           2488
fast           2478
ive            2418
chow           2246
also           2226
told           2181
eat            2169
come           2151
mein           2124


In [None]:
other_stop_words = ["food", "panda", "got", "im","ive","come", "restaurant", "express"]
reviews["clean_reviews"] = reviews["clean_reviews"].apply(lambda x: " ".join(word for word in x.split() if word not in other_stop_words))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews,clean_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...,point tonight ordered mobile app picked order ...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...,dude came around end july 2020 havent use goin...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...,ordered 5 total plates fried rice chow main do...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...,always order problem ever place order plate sm...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...,decided try expess one timecorporations sent c...


In [None]:
results = set()
reviews["clean_reviews"].str.split().apply(results.update)
len(results)


29656

In [None]:
# get final dataset
final_dataset = reviews.iloc[:,[1,3,4,5,7,9]]
final_dataset.head()

Unnamed: 0,rating,word_count,char_count,average_word_length,stop_word_percent,clean_reviews
0,good,63,334,4.253968,0.444444,point tonight ordered mobile app picked order ...
1,good,149,770,4.167785,0.463087,dude came around end july 2020 havent use goin...
2,bad,28,151,4.357143,0.285714,ordered 5 total plates fried rice chow main do...
3,descent,122,628,4.155738,0.483607,always order problem ever place order plate sm...
4,good,41,261,5.390244,0.195122,decided try expess one timecorporations sent c...


In [None]:
temp_dataset = final_dataset[["clean_reviews","rating"]]
temp_dataset.head()

Unnamed: 0,clean_reviews,rating
0,point tonight ordered mobile app picked order ...,good
1,dude came around end july 2020 havent use goin...,good
2,ordered 5 total plates fried rice chow main do...,bad
3,always order problem ever place order plate sm...,descent
4,decided try expess one timecorporations sent c...,good


In [None]:
data_size = len(temp_dataset["clean_reviews"])
data_size

17620

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

label_encoder = LabelEncoder()
label_encoder.fit(temp_dataset["rating"])
encoded_y = label_encoder.transform(temp_dataset["rating"])
y_categorical = to_categorical(encoded_y)

y_categorical

In [None]:
import tensorflow as tf
tf_dataset = tf.data.Dataset.from_tensor_slices(
    (
        tf.cast(temp_dataset["clean_reviews"].values,tf.string), 
        tf.cast(y_categorical, tf.float64), 
    )
)



In [None]:
for X_batch, y_batch in tf_dataset.batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label)
        print()

Review: point tonight ordered mobile app picked order pick time ready time double checked order correct gave us extra utensils sauce hot fresh superb customer service well best experience ...
Label: [0. 0. 1.]

Review: dude came around end july 2020 havent use going inside lot drive thrus surprised stumbled upon one one usually go closed nightwe went drive thru quick process plate super greens along honey walnut shr ...
Label: [0. 0. 1.]



In [None]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 1000)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [None]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 72), dtype=string, numpy=
 array([[b'point', b'tonight', b'ordered', b'mobile', b'app', b'picked',
         b'order', b'pick', b'time', b'ready', b'time', b'double',
         b'checked', b'order', b'correct', b'gave', b'us', b'extra',
         b'utensils', b'sauce', b'hot', b'fresh', b'superb', b'customer',
         b'service', b'well', b'best', b'experience', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'dude', b'came', b'around', b'end', b'july', b'havent', b'use',
         b'going', b'inside', b'lot'

In [None]:
from collections import Counter

vocabulary = Counter()
for X_batch, y_batch in tf_dataset.batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [None]:
len(vocabulary)

23083

In [None]:
# vocab_size can be a parameter to be tunned
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]


In [None]:
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"I like orange chicken patchouli".split():
    print(word_to_id.get(word) or vocab_size)

10000
9
15
1
10000


In [None]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)


In [None]:
train_size = int(.7*data_size)
test_size = int(.3*data_size)

In [None]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

train_set = tf_dataset.take(train_size).repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)



In [None]:
test_set = tf_dataset.skip(train_size).repeat().batch(32).map(preprocess)
test_set = test_set.map(encode_words).prefetch(1)


In [None]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)
    

tf.Tensor(
[[ 382  261   14 ...    0    0    0]
 [1920   58   91 ...    1  456  783]
 [  14  565  290 ...    0    0    0]
 ...
 [  24  990  353 ...    0    0    0]
 [  17  193  931 ...    0    0    0]
 [ 129   32   13 ...    0    0    0]], shape=(32, 49), dtype=int64)
tf.Tensor(
[[0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]], shape=(32, 3), dtype=float64)


In [None]:
train_size = int(.7*data_size)
test_size = int(.3*data_size)

train_dataset = tf_dataset.take(train_size)
test_dataset = tf_dataset.skip(train_size)


In [None]:
train_size

12334

In [None]:
train_dataset

<TakeDataset shapes: ((), (3,)), types: (tf.string, tf.float64)>

In [None]:
embed_size = 128
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.LSTM(256),
    tf.keras.layers.Dense(3, activation="softmax")
])



In [None]:
adam = optimizers.Adam(lr = 0.001)


In [None]:
model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"])


In [None]:
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5, validation_data= test_set, validation_steps=test_size//32)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Tokenize clean_reviews

In [None]:
# tokenize the data
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

# tokenize
tokenizer = Tokenizer(num_words = 29656, lower= True)
tokenizer.fit_on_texts(final_dataset["clean_reviews"])
final_dataset["clean_reviews"] = tokenizer.texts_to_sequences(final_dataset["clean_reviews"])

vocab_size = len(tokenizer.word_index)+1
final_dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,rating,word_count,char_count,average_word_length,stop_word_percent,clean_reviews
0,good,63,334,4.253968,0.444444,"[315, 330, 18, 1219, 468, 570, 2, 133, 3, 90, ..."
1,good,149,770,4.167785,0.463087,"[1731, 64, 97, 430, 2079, 2284, 510, 290, 45, ..."
2,bad,28,151,4.357143,0.285714,"[18, 116, 555, 314, 51, 10, 39, 427, 629, 17, ..."
3,descent,122,628,4.155738,0.483607,"[14, 2, 284, 72, 12, 2, 58, 127, 91, 101, 121,..."
4,good,41,261,5.390244,0.195122,"[219, 122, 6374, 7, 12393, 992, 2805, 94, 212,..."


In [None]:
final_dataset["clean_reviews"]

### Creating Training and Testing dataset

In [None]:
# separate train, test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, X_aux_train, X_aux_test, y_train, y_test = train_test_split(final_dataset["clean_reviews"], final_dataset["stop_word_percent"], final_dataset["rating"], test_size = 0.3, random_state = 42)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


Shape of X_train: (12334,)
Shape of X_test: (5286,)
Shape of y_train: (12334,)
Shape of y_test: (5286,)


In [None]:
X_aux_test.shape

(5286,)

### Scale the data and create one-hot-encoding for rating column

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# create padding
from keras_preprocessing.sequence import pad_sequences
maxlen = max(final_dataset["clean_reviews"].apply(lambda x: len(x)))

X_train = pad_sequences(X_train, padding="post", maxlen=maxlen)
X_test = pad_sequences(X_test, padding="post", maxlen=maxlen)


In [None]:
from keras.layers import Dense, Dropout, Activation, SimpleRNN, LSTM, Embedding, GRU

In [None]:
maxlen

423

In [None]:
# reshape the data
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, SimpleRNN, LSTM
# from keras import optimizers
# from keras.preprocessing.sequence import pad_sequences

# X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1],1))
# X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1],1))
# X_aux_train = np.array(X_aux_train).reshape((X_aux_train.shape[0],1))
# X_aux_test = np.array(X_aux_test).reshape((X_aux_test.shape[0],1))


# print(X_train.shape)
# print(X_test.shape)
# print(X_aux_train.shape)


(12334, 303, 1)
(5286, 303, 1)
(12334, 1)


In [None]:
# reshape the data
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, SimpleRNN, LSTM
from keras import optimizers
from keras.preprocessing.sequence import pad_sequences

X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1]))
X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1]))
X_aux_train = np.array(X_aux_train).reshape((X_aux_train.shape[0]))
X_aux_test = np.array(X_aux_test).reshape((X_aux_test.shape[0]))


print(X_train.shape)
print(X_test.shape)
print(X_aux_train.shape)

(12334, 423)
(5286, 423)
(12334,)


### Fit the RNN model

In [None]:
num_classes = 3

def rnn_model():
  model = Sequential(name = "RNN_Model")
  model.add(Embedding(tokenizer.num_words, 128, input_shape = [None]))
  model.add(LSTM(128, return_sequences=True))
  model.add(LSTM(128, return_sequences=False))
  model.add(Dense(num_classes, activation='softmax', name = "output"))
  model.summary()

  adam = optimizers.Adam(lr = 0.001)
  model.compile(loss = "categorical_crossentropy", optimizer = adam, metrics = ["accuracy"])
  return model

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn = rnn_model, epochs = 20, batch_size = 16)


In [None]:
model.fit(X_train, y_train_categorical, validation_data = (X_test, y_test_categorical))

Model: "RNN_Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, None, 128)         3795968   
_________________________________________________________________
lstm_23 (LSTM)               (None, None, 128)         131584    
_________________________________________________________________
lstm_24 (LSTM)               (None, 128)               131584    
_________________________________________________________________
output (Dense)               (None, 3)                 387       
Total params: 4,059,523
Trainable params: 4,059,523
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20

KeyboardInterrupt: ignored

In [None]:
def read_data(url):
    """read in data from a url
    """
    