In [19]:
import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-2.4.7'
# spark_version = 'spark-2.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()



0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [Waiting for headers] [Wa                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [4 InRele

In [20]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("YelpReview").getOrCreate()

In [21]:
from pyspark.sql.functions import col, udf,length, size
from pyspark.sql.types import StringType

### Data cleaning

In [22]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://usc-bootcamp-yelpreview-text-analysis.s3.us-east-2.amazonaws.com/reviews.csv"
spark.sparkContext.addFile(url)
raw_df = spark.read.csv(SparkFiles.get("reviews.csv"), sep=",", header=True)

# Show DataFrame
raw_df.show()

+--------------------+-------------+------------+
|             reviews|       rating|review_count|
+--------------------+-------------+------------+
|Panda Express was...|5 star rating|          63|
|The dude and I ca...|5 star rating|          63|
|I ordered 5 total...|1 star rating|          63|
|I always order Pa...|3 star rating|          63|
|Decided to try Pa...|5 star rating|          63|
|I've never had a ...|4 star rating|          63|
|The family meal d...|1 star rating|          63|
|Quality has sever...|2 star rating|          63|
|Paid for a bowl a...|1 star rating|          63|
|Order a bowl with...|1 star rating|          63|
|Went through the ...|1 star rating|          63|
|When I think of p...|2 star rating|          63|
|Horrible is a und...|1 star rating|          63|
|Yes the drive thr...|2 star rating|          63|
|Okay..so Panda is...|4 star rating|          63|
|Going through Dri...|2 star rating|          63|
|My entrees were a...|2 star rating|          63|


In [23]:
# new column function - reduce dimension of rating column into 3 categories
def rating_category(rating:str)->str:
  """create new column for label
  """
  if rating in ["1 star rating"]:
      return "bad"
  elif rating in ["2 star rating", "3 star rating"]:
      return "descent"
  else: 
      return "good"

assert rating_category("1 star rating")=="bad"


In [24]:
# Store a user defined function
convert_rating = udf(rating_category, StringType())
convert_rating

<function __main__.rating_category>

In [25]:
# add new column
selected_df = raw_df.withColumn("output_label", convert_rating(col("rating")))
selected_df = selected_df.withColumn("length", length(selected_df["reviews"]))
selected_df.show()

+--------------------+-------------+------------+------------+------+
|             reviews|       rating|review_count|output_label|length|
+--------------------+-------------+------------+------------+------+
|Panda Express was...|5 star rating|          63|        good|   334|
|The dude and I ca...|5 star rating|          63|        good|   770|
|I ordered 5 total...|1 star rating|          63|         bad|   151|
|I always order Pa...|3 star rating|          63|     descent|   628|
|Decided to try Pa...|5 star rating|          63|        good|   261|
|I've never had a ...|4 star rating|          63|        good|   640|
|The family meal d...|1 star rating|          63|         bad|   129|
|Quality has sever...|2 star rating|          63|     descent|   350|
|Paid for a bowl a...|1 star rating|          63|         bad|   158|
|Order a bowl with...|1 star rating|          63|         bad|   151|
|Went through the ...|1 star rating|          63|         bad|   675|
|When I think of p..

### Feature Transformation

In [26]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer


In [27]:
# create token and remove stop words in order to find out stopwords percentage
tokenizer = Tokenizer(inputCol="reviews", outputCol="token")
selected_df = tokenizer.transform(selected_df)
stop_word_remover = StopWordsRemover(inputCol="token", outputCol="filtered_token")
selected_df = stop_word_remover.transform(selected_df)
selected_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|
|I ordered 5 total...|1 star rating|          63|         bad|   151|[i, ordered, 5, t...|[ordered, 5, tota...|
|I always order Pa...|3 star rating|          63|     descent|   628|[i, always, order...|[always, order, p...|
|Decided to try Pa...|5 star rating|          63|        good|   261|[decided, to, try...|[decided, try, pa...|
|I've never had a ...|4 star rating|          63|        good|   640|[i've, never, had...|[never, bad, e

In [28]:
selected_df = selected_df.withColumn("stopwords_count", size(selected_df["filtered_token"]))
selected_df = selected_df.withColumn("stopwords_percent", selected_df["stopwords_count"]/selected_df["length"])
selected_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|             37|0.11077844311377245|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|             80| 0.1038961038961039|
|I ordered 5 total...|1 star rating|          63|         bad|   151|[i, ordered, 5, t...|[ordered, 5, tota...|             22| 0.1456953642384106|
|I always order Pa...|3 star rating|          63|     descent|   628|[i, always, order...|[always, order, p...| 

In [29]:
# create all features to the dataset
label_encoder = StringIndexer(inputCol="output_label", outputCol="label")
hasher = HashingTF(inputCol="filtered_token", outputCol="hashed_token")
idf = IDF(inputCol="hashed_token", outputCol="idf_token")


In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
vectorizer = VectorAssembler(inputCols = ["idf_token", "length","stopwords_percent"], outputCol = "features")


### Create a Pipeline to Automate The Data Transformations

In [13]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[label_encoder, hasher, idf, vectorizer])


In [14]:
# fit and transform data with pipeline
pipeline_model = pipeline.fit(selected_df)
cleaned_df = pipeline_model.transform(selected_df)
cleaned_df.show()

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+
|Panda Express was...|5 star rating|          63|        good|   334|[panda, express, ...|[panda, express, ...|             37|0.11077844311377245|  1.0|(262144,[2711,610...|(262144,[2711,610...|(262146,[2711,610...|
|The dude and I ca...|5 star rating|          63|        good|   770|[the, dude, and, ...|[dude, came, pand...|             80| 0.10

### Create training and testing dataset

In [15]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set and a testing set
training, testing = cleaned_df.randomSplit([0.7, 0.3], seed = 43)

### Fit and predict NaiveBaye model

In [16]:
# Create a Naive Bayes model and fit training data
model = NaiveBayes()
predictor = model.fit(training)


In [17]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|"""I never want t...|2 star rating|          29|     descent|  1410|["""i, never, wan...|["""i, never, wan...|            141|                0.1|  2.0|(262144,[14,4200,...|(262144,[14,4200

In [18]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.671084


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = test_results.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print(metrics.confusionMatrix())

DenseMatrix([[2067.,   46.,  286.],
             [ 219., 1115.,  259.],
             [ 702.,  179.,  462.]])


### Fit and predict with RandomForest Model

In [19]:
from pyspark.ml.classification import RandomForestClassifier


In [20]:
# create randomforest model and fit into training dataset
rf_model = RandomForestClassifier()
rf_predictor = rf_model.fit(training)


In [21]:
# transform the model with testing data 
# Tranform the model with the testing data
test_results = rf_predictor.transform(testing)
test_results.show(5)



+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|             reviews|       rating|review_count|output_label|length|               token|      filtered_token|stopwords_count|  stopwords_percent|label|        hashed_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+-------------+------------+------------+------+--------------------+--------------------+---------------+-------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|"""I never want t...|2 star rating|          29|     descent|  1410|["""i, never, wan...|["""i, never, wan...|            141|                0.1|  2.0|(262144,[14,4200,...|(262144,[14,4200

In [22]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
rf_acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % rf_acc)

Accuracy of model at predicting reviews was: 0.280607


In [1]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = test_results.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print(metrics.confusionMatrix())

ModuleNotFoundError: ignored

### Sentimental Analysis with RNN

In [57]:
import pandas as pd
import numpy as np

In [88]:
reviews = pd.read_csv(url)
reviews = reviews.iloc[:,0:2]
reviews.head()

Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,5 star rating
1,The dude and I came to this Panda Express arou...,5 star rating
2,I ordered 5 total plates fried rice chow mai...,1 star rating
3,I always order Panda Express from here and the...,3 star rating
4,Decided to try Panda Expess one more time.Corp...,5 star rating


### Clean up data, Changing rating column

In [89]:
# use defined function above to re-create rating column
reviews["rating"] = reviews["rating"].apply(rating_category)
reviews.head()

Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,good
1,The dude and I came to this Panda Express arou...,good
2,I ordered 5 total plates fried rice chow mai...,bad
3,I always order Panda Express from here and the...,descent
4,Decided to try Panda Expess one more time.Corp...,good


### Feature Engineer

In [90]:
# create word count, and character count
reviews["word_list"] = reviews["reviews"].apply(lambda x: x.split())
reviews["word_count"] = reviews["word_list"].apply(lambda x: len(x))
reviews["char_count"] = reviews["reviews"].apply(lambda x: len(x))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261


In [91]:
# look at average length of each word in each review
def average_word_length(word_list)->int:
    """calculate the average word length in each review
    """
    word_length = []
    for word in word_list: 
        word_length.append(len(word))
    return np.mean(word_length)

assert average_word_length(["test", "test12"])==5


In [92]:
reviews["average_word_length"] = reviews["word_list"].apply(average_word_length)
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244


In [93]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
# remove stop words, and get stopwords percentage
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

reviews["stop_word_count"] = reviews["word_list"].apply(lambda x: len([word for word in x if word.lower() in stop_words]))
reviews["stop_word_percent"] = reviews["stop_word_count"]/reviews["word_count"]
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122


### Data Cleaning

In [95]:
# lower case all the reviews, and remove all the punctuations
reviews["lowercase_reviews"] = reviews["reviews"].apply(lambda x: " ".join(word.lower() for word in x.split()))
reviews["lowercase_reviews"] = reviews["lowercase_reviews"].str.replace('[^\w\s]',"")
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...


In [96]:
# remove stop words
reviews["clean_reviews"] = reviews["lowercase_reviews"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews,clean_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...,panda express point tonight ordered mobile app...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...,dude came panda express around end july 2020 h...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...,ordered 5 total plates fried rice chow main do...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...,always order panda express problem ever place ...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...,decided try panda expess one timecorporations ...


In [97]:
# find out the frequency of words, and remove nonsense words
pd.Series(" ".join(reviews["clean_reviews"]).split()).value_counts()[:60]

food          18670
panda         10282
chicken        9602
order          8020
time           6692
express        6561
get            6174
like           5817
service        5807
one            5682
good           5472
location       5390
rice           4913
go             4619
place          4271
got            4220
back           4102
always         4042
dont           3947
would          3922
orange         3713
ordered        3615
even           3272
minutes        3202
wait           3097
never          3039
customer       3030
didnt          2999
im             2988
drive          2867
line           2754
really         2722
went           2719
people         2719
said           2699
fresh          2668
asked          2665
great          2653
beef           2649
staff          2556
chinese        2548
give           2488
fast           2478
ive            2418
chow           2246
also           2226
told           2181
eat            2169
come           2151
mein           2124


In [98]:
other_stop_words = ["food", "panda", "got", "im","ive","come", "restaurant", "express"]
reviews["clean_reviews"] = reviews["clean_reviews"].apply(lambda x: " ".join(word for word in x.split() if word not in other_stop_words))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length,stop_word_count,stop_word_percent,lowercase_reviews,clean_reviews
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968,28,0.444444,panda express was on point tonight i ordered t...,point tonight ordered mobile app picked order ...
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785,69,0.463087,the dude and i came to this panda express arou...,dude came around end july 2020 havent use goin...
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143,8,0.285714,i ordered 5 total plates fried rice chow main ...,ordered 5 total plates fried rice chow main do...
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738,59,0.483607,i always order panda express from here and the...,always order problem ever place order plate sm...
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244,8,0.195122,decided to try panda expess one more timecorpo...,decided try expess one timecorporations sent c...


In [110]:
# get final dataset
final_dataset = reviews.iloc[:,[1,3,4,5,7,9]]
final_dataset.head()

Unnamed: 0,rating,word_count,char_count,average_word_length,stop_word_percent,clean_reviews
0,good,63,334,4.253968,0.444444,point tonight ordered mobile app picked order ...
1,good,149,770,4.167785,0.463087,dude came around end july 2020 havent use goin...
2,bad,28,151,4.357143,0.285714,ordered 5 total plates fried rice chow main do...
3,descent,122,628,4.155738,0.483607,always order problem ever place order plate sm...
4,good,41,261,5.390244,0.195122,decided try expess one timecorporations sent c...


### Creating Training and Testing dataset

In [116]:
# separate train, test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_dataset["clean_reviews"], final_dataset["rating"], test_size = 0.3, random_state = 42)
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


Shape of X_train: (12334,)
Shape of X_test: (5286,)
Shape of y_train: (12334,)
Shape of y_test: (5286,)


In [117]:
# tokenize the data
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 1000, lower= True)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index)+1

In [118]:
X_train[0]

[354, 340, 226, 51, 10, 186, 180, 200, 42, 9]

In [122]:
# create padding
from keras_preprocessing.sequence import pad_sequences
maxlen = max(final_dataset["clean_reviews"].apply(lambda x: len(x.split())))

X_train = pad_sequences(X_train, padding="post", maxlen=maxlen)
X_test = pad_sequences(X_test, padding="post", maxlen=maxlen)


In [124]:
# reshape the data
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, SimpleRNN
from keras import optimizers
from keras.preprocessing.sequence import pad_sequences

X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1],1))
X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1],1))
print(X_train.shape)
print(X_test.shape)



(12334, 423, 1)
(5286, 423, 1)


### Fit the RNN model

In [125]:
num_classes = 3

def rnn_model():
  model = Sequential()
  model.add(SimpleRNN(50, input_shape = (maxlen, 1), return_sequences=False))
  model.add(Dense(num_classes))
  model.add(Activation("softmax"))
  model.summary()

  adam = optimizers.Adam(lr = 0.001)
  model.compile(loss = "categorical_crossentropy", optimizer = adam, metrics = ["accuracy"])
  return model

In [127]:
from keras.wrappers.scikit_learn import KerasClassifier
model = KerasClassifier(build_fn = rnn_model, epochs = 20, batch_size = 50)
model.fit(X_train, y_train)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 50)                2600      
_________________________________________________________________
dense (Dense)                (None, 3)                 153       
_________________________________________________________________
activation (Activation)      (None, 3)                 0         
Total params: 2,753
Trainable params: 2,753
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fe67a716630>