<h3>Data preprocessing</h3>

In [1]:
from pyspark.sql import SQLContext

# `sc` is not defined in this notebook; presumably it is the SparkContext
# provided by the PySpark kernel -- TODO confirm.
sqlContext = SQLContext(sc)

# Load the reviews CSV. The quote/escape/multiline and trailing-whitespace
# options work around read problems described in:
# https://stackoverflow.com/questions/40413526/reading-csv-files-with-quoted-fields-containing-embedded-commas
# https://stackoverflow.com/questions/50477857/spark-fails-to-read-csv-when-last-column-name-contains-spaces
data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(
        header='true',
        inferschema='true',
        quote='"',
        escape='"',
        multiline='true',
        ignoreTrailingWhiteSpace='true',
    )
    .load('Data/data.csv')
)

In [None]:
# Read the reviews into a pandas DataFrame for a quick look.
# The path is relative to the notebook's working directory (the same location
# the Spark reader uses) rather than a user-specific absolute path, so the
# cell runs on any machine that has the project layout.
# NOTE(review): confirm that Data/data.csv is the same file the original
# absolute path pointed to.

import pandas as pd
odf = pd.read_csv('Data/data.csv', encoding = 'unicode_escape')
odf.head()

In [None]:
# Build a single `text` column in pandas: review title, two spaces, review text.
# The bound str.format is applied to each row (axis=1), which it indexes by
# column name.
title_then_text = '{0[review_title]}  {0[review_text]}'.format
odf['text'] = odf.agg(title_then_text, axis=1)
odf.head()

In [3]:
# Preview the first five rows of the raw reviews.
data.show(5)

+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  X|          book_title|        review_title|   review_user|   book_id|     review_id| timestamp|         review_text|review_score|
+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  1|A Gentleman in Mo...|Russian aristocra...|    Kansabelle|0143110438|R2UFCQ9WES7VFH|1555241537|A great read. In ...|           4|
|  2|A Gentleman in Mo...|Knowing nothing a...|  D.P. McHenry|0143110438|R24B1HA9J9I99G|1555241542|Great story, well...|           4|
|  3|Pet Sematary: A N...|One of King's fin...|Gordon Hoffman|198211598X|R1P137WFADSBYR|1555241649|Only the second n...|           4|
|  4|Less (Winner of t...|     Not my favorite|     R. Zocher|0316316121|R35533AKR5CBNS|1555242044|This book is t wh...|           4|
|  5|         Supermarket|      AMAZING, BOBBY|    D. Mahoney|

In [4]:
# Show the column names and the types inferred by the CSV reader.
data.printSchema()

root
 |-- X: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_score: integer (nullable = true)



In [5]:
from pyspark.sql.functions import col

# Class balance: number of reviews per star rating, most frequent first.
(
    data
    .groupBy("review_score")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 3236|
|           4|  598|
|           3|  187|
|           1|  110|
|           2|  108|
+------------+-----+



In [6]:
# Total number of reviews before cleaning.
data.count()

4239

In [7]:
# Null check, one column at a time: uncomment a line to inspect that column.
# Only the review_text check is active.
#data.filter("review_score is NULL").show() # No nulls anymore
#data.filter("X is NULL").show() 
#data.filter("book_title is NULL").show()
#data.filter("review_title is NULL").show()
#data.filter("review_user is NULL").show()
#data.filter("book_id is NULL").show()
#data.filter("review_id is NULL").show()
#data.filter("timestamp is NULL").show()
data.filter("review_text is NULL").show() # There is one null

+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|   X|          book_title|        review_title|review_user|   book_id|     review_id| timestamp|review_text|review_score|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|1121|Lies My Doctor To...|This book gave a ...|   mawshell|162860378X|R3SIH2LVO3EYMH|1555252626|       null|           5|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+



In [8]:
# Discard the rows whose review_text is missing.
data = data.dropna(subset=["review_text"])

In [9]:
# Row count after dropping the null review_text row.
data.count() # Removed!

4238

In [10]:
# Concatenate review_title and review_text, separated by a single space, into a
# new `text` column. book_title is deliberately NOT included.
from pyspark.sql import functions as ff
data = data.withColumn('text', ff.concat(ff.col('review_title'), ff.lit(' '), ff.col('review_text'))) # book_title left out
data.show(1, truncate = False)

+---+------------------------------+----------------------------------------------------+-----------+----------+--------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|X  |book_title                    |review_title                                        |review_user|book_id   |review_id     |timestamp |review_text                                                                                                                                                                                              

In [11]:
# Keep only the target (review_score) and the merged `text` column.
drop_list = [
    'X', 'book_title', 'review_title', 'review_user',
    'book_id', 'review_id', 'timestamp', 'review_text',
]
data = data.drop(*drop_list)
data.show(20)

+------------+--------------------+
|review_score|                text|
+------------+--------------------+
|           4|Russian aristocra...|
|           4|Knowing nothing a...|
|           4|One of King's fin...|
|           4|Not my favorite T...|
|           4|AMAZING, BOBBY As...|
|           4|A really fun read...|
|           4|So truly enjoyabl...|
|           4|Well written and ...|
|           3|Not one of his be...|
|           4|Beautiful It's no...|
|           4|The last returns ...|
|           4|Don't open this b...|
|           4|Bobby Bestseller!...|
|           4|Listen to this in...|
|           4|A book with a mes...|
|           4|Glad I read this ...|
|           4|Gentleman From Mo...|
|           4|It is a terrific ...|
|           2|An OK read if the...|
|           4|Just as scary now...|
+------------+--------------------+
only showing top 20 rows



<h3>Now the modelling pipeline starts</h3>
The pipeline below is adapted from: https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [12]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk
#nltk.download("stopwords")

In [13]:
# 80/20 train/test split; the fixed seed makes the split repeatable.
trainingData, testData = data.randomSplit([0.8, 0.2], seed=12345)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3333
Test Dataset Count: 905


In [14]:
# Regular-expression tokenizer: splits `text` into `words` on every non-word
# character (regex \W).
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

In [15]:
# Stop words: filter `words` into `filtered` using NLTK's English stop-word list.
# The corpus is fetched on demand, so this cell also works on a fresh
# environment where it has never been downloaded (NLTK raises LookupError then).
try:
    stopwordList = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwordList = nltk.corpus.stopwords.words('english')
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwordList)

In [16]:
# Make sure the NLTK stop-word corpus is available locally.
# NOTE(review): the stop-word list is read in the preceding cell, so this
# download comes late in run order.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Juliana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Bag-of-words counts over the filtered tokens.
# vocabSize caps the vocabulary at 20000 terms; minDF=50 is the minimum
# document frequency a term needs to enter the vocabulary.
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=20000, minDF=50)

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Recode the target: review_score -> numeric `label` index.
label_stringIdx = StringIndexer(inputCol = "review_score", outputCol = "label") 
# Fit the indexer on the training data once, only to capture the label order,
# which IndexToString later needs to map predictions back to star ratings.
labels_stars = label_stringIdx.fit(trainingData).labels # Save this levels to be able later to transform back

# Feature pipeline: tokenize -> remove stop words -> count vectors -> index label.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

In [19]:
# Fit the feature pipeline on the training data only, then transform the
# training data with it.
pipelineFit = pipeline.fit(trainingData)
dataset = pipelineFit.transform(trainingData)
#dataset.show(1, truncate = False)
dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|*Eye roll* I firs...|[eye, roll, i, fi...|[eye, roll, first...|(415,[0,2,3,4,6,9...|  4.0|
|           1|1 of the normal H...|[1, of, the, norm...|[1, normal, start...|(415,[0,16,32,42,...|  4.0|
|           1|580 pages to just...|[580, pages, to, ...|[580, pages, quit...|(415,[0,1,2,3,4,5...|  4.0|
|           1|A political hit j...|[a, political, hi...|[political, hit, ...|(415,[0,22,54,227...|  4.0|
|           1|Amazing Read but ...|[amazing, read, b...|[amazing, read, d...|(415,[0,1,12,25,5...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [20]:
# Apply the pipeline fitted on the training data to the test data.
# Nothing is fitted here; the test set is only transformed.
test_dataset = pipelineFit.transform(testData)
#test_dataset.show(1, truncate = False)
test_dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A waste of money ...|[a, waste, of, mo...|[waste, money, ti...|(415,[5,10,56,375...|  4.0|
|           1|Attn AMAZON Kindl...|[attn, amazon, ki...|[attn, amazon, ki...|(415,[0,11,116,13...|  4.0|
|           1|Baid and Switch I...|[baid, and, switc...|[baid, switch, ha...|(415,[21,23,138,2...|  4.0|
|           1|Boring ! I'm a hu...|[boring, i, m, a,...|[boring, huge, fa...|(415,[2,22,144,30...|  4.0|
|           1|Boring Great plot...|[boring, great, p...|[boring, great, p...|(415,[4,25,63,230...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [None]:
# Training-set class counts under the original star-rating encoding.
(
    trainingData
    .groupBy("review_score")
    .count()
    .orderBy(col("count").desc())
    .show()
)

In [None]:
# The same counts grouped by the indexed `label`, to compare with the
# original star-rating encoding.
(
    dataset
    .groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show()
)

<h3>Logistic regression using count vector features</h3>

In [21]:
from pyspark.ml.feature import IndexToString

# Map the numeric `prediction` index back to the original star rating
# (`predictedScore`), using the label order saved in labels_stars.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedScore", labels = labels_stars)

In [22]:
# Baseline logistic regression on count-vector features, no regularisation.
lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel = lr.fit(dataset)

# Predict on the test set, then map label indices back to star ratings.
predictions = labelConverter.transform(lrModel.transform(test_dataset))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=20, truncate=45)
)

+---------------------------------------------+---------------------------------------------+------------+--------------+
|                                         text|                                  probability|review_score|predictedScore|
+---------------------------------------------+---------------------------------------------+------------+--------------+
|A masterpiece of theory from from a master...|[0.9999999999999991,7.954030889775193E-16,...|           4|             5|
|A Mentally Ill Woman, Anna Fox, Witnesses ...|[0.9999999999996585,3.415128690030579E-13,...|           4|             5|
|All Stiva, No Lev On the plus side, the no...|[0.9999999999981952,1.1338152616723868E-12...|           4|             5|
|exceptional novel - one of the best this y...|[0.9999999999943094,5.690332398125168E-12,...|           5|             5|
|Another blockbuster of a historical fictio...|[0.9999999851303771,1.4869556367930017E-8,...|           5|             5|
|An engaging (exciting, 

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the test predictions. The metric is spelled out because
# MulticlassClassificationEvaluator defaults to the F1 score, not accuracy.
# labelCol and metricName are set to the library defaults, so the value is
# unchanged. This evaluator is reused by the cross-validation cells below.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="f1")
evaluator.evaluate(predictions) # This is the F1 score

0.7156069853194736

In [24]:
# Inspect one full prediction row, including all intermediate columns.
predictions.head(1)

[Row(review_score=1, text='A waste of money and time Too long... too overly dramatic.  Narrator on audio needs to tone it down down down a few notches. Like waiting for paint to dry.', words=['a', 'waste', 'of', 'money', 'and', 'time', 'too', 'long', 'too', 'overly', 'dramatic', 'narrator', 'on', 'audio', 'needs', 'to', 'tone', 'it', 'down', 'down', 'down', 'a', 'few', 'notches', 'like', 'waiting', 'for', 'paint', 'to', 'dry'], filtered=['waste', 'money', 'time', 'long', 'overly', 'dramatic', 'narrator', 'audio', 'needs', 'tone', 'notches', 'like', 'waiting', 'paint', 'dry'], features=SparseVector(415, {5: 1.0, 10: 1.0, 56: 1.0, 375: 1.0, 395: 1.0}), label=4.0, rawPrediction=DenseVector([2.1111, -1.8611, -1.7852, -0.0291, 1.5643]), probability=DenseVector([0.5762, 0.0109, 0.0117, 0.0678, 0.3335]), prediction=0.0, predictedScore='5')]

<h4> Crossvalidation </h4>

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Cross-validated grid search over the logistic-regression regularisation.
lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# 6 x 6 parameter grid. Every value is written as a float because both params
# are float-valued; this avoids relying on implicit int-to-float conversion of
# grid values.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) # Elastic Net Parameter (Ridge = 0)
             .build())

# 5-fold cross-validation on the training features.
# `evaluator` is the MulticlassClassificationEvaluator created in an earlier cell.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

cvModel = cv.fit(dataset)

# Predict on the test set with the cross-validated model.
predictions = cvModel.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator.evaluate(predictions)

In [None]:
# Best model selected by the cross-validation.
best_model = cvModel.bestModel
best_model

In [None]:
# Read the winning hyper-parameters from the underlying Java model.
# NOTE(review): `_java_obj` is a private attribute; confirm whether the
# installed PySpark version exposes these values through a public getter.
best_reg_param = best_model._java_obj.getRegParam()
best_elasticnet_param = best_model._java_obj.getElasticNetParam()
print(best_reg_param);print(best_elasticnet_param)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=20, truncate=40)
)

In [None]:
# To evaluate and get confusion matrix: https://runawayhorse001.github.io/LearningApacheSpark/classification.html
# NOTE(review): this sketch is not runnable as written. It passes column-name
# keyword arguments and calls .evaluate(), and it refers to a "predictions"
# column while the notebook's column is "prediction". Presumably
# MulticlassMetrics expects an RDD of (prediction, label) pairs -- confirm
# against the mllib documentation before enabling.
#from pyspark.mllib.evaluation import MulticlassMetrics
#metrics = MulticlassMetrics(predictionCol="predictions", labelCol="label")
#metrics.evaluate(predictions)

<h3> Naive Bayes</h3>

In [25]:
from pyspark.ml.classification import NaiveBayes

# Naive Bayes on count-vector features.
nb = NaiveBayes(smoothing=1)
model = nb.fit(dataset)

# Predict on the test set, then map label indices back to star ratings.
predictions = labelConverter.transform(model.transform(test_dataset))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

+----------------------------------------+----------------------------------------+------------+--------------+
|                                    text|                             probability|review_score|predictedScore|
+----------------------------------------+----------------------------------------+------------+--------------+
|The best book you will ever read! By ...|[0.9999962814618197,3.503214632379414...|           5|             5|
|What a POWERFUL story!! Unplanned is ...|[0.9999770160552786,2.298346047933018...|           5|             5|
|Beautifully written with elegant Beau...|[0.9999733020487866,2.506506401722169...|           4|             5|
|Amazing, beautiful, accessible book! ...|[0.9999701132582072,2.156537250510015...|           5|             5|
|Bravo, Mr. Towles! After reading "Rul...|[0.9999586010516772,1.892012273915955...|           5|             5|
|exceptional novel - one of the best t...|[0.9999424971293328,5.642610535653649...|           5|        

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the Naive Bayes predictions with the evaluator's default metric.
# No metricName is passed; the Spark documentation gives F1 as the default.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7323989945784909

In [28]:
# End-to-end pipeline for export: feature stages, label indexing, the Naive
# Bayes classifier, and conversion of predictions back to star ratings.
# Everything is refitted here on the raw training data.
pipeline_nb_final = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx, nb, labelConverter])
pipeline_nb_export = pipeline_nb_final.fit(trainingData)

In [29]:
# Export the fitted pipeline to the 'pipeline_nb' directory.
# overwrite() replaces a previous export, so re-running the notebook does not
# fail because the target path already exists.
pipeline_nb_export.write().overwrite().save('pipeline_nb')

<h3>Random Forest</h3>

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on count-vector features: 100 trees of depth at most 4.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Train on the training features, predict on the test set, and map label
# indices back to star ratings.
rfModel = rf.fit(dataset)
predictions = labelConverter.transform(rfModel.transform(test_dataset))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Score the random-forest predictions with the evaluator's default metric.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

<h4> Crossvalidation </h4>

In [None]:
# Cross-validated grid search for the random forest on count-vector features.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
)

# 3 x 3 grid over forest size and tree depth.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [100, 200, 500])  # number of trees in the forest
    .addGrid(rf.maxDepth, [4, 10, 20])      # maximum depth of each tree
    .build()
)

# 5-fold cross-validation, scored with the evaluator created in an earlier cell.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel = cv.fit(dataset)

# Predict on the test set, then map label indices back to star ratings.
predictions = labelConverter.transform(cvModel.transform(test_dataset))

# Score the test predictions.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator.evaluate(predictions)

In [None]:
# Best random-forest model selected by the cross-validation.
best_model = cvModel.bestModel
best_model

In [None]:
# Hyper-parameters of the winning forest.
# NOTE(review): getNumTrees is referenced without a call; confirm it is a
# property (not a method) in the installed PySpark version.
best_numTrees = best_model.getNumTrees
best_maxDepth = best_model.getOrDefault('maxDepth')
print(best_numTrees);print(best_maxDepth)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

<h3>TF-IDF Features</h3>

In [30]:
from pyspark.ml.feature import HashingTF, IDF

In [31]:
# TF-IDF features: hash the filtered tokens into 20000 term-frequency buckets,
# then re-weight them with inverse document frequency.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

# Same tokenizer, stop-word filter and label indexer as the count-vector
# pipeline; only the feature stages differ.
pipeline_tfidf = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [32]:
# Fit the TF-IDF pipeline on the training data and transform the training data.
pipelineFit_tfidf = pipeline_tfidf.fit(trainingData)
dataset_tfidf = pipelineFit_tfidf.transform(trainingData)
dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|*Eye roll* I firs...|[eye, roll, i, fi...|[eye, roll, first...|(20000,[456,1305,...|(20000,[456,1305,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [33]:
# Transform the test data with the pipeline fitted on the training data.
test_dataset_tfidf = pipelineFit_tfidf.transform(testData)
test_dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A waste of money ...|[a, waste, of, mo...|[waste, money, ti...|(20000,[437,450,1...|(20000,[437,450,1...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



<h3>Logistic regression TF-IDF</h3>

In [34]:
# Baseline logistic regression on TF-IDF features, no regularisation.
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel_tfidf = lr_tfidf.fit(dataset_tfidf)

# Predict on the test set, then map label indices back to star ratings.
predictions_tfidf = labelConverter.transform(lrModel_tfidf.transform(test_dataset_tfidf))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_tfidf
    .filter(predictions_tfidf['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

+----------------------------------------+----------------------------------------+------------+--------------+
|                                    text|                             probability|review_score|predictedScore|
+----------------------------------------+----------------------------------------+------------+--------------+
|Beautifully done and down to earth I ...|[1.0,1.0951071313748669E-16,5.0335173...|           5|             5|
|SO Twisty Twisted “I want my kids to ...|[1.0,6.689446406497144E-17,2.61317220...|           5|             5|
|Engrossing story! During the first pa...|[1.0,3.252631500218192E-17,7.17720871...|           5|             5|
|SLOW BUILD TO THE TOP OF THE ROLLER C...|[1.0,3.044315421944498E-17,3.72133604...|           5|             5|
|On the edge of my seat The author rea...|[1.0,2.4405740959251534E-17,2.8124606...|           5|             5|
|Great book! This has been the best bo...|[1.0,1.6857201275336582E-17,4.7220161...|           5|        

In [35]:
# Score the TF-IDF logistic-regression predictions with the evaluator's
# default metric.
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.7240072934143862

In [36]:
# Inspect one full prediction row, including raw and IDF-weighted features.
predictions_tfidf.head(1)

[Row(review_score=1, text='A waste of money and time Too long... too overly dramatic.  Narrator on audio needs to tone it down down down a few notches. Like waiting for paint to dry.', words=['a', 'waste', 'of', 'money', 'and', 'time', 'too', 'long', 'too', 'overly', 'dramatic', 'narrator', 'on', 'audio', 'needs', 'to', 'tone', 'it', 'down', 'down', 'down', 'a', 'few', 'notches', 'like', 'waiting', 'for', 'paint', 'to', 'dry'], filtered=['waste', 'money', 'time', 'long', 'overly', 'dramatic', 'narrator', 'audio', 'needs', 'tone', 'notches', 'like', 'waiting', 'paint', 'dry'], rawFeatures=SparseVector(20000, {437: 1.0, 450: 1.0, 1933: 1.0, 3330: 1.0, 4511: 1.0, 5242: 1.0, 8904: 1.0, 12378: 1.0, 15666: 1.0, 15866: 1.0, 16256: 1.0, 16270: 1.0, 16303: 1.0, 18157: 1.0, 19556: 1.0}), features=SparseVector(20000, {437: 5.547, 450: 4.3507, 1933: 5.9147, 3330: 1.7335, 4511: 4.8538, 5242: 4.1607, 8904: 2.6826, 12378: 5.547, 15666: 3.9688, 15866: 4.6462, 16256: 5.1675, 16270: 0.0, 16303: 0.0, 181

<h4>Cross-validation</h4>

In [None]:
# Cross-validated grid search over the logistic-regression regularisation,
# on TF-IDF features.
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# 6 x 6 parameter grid. Every value is written as a float because both params
# are float-valued; this avoids relying on implicit int-to-float conversion of
# grid values.
paramGrid = (ParamGridBuilder()
             .addGrid(lr_tfidf.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_tfidf.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) # Elastic Net Parameter (Ridge = 0)
             .build())

# 5-fold cross-validation on the training features.
# `evaluator` is the MulticlassClassificationEvaluator created in an earlier cell.
cv_tfidf = CrossValidator(estimator=lr_tfidf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

cvModel_tfidf = cv_tfidf.fit(dataset_tfidf)

# Predict on the test set with the cross-validated model.
predictions_tfidf = cvModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

# Evaluate best model
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

In [None]:
# Best TF-IDF logistic-regression model selected by the cross-validation.
best_model_tfidf = cvModel_tfidf.bestModel
best_model_tfidf

In [None]:
# Read the winning hyper-parameters from the underlying Java model.
# NOTE(review): `_java_obj` is a private attribute; confirm whether the
# installed PySpark version exposes these values through a public getter.
best_reg_param_tfidf = best_model_tfidf._java_obj.getRegParam()
best_elasticnet_param_tfidf = best_model_tfidf._java_obj.getElasticNetParam()
print(best_reg_param_tfidf);print(best_elasticnet_param_tfidf)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_tfidf
    .filter(predictions_tfidf['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

<h3> Naive Bayes TF-IDF</h3>

In [None]:
# Naive Bayes on TF-IDF features.
nb_tfidf = NaiveBayes(smoothing=1)
model_tfidf = nb_tfidf.fit(dataset_tfidf)

# Predict on the test set, then map label indices back to star ratings.
predictions_tfidf = labelConverter.transform(model_tfidf.transform(test_dataset_tfidf))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_tfidf
    .filter(predictions_tfidf['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Score the TF-IDF Naive Bayes predictions with the evaluator's default metric.
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

<h3>Random Forest TF-IDF</h3>

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on TF-IDF features: 100 trees of depth at most 4.
rf_tfidf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Train on the training features, predict on the test set, and map label
# indices back to star ratings.
rfModel_tfidf = rf_tfidf.fit(dataset_tfidf)
predictions_tfidf = labelConverter.transform(rfModel_tfidf.transform(test_dataset_tfidf))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_tfidf
    .filter(predictions_tfidf['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Score the TF-IDF random-forest predictions with the evaluator's default metric.
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

<h4> Crossvalidation </h4>

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Cross-validated grid search for the random forest on TF-IDF features.
rf_tfidf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
)

# 3 x 3 grid over forest size and tree depth.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf_tfidf.numTrees, [100, 200, 500])  # number of trees in the forest
    .addGrid(rf_tfidf.maxDepth, [4, 10, 20])      # maximum depth of each tree
    .build()
)

# 5-fold cross-validation, scored with the evaluator created in an earlier cell.
cv_tfidf = CrossValidator(
    estimator=rf_tfidf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel_tfidf = cv_tfidf.fit(dataset_tfidf)

# Predict on the test set, then map label indices back to star ratings.
predictions_tfidf = labelConverter.transform(cvModel_tfidf.transform(test_dataset_tfidf))

# Score the test predictions.
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator_tfidf.evaluate(predictions_tfidf)

In [None]:
# Best TF-IDF random-forest model selected by the cross-validation.
best_model_tfidf = cvModel_tfidf.bestModel
best_model_tfidf

In [None]:
# Hyper-parameters of the winning forest.
# NOTE(review): getNumTrees is referenced without a call; confirm it is a
# property (not a method) in the installed PySpark version.
best_numTrees_tfidf = best_model_tfidf.getNumTrees
best_maxDepth_tfidf = best_model_tfidf.getOrDefault('maxDepth')
print(best_numTrees_tfidf);print(best_maxDepth_tfidf)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_tfidf
    .filter(predictions_tfidf['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

<h3> Word2Vec </h3>

In [None]:
from pyspark.ml.feature import Word2Vec

# Word2Vec features learned from the filtered tokens, written to `features`.
# NOTE(review): vectorSize=3 gives only three features per review, which looks
# very small for text classification -- confirm this is intended.
w2v = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")

In [None]:
# Word2Vec pipeline: tokenize -> remove stop words -> Word2Vec -> index label.
pipeline_w2v = Pipeline(stages=[regexTokenizer, stopwordsRemover, w2v, label_stringIdx])

In [None]:
# Fit the Word2Vec pipeline on the training data and transform the training data.
pipelineFit_w2v = pipeline_w2v.fit(trainingData)
dataset_w2v = pipelineFit_w2v.transform(trainingData)
dataset_w2v.show(1)

In [None]:
# Transform the test data with the pipeline fitted on the training data.
test_dataset_w2v = pipelineFit_w2v.transform(testData)
test_dataset_w2v.show(1)

<h3>Logistic regression Word2Vec</h3>

In [None]:
# Baseline logistic regression on Word2Vec features, no regularisation.
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel_w2v = lr_w2v.fit(dataset_w2v)

# Predict on the test set, then map label indices back to star ratings.
predictions_w2v = labelConverter.transform(lrModel_w2v.transform(test_dataset_w2v))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_w2v
    .filter(predictions_w2v['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Inspect one full prediction row, including the Word2Vec feature vector.
predictions_w2v.head(1)

In [None]:
# Score the Word2Vec logistic-regression predictions with the evaluator's
# default metric.
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

<h4>Cross-validation</h4>

In [None]:
# Cross-validated grid search over the logistic-regression regularisation,
# on Word2Vec features.
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# 6 x 6 parameter grid. Every value is written as a float because both params
# are float-valued; this avoids relying on implicit int-to-float conversion of
# grid values.
paramGrid = (ParamGridBuilder()
             .addGrid(lr_w2v.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_w2v.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) # Elastic Net Parameter (Ridge = 0)
             .build())

# 5-fold cross-validation on the training features.
# `evaluator` is the MulticlassClassificationEvaluator created in an earlier cell.
cv_w2v = CrossValidator(estimator=lr_w2v,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=5)

cvModel_w2v = cv_w2v.fit(dataset_w2v)

# Predict on the test set with the cross-validated model.
predictions_w2v = cvModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

# Evaluate best model
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

In [None]:
# Best Word2Vec logistic-regression model selected by the cross-validation.
best_model_w2v = cvModel_w2v.bestModel
best_model_w2v

In [None]:
# Read the winning hyper-parameters from the underlying Java model.
# NOTE(review): `_java_obj` is a private attribute; confirm whether the
# installed PySpark version exposes these values through a public getter.
best_reg_param_w2v = best_model_w2v._java_obj.getRegParam()
best_elasticnet_param_w2v = best_model_w2v._java_obj.getElasticNetParam()
print(best_reg_param_w2v);print(best_elasticnet_param_w2v)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_w2v
    .filter(predictions_w2v['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

<h3> Naive Bayes Word2Vec</h3>

In [None]:
# Naive Bayes on Word2Vec features.
# Spark's (multinomial) Naive Bayes requires non-negative feature values, but
# Word2Vec vectors contain negative components, so fitting on them directly
# fails. The vectors are therefore rescaled to [0, 1] with a MinMaxScaler that
# is fitted on the training data only.
from pyspark.ml.feature import MinMaxScaler

w2v_scaler = MinMaxScaler(inputCol="features", outputCol="nb_features")
w2v_scaler_model = w2v_scaler.fit(dataset_w2v)

nb_w2v = NaiveBayes(smoothing=1, featuresCol="nb_features")

model_w2v = nb_w2v.fit(w2v_scaler_model.transform(dataset_w2v))
# NOTE(review): test values outside the training range are scaled slightly
# outside [0, 1]; presumably prediction tolerates this -- confirm.
predictions_w2v = model_w2v.transform(w2v_scaler_model.transform(test_dataset_w2v))
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

# Test reviews assigned to label index 0, ordered by the probability column.
predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 15, truncate = 40)

In [None]:
# Score the Word2Vec Naive Bayes predictions with the evaluator's default metric.
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

<h3>Random Forest Word2Vec</h3>

In [None]:
# Random forest on Word2Vec features: 100 trees of depth at most 4.
rf_w2v = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
    maxBins=32,
)

# Train on the training features, predict on the test set, and map label
# indices back to star ratings.
rfModel_w2v = rf_w2v.fit(dataset_w2v)
predictions_w2v = labelConverter.transform(rfModel_w2v.transform(test_dataset_w2v))

# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_w2v
    .filter(predictions_w2v['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Score the Word2Vec random-forest predictions with the evaluator's default metric.
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

<h4> Crossvalidation </h4>

In [None]:
# Cross-validated grid search for the random forest on Word2Vec features.
rf_w2v = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=100,
    maxDepth=4,
)

# 3 x 3 grid over forest size and tree depth.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf_w2v.numTrees, [100, 200, 500])  # number of trees in the forest
    .addGrid(rf_w2v.maxDepth, [4, 10, 20])      # maximum depth of each tree
    .build()
)

# 5-fold cross-validation, scored with the evaluator created in an earlier cell.
cv_w2v = CrossValidator(
    estimator=rf_w2v,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)
cvModel_w2v = cv_w2v.fit(dataset_w2v)

# Predict on the test set, then map label indices back to star ratings.
predictions_w2v = labelConverter.transform(cvModel_w2v.transform(test_dataset_w2v))

# Score the test predictions.
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

In [None]:
# Best Word2Vec random-forest model selected by the cross-validation.
best_model_w2v = cvModel_w2v.bestModel
best_model_w2v

In [None]:
# Hyper-parameters of the winning forest.
# NOTE(review): getNumTrees is referenced without a call; confirm it is a
# property (not a method) in the installed PySpark version.
best_numTrees_w2v = best_model_w2v.getNumTrees
best_maxDepth_w2v = best_model_w2v.getOrDefault('maxDepth')
print(best_numTrees_w2v);print(best_maxDepth_w2v)

In [None]:
# Test reviews assigned to label index 0, ordered by the probability column.
(
    predictions_w2v
    .filter(predictions_w2v['prediction'] == 0)
    .select("text", "probability", "review_score", "predictedScore")
    .orderBy("probability", ascending=False)
    .show(n=15, truncate=40)
)

In [None]:
# Inspect one full prediction row from the cross-validated random forest.
predictions_w2v.head(1)