<h3>Data preprocessing</h3>

In [1]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', 
                                                                  inferschema='true', 
                                                                  quote = '"', 
                                                                  escape = '"',
                                                                 multiline = 'true',
                                                                 ignoreTrailingWhiteSpace = 'true').load('Data\\data.csv')

# There were some problems reading the data, here I found the solutions
# https://stackoverflow.com/questions/40413526/reading-csv-files-with-quoted-fields-containing-embedded-commas
#https://stackoverflow.com/questions/50477857/spark-fails-to-read-csv-when-last-column-name-contains-spaces

In [2]:
#drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)

+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  X|          book_title|        review_title|   review_user|   book_id|     review_id| timestamp|         review_text|review_score|
+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  1|A Gentleman in Mo...|Russian aristocra...|    Kansabelle|0143110438|R2UFCQ9WES7VFH|1555241537|A great read. In ...|           4|
|  2|A Gentleman in Mo...|Knowing nothing a...|  D.P. McHenry|0143110438|R24B1HA9J9I99G|1555241542|Great story, well...|           4|
|  3|Pet Sematary: A N...|One of King's fin...|Gordon Hoffman|198211598X|R1P137WFADSBYR|1555241649|Only the second n...|           4|
|  4|Less (Winner of t...|     Not my favorite|     R. Zocher|0316316121|R35533AKR5CBNS|1555242044|This book is t wh...|           4|
|  5|         Supermarket|      AMAZING, BOBBY|    D. Mahoney|

In [3]:
data.printSchema()

root
 |-- X: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_score: integer (nullable = true)



In [4]:
from pyspark.sql.functions import col

data.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 3236|
|           4|  598|
|           3|  187|
|           1|  110|
|           2|  108|
+------------+-----+



In [5]:
data.count()

4239

In [6]:
#data.filter("review_score is NULL").show() # No nulls anymore
#data.filter("X is NULL").show() 
#data.filter("book_title is NULL").show()
#data.filter("review_title is NULL").show()
#data.filter("review_user is NULL").show()
#data.filter("book_id is NULL").show()
#data.filter("review_id is NULL").show()
#data.filter("timestamp is NULL").show()
data.filter("review_text is NULL").show() # There is one null
#data.filter("review_score is NULL").show()

+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|   X|          book_title|        review_title|review_user|   book_id|     review_id| timestamp|review_text|review_score|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|1121|Lies My Doctor To...|This book gave a ...|   mawshell|162860378X|R3SIH2LVO3EYMH|1555252626|       null|           5|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+



In [7]:
# Remove observation where review_text is null
data = data.na.drop(subset=["review_text"])

In [8]:
data.count() # Removed!

4238

In [9]:
# Concatenate book_title, review_title and review_text into a single column
from pyspark.sql import functions as ff
data = data.withColumn('text', ff.concat(ff.col('book_title'),
                                              ff.lit(' '), 
                                              ff.col('review_title'),
                                             ff.lit(' '),
                                             ff.col('review_text')))
data.show(2, truncate = False)

+---+------------------------------+-----------------------------------------------------------------------------------------------+------------+----------+--------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|X  |book_title                    |review_title                                                                                   |review_user |book_id   |review_id     |timestamp |review_text                                                                       

In [10]:
drop_list = ['X', 'book_title', 'review_title', 'review_user', 'book_id', 'review_id', 
             'timestamp', 'review_text']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)

+------------+--------------------+
|review_score|                text|
+------------+--------------------+
|           4|A Gentleman in Mo...|
|           4|A Gentleman in Mo...|
|           4|Pet Sematary: A N...|
|           4|Less (Winner of t...|
|           4|Supermarket AMAZI...|
+------------+--------------------+
only showing top 5 rows



<h3>Now the modelling pipeline starts</h3>
I got it from: https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk
#nltk.download("stopwords")

In [12]:
# regular expression tokenizer: To split sentences into words
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

In [13]:
# stop words
stopwordList = nltk.corpus.stopwords.words('english')
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwordList)

In [14]:
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=20000, minDF=50)

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

label_stringIdx = StringIndexer(inputCol = "review_score", outputCol = "label") # Recoding target variable
labels_stars = label_stringIdx.fit(trainingData).labels # Save this levels to be able later to transform back

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

In [16]:
# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 12345)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3333
Test Dataset Count: 905


In [17]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(trainingData)
dataset = pipelineFit.transform(trainingData)
#dataset.show(1, truncate = False)
dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,3,5,6,1...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[1,2,3,5,6,8...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,3,5,6,3...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,4,5,6,1...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, eye, rol...|(453,[0,3,4,7,11,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [21]:
test_dataset = pipelineFit.transform(testData)
#test_dataset.show(1, truncate = False)
test_dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[1,2,3,4,5,6...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, baid, sw...|(453,[1,27,94,260...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, waste, t...|(453,[0,9,14,15,4...|  4.0|
|           1|Cemetery Road: A ...|[cemetery, road, ...|[cemetery, road, ...|(453,[0,1,15,131,...|  4.0|
|           1|Cemetery Road: A ...|[cemetery, road, ...|[cemetery, road, ...|(453,[1,14,32,58,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [18]:
# Original encoding
trainingData.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show() 

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 2543|
|           4|  466|
|           3|  146|
|           2|   90|
|           1|   88|
+------------+-----+



In [19]:
# Show how the encoding changed
dataset.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show() 

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 2543|
|  1.0|  466|
|  2.0|  146|
|  3.0|   90|
|  4.0|   88|
+-----+-----+



<h3>Logistic regression using count vector features</h3>

In [22]:
lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel = lr.fit(dataset)
predictions = lrModel.transform(test_dataset)
predictions.filter(predictions['prediction'] == 0) \
    .select("text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                          text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|The Woman in the Window: A ...|           4|[1.0,2.17284525898869E-18,4...|  1.0|       0.0|
|First: Sandra Day O'Connor ...|           5|[0.9999999999999467,5.31921...|  0.0|       0.0|
|QAnon: An Invitation to The...|           5|[0.9999999999986744,1.13697...|  0.0|       0.0|
|Someone Knows Another great...|           5|[0.9999999999485647,5.14352...|  0.0|       0.0|
|The Red Scrolls of Magic (T...|           3|[0.999999999414569,4.332639...|  2.0|       0.0|
|The Mister Creating your ow...|           5|[0.9999999637776866,3.61965...|  0.0|       0.0|
|The Woman in the Window: A ...|           5|[0.9999999544060393,5.58719...|  0.0|       0.0|
|Pet Sematary: A Novel Facin...|           5|[0.999999952776

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) # This is the accuracy

0.7033916817155373

In [53]:
from pyspark.ml.feature import IndexToString
# Saving levels 
# Transform back from index to original coding
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedScore", labels = labels_stars)
predictions = labelConverter.transform(predictions)

In [54]:
predictions.head(1)

[Row(review_score=1, text="A Gentleman in Moscow: A Novel Too slow This has to be one of the most boring books I've read.  It takes chapters upon chapters to move the story forward.  I stopped reading after then seventh chapter", words=['a', 'gentleman', 'in', 'moscow', 'a', 'novel', 'too', 'slow', 'this', 'has', 'to', 'be', 'one', 'of', 'the', 'most', 'boring', 'books', 'i', 've', 'read', 'it', 'takes', 'chapters', 'upon', 'chapters', 'to', 'move', 'the', 'story', 'forward', 'i', 'stopped', 'reading', 'after', 'then', 'seventh', 'chapter'], filtered=['gentleman', 'moscow', 'novel', 'slow', 'one', 'boring', 'books', 'read', 'takes', 'chapters', 'upon', 'chapters', 'move', 'story', 'forward', 'stopped', 'reading', 'seventh', 'chapter'], features=SparseVector(453, {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 15: 1.0, 19: 1.0, 135: 1.0, 197: 1.0, 199: 1.0, 223: 1.0, 328: 2.0, 379: 1.0}), label=4.0, rawPrediction=DenseVector([0.6379, 0.0223, -0.0876, 1.61, -2.1826]), probability=DenseV

<h4> Crossvalidation </h4>

In [25]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(dataset)

predictions = cvModel.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator.evaluate(predictions)

0.7261842626917203

In [41]:
# To evaluate and get confusion matrix: https://runawayhorse001.github.io/LearningApacheSpark/classification.html
#from pyspark.mllib.evaluation import MulticlassMetrics
#metrics = MulticlassMetrics(predictionCol="predictions", labelCol="label")
#metrics.evaluate(predictions)

<h3>Logistic Regression using TF-IDF Features</h3>

In [48]:
from pyspark.ml.feature import HashingTF, IDF

In [49]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline_tfidf = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [50]:
# applying pipeline to training data
pipelineFit_tfidf = pipeline_tfidf.fit(trainingData)
dataset_tfidf = pipelineFit_tfidf.transform(trainingData)
dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(20000,[702,1106,...|(20000,[702,1106,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [51]:
test_dataset_tfidf = pipelineFit_tfidf.transform(testData)
test_dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(20000,[415,591,2...|(20000,[415,591,2...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [55]:
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel_tfidf = lr_tfidf.fit(dataset_tfidf)
predictions_tfidf = lrModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels
predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Pet Sematary: A Novel perfe...|[1.0,1.0815308445822547E-16...|           4|             5|
|The Island of Sea Women: A ...|[1.0,8.278481062705543E-17,...|           5|             5|
|Cat and Nat's Mom Truths: E...|[1.0,3.097684754256849E-17,...|           5|             5|
|The Mister EL JAMES did it ...|[1.0,2.041465137202726E-17,...|           5|             5|
|Unlearn: 101 Simple Truths ...|[1.0,1.8135263570340965E-17...|           5|             5|
|Eat to Beat Disease: The Ne...|[1.0,3.570469271328641E-18,...|           5|             5|
|Run Away Harlan Coben at Hi...|[1.0,1.372681552457108E-18,...|           4|             5|
|Accidental Presidents: Eigh...|[1.0,1.3361246436745188E-18...|           5|    

In [56]:
predictions_tfidf.show(1, truncate = False)

+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.7211881145739505

<h4>Cross-validation</h4>

In [32]:
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr_tfidf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_tfidf = cv.fit(dataset_tfidf)

predictions_tfidf = cvModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

# Evaluate best model
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

KeyboardInterrupt: 

<h3> Word2Vec </h3>

In [29]:
from pyspark.ml.feature import Word2Vec

w2v = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")


In [30]:
pipeline_w2v = Pipeline(stages=[regexTokenizer, stopwordsRemover, w2v, label_stringIdx])

In [32]:
# applying pipeline to training data
pipelineFit_w2v = pipeline_w2v.fit(trainingData)
dataset_w2v = pipelineFit_w2v.transform(trainingData)
dataset_w2v.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|[0.04381944184812...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [33]:
test_dataset_w2v = pipelineFit_w2v.transform(testData)
test_dataset_w2v.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|[0.02492030376666...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [34]:
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)
lrModel_w2v = lr_w2v.fit(dataset_w2v)
predictions_w2v = lrModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels
predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                          text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|Lies My Doctor Told Me Seco...|           5|[0.9646835515630324,0.00309...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[0.9608090588343119,0.00303...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[0.9575811593800235,0.00420...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[0.9548220408240203,0.00547...|  0.0|       0.0|
|QAnon: An Invitation to The...|           5|[0.9542220269819321,0.01907...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[0.9465833107354823,0.00784...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[0.942610538423865,0.010679...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           4|[0.941303421943

In [35]:
predictions_w2v.show(1, truncate = False)

+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------+-----+----------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------+----------+
|review_score|text                                                                                                  

In [36]:
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

0.670660555402759

<h4>Cross-validation</h4>

In [32]:
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr_w2v, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_w2v = cv.fit(dataset_w2v)

predictions_w2v = cvModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

# Evaluate best model
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

KeyboardInterrupt: 

<h3> Naive Bayes</h3>

In [45]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(dataset)
predictions = model.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                          text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|A Gentleman in Moscow: A No...|           5|[1.0,9.770840117995038E-17,...|  0.0|       0.0|
|A Gentleman in Moscow: A No...|           5|[1.0,8.261090898410058E-17,...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[1.0,5.948914789946586E-17,...|  0.0|       0.0|
|Less (Winner of the Pulitze...|           5|[1.0,5.816130134888463E-17,...|  0.0|       0.0|
|A Gentleman in Moscow: A No...|           5|[1.0,5.81060272265125E-17,5...|  0.0|       0.0|
|Can't Make This Stuff Up!: ...|           5|[1.0,5.635006428149414E-17,...|  0.0|       0.0|
|Pet Sematary: A Novel Great...|           4|[1.0,5.348160890282632E-17,...|  1.0|       0.0|
|Pet Sematary: A Novel Shine...|           5|[1.0,4.55505475

In [172]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7299769820931572

<h3> Naive Bayes TF-IDF</h3>

In [45]:

nb_tfidf = NaiveBayes(smoothing=1)
model_tfidf = nb_tfidf.fit(dataset_tfidf)
predictions_tfidf = model_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels
predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                          text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|A Gentleman in Moscow: A No...|           5|[1.0,9.770840117995038E-17,...|  0.0|       0.0|
|A Gentleman in Moscow: A No...|           5|[1.0,8.261090898410058E-17,...|  0.0|       0.0|
|Lies My Doctor Told Me Seco...|           5|[1.0,5.948914789946586E-17,...|  0.0|       0.0|
|Less (Winner of the Pulitze...|           5|[1.0,5.816130134888463E-17,...|  0.0|       0.0|
|A Gentleman in Moscow: A No...|           5|[1.0,5.81060272265125E-17,5...|  0.0|       0.0|
|Can't Make This Stuff Up!: ...|           5|[1.0,5.635006428149414E-17,...|  0.0|       0.0|
|Pet Sematary: A Novel Great...|           4|[1.0,5.348160890282632E-17,...|  1.0|       0.0|
|Pet Sematary: A Novel Shine...|           5|[1.0,4.55505475

In [172]:
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.7299769820931572

<h3>Random Forest</h3>

In [174]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                   review_text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|Loved this story. Beautiful...|           4|[0.7836402391282297,0.12735...|  1.0|       0.0|
|Very informative about Keto...|           5|[0.7836402391282297,0.12735...|  0.0|       0.0|
|One of the most enjoyable b...|           5|[0.7836402391282297,0.12735...|  0.0|       0.0|
|                Wonderful book|           4|[0.7836402391282297,0.12735...|  1.0|       0.0|
|Love the easy to follow roa...|           5|[0.7830368095617698,0.12784...|  0.0|       0.0|
|Poignant and very satisfyin...|           5|[0.7828176595848821,0.12763...|  0.0|       0.0|
|Wonderful book. Beautiful p...|           5|[0.7827614778707869,0.12824...|  0.0|       0.0|
|Well written,wonderful char...|           5|[0.782761477870

In [175]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6777035552331478