<h3>Data preprocessing</h3>

In [1]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', 
                                                                  inferschema='true', 
                                                                  quote = '"', 
                                                                  escape = '"',
                                                                 multiline = 'true',
                                                                 ignoreTrailingWhiteSpace = 'true').load('Data\\data.csv')

# There were some problems reading the data, here I found the solutions
# https://stackoverflow.com/questions/40413526/reading-csv-files-with-quoted-fields-containing-embedded-commas
#https://stackoverflow.com/questions/50477857/spark-fails-to-read-csv-when-last-column-name-contains-spaces

In [2]:
#drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)

+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  X|          book_title|        review_title|   review_user|   book_id|     review_id| timestamp|         review_text|review_score|
+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  1|A Gentleman in Mo...|Russian aristocra...|    Kansabelle|0143110438|R2UFCQ9WES7VFH|1555241537|A great read. In ...|           4|
|  2|A Gentleman in Mo...|Knowing nothing a...|  D.P. McHenry|0143110438|R24B1HA9J9I99G|1555241542|Great story, well...|           4|
|  3|Pet Sematary: A N...|One of King's fin...|Gordon Hoffman|198211598X|R1P137WFADSBYR|1555241649|Only the second n...|           4|
|  4|Less (Winner of t...|     Not my favorite|     R. Zocher|0316316121|R35533AKR5CBNS|1555242044|This book is t wh...|           4|
|  5|         Supermarket|      AMAZING, BOBBY|    D. Mahoney|

In [3]:
data.printSchema()

root
 |-- X: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_score: integer (nullable = true)



In [4]:
from pyspark.sql.functions import col

data.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 3236|
|           4|  598|
|           3|  187|
|           1|  110|
|           2|  108|
+------------+-----+



In [5]:
data.count()

4239

In [6]:
#data.filter("review_score is NULL").show() # No nulls anymore
#data.filter("X is NULL").show() 
#data.filter("book_title is NULL").show()
#data.filter("review_title is NULL").show()
#data.filter("review_user is NULL").show()
#data.filter("book_id is NULL").show()
#data.filter("review_id is NULL").show()
#data.filter("timestamp is NULL").show()
data.filter("review_text is NULL").show() # There is one null
#data.filter("review_score is NULL").show()

+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|   X|          book_title|        review_title|review_user|   book_id|     review_id| timestamp|review_text|review_score|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+
|1121|Lies My Doctor To...|This book gave a ...|   mawshell|162860378X|R3SIH2LVO3EYMH|1555252626|       null|           5|
+----+--------------------+--------------------+-----------+----------+--------------+----------+-----------+------------+



In [7]:
# Remove observation where review_text is null
data = data.na.drop(subset=["review_text"])

In [8]:
data.count() # Removed!

4238

In [9]:
# Concatenate book_title, review_title and review_text into a single column
from pyspark.sql import functions as ff
data = data.withColumn('text', ff.concat(ff.col('book_title'),
                                              ff.lit(' '), 
                                              ff.col('review_title'),
                                             ff.lit(' '),
                                             ff.col('review_text')))
data.show(2, truncate = False)

+---+------------------------------+-----------------------------------------------------------------------------------------------+------------+----------+--------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|X  |book_title                    |review_title                                                                                   |review_user |book_id   |review_id     |timestamp |review_text                                                                       

In [10]:
drop_list = ['X', 'book_title', 'review_title', 'review_user', 'book_id', 'review_id', 
             'timestamp', 'review_text']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)

+------------+--------------------+
|review_score|                text|
+------------+--------------------+
|           4|A Gentleman in Mo...|
|           4|A Gentleman in Mo...|
|           4|Pet Sematary: A N...|
|           4|Less (Winner of t...|
|           4|Supermarket AMAZI...|
+------------+--------------------+
only showing top 5 rows



<h3>Now the modelling pipeline starts</h3>
I got it from: https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [11]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk
#nltk.download("stopwords")

In [12]:
# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed = 12345)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3333
Test Dataset Count: 905


In [13]:
# regular expression tokenizer: To split sentences into words
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

In [14]:
# stop words
stopwordList = nltk.corpus.stopwords.words('english')
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwordList)

In [15]:
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=20000, minDF=50)

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Recoding target variable
label_stringIdx = StringIndexer(inputCol = "review_score", outputCol = "label") 
labels_stars = label_stringIdx.fit(trainingData).labels # Save this levels to be able later to transform back

# Create pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

In [17]:
# Fit the pipeline to training data.
pipelineFit = pipeline.fit(trainingData)
dataset = pipelineFit.transform(trainingData)
#dataset.show(1, truncate = False)
dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,3,5,6,1...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[1,2,3,5,6,8...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,3,5,6,3...|  4.0|
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[0,1,4,5,6,1...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, eye, rol...|(453,[0,3,4,7,11,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [18]:
# Fit the pipeline to test data.
test_dataset = pipelineFit.transform(testData)
#test_dataset.show(1, truncate = False)
test_dataset.show(5)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(453,[1,2,3,4,5,6...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, baid, sw...|(453,[1,27,94,260...|  4.0|
|           1|After (The After ...|[after, the, afte...|[series, waste, t...|(453,[0,9,14,15,4...|  4.0|
|           1|Cemetery Road: A ...|[cemetery, road, ...|[cemetery, road, ...|(453,[0,1,15,131,...|  4.0|
|           1|Cemetery Road: A ...|[cemetery, road, ...|[cemetery, road, ...|(453,[1,14,32,58,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [19]:
# Original encoding
trainingData.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show() 

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 2543|
|           4|  466|
|           3|  146|
|           2|   90|
|           1|   88|
+------------+-----+



In [20]:
# Show how the encoding changed
dataset.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show() 

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 2543|
|  1.0|  466|
|  2.0|  146|
|  3.0|   90|
|  4.0|   88|
+-----+-----+



<h3>Logistic regression using count vector features</h3>

In [21]:
from pyspark.ml.feature import IndexToString

# Transform back from index to original coding
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedScore", labels = labels_stars)

In [22]:
lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

lrModel = lr.fit(dataset) # Fit model
predictions = lrModel.transform(test_dataset) # Predict
predictions = labelConverter.transform(predictions) # Transform labels

predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|The Woman in the Window: A ...|[1.0,2.17284525898869E-18,4...|           4|             5|
|First: Sandra Day O'Connor ...|[0.9999999999999467,5.31921...|           5|             5|
|QAnon: An Invitation to The...|[0.9999999999986744,1.13697...|           5|             5|
|Someone Knows Another great...|[0.9999999999485647,5.14352...|           5|             5|
|The Red Scrolls of Magic (T...|[0.999999999414569,4.332639...|           3|             5|
|The Mister Creating your ow...|[0.9999999637776866,3.61965...|           5|             5|
|The Woman in the Window: A ...|[0.9999999544060393,5.58719...|           5|             5|
|Pet Sematary: A Novel Facin...|[0.9999999527766081,4.72233...|           5|    

In [23]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) # This is the accuracy

0.7033916817155373

In [24]:
predictions.head(1)

[Row(review_score=1, text="A Gentleman in Moscow: A Novel Too slow This has to be one of the most boring books I've read.  It takes chapters upon chapters to move the story forward.  I stopped reading after then seventh chapter", words=['a', 'gentleman', 'in', 'moscow', 'a', 'novel', 'too', 'slow', 'this', 'has', 'to', 'be', 'one', 'of', 'the', 'most', 'boring', 'books', 'i', 've', 'read', 'it', 'takes', 'chapters', 'upon', 'chapters', 'to', 'move', 'the', 'story', 'forward', 'i', 'stopped', 'reading', 'after', 'then', 'seventh', 'chapter'], filtered=['gentleman', 'moscow', 'novel', 'slow', 'one', 'boring', 'books', 'read', 'takes', 'chapters', 'upon', 'chapters', 'move', 'story', 'forward', 'stopped', 'reading', 'seventh', 'chapter'], features=SparseVector(453, {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 15: 1.0, 19: 1.0, 135: 1.0, 197: 1.0, 199: 1.0, 223: 1.0, 328: 2.0, 379: 1.0}), label=4.0, rawPrediction=DenseVector([1.5018, 0.269, -1.0457, 4.3374, -5.0626]), probability=Dense

<h4> Crossvalidation </h4>

In [25]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

lr = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(dataset)

predictions = cvModel.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator.evaluate(predictions)

0.7261842626917203

In [26]:
best_model = cvModel.bestModel
best_model

LogisticRegressionModel: uid = LogisticRegression_ed633c6d7e1a, numClasses = 5, numFeatures = 453

In [27]:
best_reg_param = best_model._java_obj.getRegParam()
best_elasticnet_param = best_model._java_obj.getElasticNetParam()
print(best_reg_param);print(best_elasticnet_param)

0.01
0.0


In [28]:
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|The Woman in the Window: A ...|[0.999999978876327,1.965497...|           4|             5|
|First: Sandra Day O'Connor ...|[0.9999991450731959,7.94014...|           5|             5|
|Someone Knows Another great...|[0.9999972860920413,2.66768...|           5|             5|
|QAnon: An Invitation to The...|[0.9999671627047327,1.31596...|           5|             5|
|The Red Scrolls of Magic (T...|[0.9999485073589646,1.54137...|           3|             5|
|The Mister Creating your ow...|[0.9999150742102599,6.23867...|           5|             5|
|When We Left Cuba Suspensef...|[0.9998202299922372,1.79740...|           5|             5|
|Directorate S: The C.I.A. a...|[0.9998157110227579,1.84287...|           5|    

In [29]:
# To evaluate and get confusion matrix: https://runawayhorse001.github.io/LearningApacheSpark/classification.html
#from pyspark.mllib.evaluation import MulticlassMetrics
#metrics = MulticlassMetrics(predictionCol="predictions", labelCol="label")
#metrics.evaluate(predictions)

<h3> Naive Bayes</h3>

In [30]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1)

model = nb.fit(dataset)
predictions = model.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|First: Sandra Day O'Connor ...|[0.9999999995064941,4.86284...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9999977415013812,1.62760...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9999945873950603,5.31709...|           5|             5|
|Daisy Jones & The Six: A No...|[0.9999942371903164,5.03477...|           4|             5|
|Lies My Doctor Told Me Seco...|[0.9999921809201134,7.53685...|           4|             5|
|Lies My Doctor Told Me Seco...|[0.9999870737041547,1.28940...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9999866817292232,1.29695...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9999844440279964,1.37220...|           5|    

In [31]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7049903730296888

<h3>Random Forest</h3>

In [32]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(dataset)
predictions = rfModel.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|A Gentleman in Moscow: A No...|[0.8025993463262777,0.12106...|           5|             5|
|A Gentleman in Moscow: A No...|[0.8012346620322707,0.12235...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8011775690836588,0.12246...|           5|             5|
|Girl, Stop Apologizing: A S...|[0.8010832253126122,0.12274...|           5|             5|
|Hashimoto’s Food Pharmacolo...|[0.8009293466066767,0.12260...|           5|             5|
|This Is Me: Loving the Pers...|[0.8009019768572124,0.12257...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8007807650346179,0.12270...|           5|             5|
|A Gentleman in Moscow: A No...|[0.8007615139368367,0.12194...|           5|    

In [33]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6626568462275606

<h4> Crossvalidation </h4>

In [34]:
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [100, 200, 500]) # regularization parameter
             .addGrid(rf.maxDepth, [4, 10, 20]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(dataset)

predictions = cvModel.transform(test_dataset)
predictions = labelConverter.transform(predictions) # Transform labels

# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator.evaluate(predictions)

0.6826623645296218

In [35]:
best_model = cvModel.bestModel
best_model

RandomForestClassificationModel (uid=RandomForestClassifier_8d6c20de5c07) with 500 trees

In [36]:
best_numTrees = best_model.getNumTrees
best_maxDepth = best_model.getOrDefault('maxDepth')
print(best_numTrees);print(best_maxDepth)

500
20


In [37]:
predictions.filter(predictions['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|A Gentleman in Moscow: A No...|[0.9077673122826685,0.06315...|           5|             5|
|Exceptional You!: 7 Ways to...|[0.9074103142624389,0.06276...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9073340976733448,0.06276...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9066922542810658,0.06376...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9062303795868857,0.06194...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9061756563057325,0.06402...|           4|             5|
|A Love Letter Life: Pursue ...|[0.9061325932057224,0.06401...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9056915353492222,0.06399...|           5|    

<h3>TF-IDF Features</h3>

In [38]:
from pyspark.ml.feature import HashingTF, IDF

In [39]:
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

pipeline_tfidf = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [40]:
# applying pipeline to training data
pipelineFit_tfidf = pipeline_tfidf.fit(trainingData)
dataset_tfidf = pipelineFit_tfidf.transform(trainingData)
dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(20000,[702,1106,...|(20000,[702,1106,...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [41]:
# applying pipeline to test data
test_dataset_tfidf = pipelineFit_tfidf.transform(testData)
test_dataset_tfidf.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|         rawFeatures|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|(20000,[415,591,2...|(20000,[415,591,2...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



<h3>Logistic regression TF-IDF</h3>

In [42]:
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

lrModel_tfidf = lr_tfidf.fit(dataset_tfidf)
predictions_tfidf = lrModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Pet Sematary: A Novel perfe...|[1.0,1.0815308445822547E-16...|           4|             5|
|The Island of Sea Women: A ...|[1.0,8.278481062705543E-17,...|           5|             5|
|Cat and Nat's Mom Truths: E...|[1.0,3.097684754256849E-17,...|           5|             5|
|The Mister EL JAMES did it ...|[1.0,2.041465137202726E-17,...|           5|             5|
|Unlearn: 101 Simple Truths ...|[1.0,1.8135263570340965E-17...|           5|             5|
|Eat to Beat Disease: The Ne...|[1.0,3.570469271328641E-18,...|           5|             5|
|Run Away Harlan Coben at Hi...|[1.0,1.372681552457108E-18,...|           4|             5|
|Accidental Presidents: Eigh...|[1.0,1.3361246436745188E-18...|           5|    

In [43]:
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.7211881145739505

In [44]:
predictions_tfidf.head(1)

[Row(review_score=1, text="A Gentleman in Moscow: A Novel Too slow This has to be one of the most boring books I've read.  It takes chapters upon chapters to move the story forward.  I stopped reading after then seventh chapter", words=['a', 'gentleman', 'in', 'moscow', 'a', 'novel', 'too', 'slow', 'this', 'has', 'to', 'be', 'one', 'of', 'the', 'most', 'boring', 'books', 'i', 've', 'read', 'it', 'takes', 'chapters', 'upon', 'chapters', 'to', 'move', 'the', 'story', 'forward', 'i', 'stopped', 'reading', 'after', 'then', 'seventh', 'chapter'], filtered=['gentleman', 'moscow', 'novel', 'slow', 'one', 'boring', 'books', 'read', 'takes', 'chapters', 'upon', 'chapters', 'move', 'story', 'forward', 'stopped', 'reading', 'seventh', 'chapter'], rawFeatures=SparseVector(20000, {415: 1.0, 591: 2.0, 2044: 1.0, 3851: 1.0, 4300: 1.0, 5290: 1.0, 5499: 1.0, 7044: 1.0, 7650: 1.0, 9504: 1.0, 11997: 1.0, 16282: 1.0, 16657: 1.0, 16735: 1.0, 17252: 1.0, 18203: 1.0, 18834: 1.0, 19254: 1.0}), features=Sparse

<h4>Cross-validation</h4>

In [45]:
lr_tfidf = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr_tfidf.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_tfidf.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv_tfidf = CrossValidator(estimator=lr_tfidf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_tfidf = cv_tfidf.fit(dataset_tfidf)

predictions_tfidf = cvModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

# Evaluate best model
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.737984333434717

In [46]:
best_model_tfidf = cvModel_tfidf.bestModel
best_model_tfidf

LogisticRegressionModel: uid = LogisticRegression_d8fbbc6068b1, numClasses = 5, numFeatures = 20000

In [47]:
best_reg_param_tfidf = best_model_tfidf._java_obj.getRegParam()
best_elasticnet_param_tfidf = best_model_tfidf._java_obj.getElasticNetParam()
print(best_reg_param_tfidf);print(best_elasticnet_param_tfidf)

0.01
0.2


In [48]:
predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Someone Knows Another great...|[0.9999986067796225,9.71800...|           5|             5|
|When We Left Cuba Suspensef...|[0.9999962127269121,3.07337...|           5|             5|
|First: Sandra Day O'Connor ...|[0.9999525701237956,4.53978...|           5|             5|
|Pet Sematary: A Novel Facin...|[0.9999213115576286,7.85227...|           5|             5|
|BraveTart: Iconic American ...|[0.9997483263122545,9.94285...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9995737652200387,3.02371...|           5|             5|
|Nopalito: A Mexican Kitchen...|[0.9995521253691966,1.35291...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9993147045538943,1.44494...|           5|    

<h3> Naive Bayes TF-IDF</h3>

In [49]:
nb_tfidf = NaiveBayes(smoothing=1)

model_tfidf = nb_tfidf.fit(dataset_tfidf)
predictions_tfidf = model_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|A Gentleman in Moscow: A No...|[1.0,9.770840117995038E-17,...|           5|             5|
|A Gentleman in Moscow: A No...|[1.0,8.261090898410058E-17,...|           5|             5|
|Lies My Doctor Told Me Seco...|[1.0,5.948914789946586E-17,...|           5|             5|
|Less (Winner of the Pulitze...|[1.0,5.816130134888463E-17,...|           5|             5|
|A Gentleman in Moscow: A No...|[1.0,5.81060272265125E-17,5...|           5|             5|
|Can't Make This Stuff Up!: ...|[1.0,5.635006428149414E-17,...|           5|             5|
|Pet Sematary: A Novel Great...|[1.0,5.348160890282632E-17,...|           4|             5|
|Pet Sematary: A Novel Shine...|[1.0,4.5550547500523756E-17...|           5|    

In [50]:
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.7441898341686789

<h3>Random Forest TF-IDF</h3>

In [51]:
from pyspark.ml.classification import RandomForestClassifier

rf_tfidf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel_tfidf = rf_tfidf.fit(dataset_tfidf)
predictions_tfidf = rfModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Lies My Doctor Told Me Seco...|[0.7835181621868612,0.12887...|           5|             5|
|A Love Letter Life: Pursue ...|[0.781899783057155,0.129488...|           5|             5|
|Exceptional You!: 7 Ways to...|[0.78045612728447,0.1302248...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.7803228369358045,0.13034...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.7802044223825271,0.13059...|           5|             5|
|Girl, Stop Apologizing: A S...|[0.7795375119519148,0.13011...|           5|             5|
|WOLFPACK: How to Come Toget...|[0.7793085093005128,0.13069...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.779175477103831,0.131877...|           5|    

In [52]:
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_tfidf.evaluate(predictions_tfidf)

0.6626568462275606

<h4> Crossvalidation </h4>

In [53]:
rf_tfidf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf_tfidf.numTrees, [100, 200, 500]) # regularization parameter
             .addGrid(rf_tfidf.maxDepth, [4, 10, 20]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv_tfidf = CrossValidator(estimator=rf_tfidf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_tfidf = cv_tfidf.fit(dataset_tfidf)

predictions_tfidf = cvModel_tfidf.transform(test_dataset_tfidf)
predictions_tfidf = labelConverter.transform(predictions_tfidf) # Transform labels

# Evaluate best model
evaluator_tfidf = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
evaluator_tfidf.evaluate(predictions_tfidf)

0.680264771399809

In [54]:
best_model_tfidf = cvModel_tfidf.bestModel
best_model_tfidf

RandomForestClassificationModel (uid=RandomForestClassifier_05f1ed3aab59) with 500 trees

In [55]:
best_numTrees_tfidf = best_model_tfidf.getNumTrees
best_maxDepth_tfidf = best_model_tfidf.getOrDefault('maxDepth')
print(best_numTrees_tfidf);print(best_maxDepth_tfidf)

500
20


In [56]:
predictions_tfidf.filter(predictions_tfidf['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Lies My Doctor Told Me Seco...|[0.8458043649570385,0.09648...|           5|             5|
|A Gentleman in Moscow: A No...|[0.844207153971729,0.097321...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8428998225423476,0.09625...|           5|             5|
|Can't Make This Stuff Up!: ...|[0.842465196358295,0.096097...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.842240359993462,0.095738...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8421765385283241,0.09534...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8418983692358402,0.09508...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8414064631240836,0.09780...|           5|    

<h3> Word2Vec </h3>

In [57]:
from pyspark.ml.feature import Word2Vec

w2v = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")

In [58]:
pipeline_w2v = Pipeline(stages=[regexTokenizer, stopwordsRemover, w2v, label_stringIdx])

In [59]:
# applying pipeline to training data
pipelineFit_w2v = pipeline_w2v.fit(trainingData)
dataset_w2v = pipelineFit_w2v.transform(trainingData)
dataset_w2v.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|[0.04381944184812...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [60]:
# applying pipeline to test data
test_dataset_w2v = pipelineFit_w2v.transform(testData)
test_dataset_w2v.show(1)

+------------+--------------------+--------------------+--------------------+--------------------+-----+
|review_score|                text|               words|            filtered|            features|label|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
|           1|A Gentleman in Mo...|[a, gentleman, in...|[gentleman, mosco...|[0.02492030376666...|  4.0|
+------------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



<h3>Logistic regression Word2Vec</h3>

In [61]:
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

lrModel_w2v = lr_w2v.fit(dataset_w2v)
predictions_w2v = lrModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Lies My Doctor Told Me Seco...|[0.9646835515630324,0.00309...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9608090588343119,0.00303...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9575811593800235,0.00420...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9548220408240203,0.00547...|           5|             5|
|QAnon: An Invitation to The...|[0.9542220269819321,0.01907...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9465833107354823,0.00784...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.942610538423865,0.010679...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9413034219431073,0.00743...|           4|    

In [62]:
predictions_w2v.head(1)

[Row(review_score=1, text="A Gentleman in Moscow: A Novel Too slow This has to be one of the most boring books I've read.  It takes chapters upon chapters to move the story forward.  I stopped reading after then seventh chapter", words=['a', 'gentleman', 'in', 'moscow', 'a', 'novel', 'too', 'slow', 'this', 'has', 'to', 'be', 'one', 'of', 'the', 'most', 'boring', 'books', 'i', 've', 'read', 'it', 'takes', 'chapters', 'upon', 'chapters', 'to', 'move', 'the', 'story', 'forward', 'i', 'stopped', 'reading', 'after', 'then', 'seventh', 'chapter'], filtered=['gentleman', 'moscow', 'novel', 'slow', 'one', 'boring', 'books', 'read', 'takes', 'chapters', 'upon', 'chapters', 'move', 'story', 'forward', 'stopped', 'reading', 'seventh', 'chapter'], features=DenseVector([0.0249, 0.4134, 0.066]), label=4.0, rawPrediction=DenseVector([2.2736, 0.806, -0.483, -1.1527, -1.4439]), probability=DenseVector([0.7403, 0.1706, 0.047, 0.0241, 0.018]), prediction=0.0, predictedScore='5')]

In [63]:
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

0.670660555402759

<h4>Cross-validation</h4>

In [64]:
lr_w2v = LogisticRegression(maxIter=20, regParam=0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr_w2v.regParam, [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_w2v.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv_w2v = CrossValidator(estimator=lr_w2v, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_w2v = cv_w2v.fit(dataset_w2v)

predictions_w2v = cvModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

# Evaluate best model
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

0.6666906585339133

In [65]:
best_model_w2v = cvModel_w2v.bestModel
best_model_w2v

LogisticRegressionModel: uid = LogisticRegression_734075164c93, numClasses = 5, numFeatures = 3

In [66]:
best_reg_param_w2v = best_model_w2v._java_obj.getRegParam()
best_elasticnet_param_w2v = best_model_w2v._java_obj.getElasticNetParam()
print(best_reg_param_w2v);print(best_elasticnet_param_w2v)

0.01
0.6


In [67]:
predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Lies My Doctor Told Me Seco...|[0.956186641694379,0.017261...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9522357241955557,0.01816...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9473683604019182,0.02109...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9422775268241106,0.02403...|           5|             5|
|QAnon: An Invitation to The...|[0.9292004784338718,0.03861...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9275775703652407,0.03081...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9264838176022595,0.03047...|           4|             5|
|Lies My Doctor Told Me Seco...|[0.9205518337626275,0.03612...|           5|    

<h3> Naive Bayes Word2Vec</h3>

In [68]:
nb_w2v = NaiveBayes(smoothing=1)

model_w2v = nb_w2v.fit(dataset_w2v)
predictions_w2v = model_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

Py4JJavaError: An error occurred while calling o134276.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 22972.0 failed 1 times, most recent failure: Lost task 0.0 in stage 22972.0 (TID 23568, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.012262186447185451,0.22753394624906714,-0.028704270666492157].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:150)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:176)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:129)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:183)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:183)
	at org.apache.spark.ml.classification.NaiveBayes.trainWithLabelCheck(NaiveBayes.scala:129)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:118)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:78)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.GeneratedMethodAccessor336.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.012262186447185451,0.22753394624906714,-0.028704270666492157].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:150)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:121)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

<h3>Random Forest Word2Vec</h3>

In [70]:
rf_w2v = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel_w2v = rf_w2v.fit(dataset_w2v)
predictions_w2v = rfModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|Lies My Doctor Told Me Seco...|[0.851404883414099,0.095432...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.851404883414099,0.095432...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.851265785631412,0.095919...|           4|             5|
|Lies My Doctor Told Me Seco...|[0.851265785631412,0.095919...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.851265785631412,0.095919...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.851265785631412,0.095919...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8510519422376285,0.09554...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.8510519422376285,0.09554...|           5|    

In [71]:
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

0.6729005535062502

<h4> Crossvalidation </h4>

In [72]:
rf_w2v = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf_w2v.numTrees, [100, 200, 500]) # regularization parameter
             .addGrid(rf_w2v.maxDepth, [4, 10, 20]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv_w2v = CrossValidator(estimator=rf_w2v, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel_w2v = cv_w2v.fit(dataset_w2v)

predictions_w2v = cvModel_w2v.transform(test_dataset_w2v)
predictions_w2v = labelConverter.transform(predictions_w2v) # Transform labels

# Evaluate best model
evaluator_w2v = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator_w2v.evaluate(predictions_w2v)

0.6807749530095831

In [73]:
best_model_w2v = cvModel_w2v.bestModel
best_model_w2v

RandomForestClassificationModel (uid=RandomForestClassifier_340b41819664) with 200 trees

In [74]:
best_numTrees_w2v = best_model_w2v.getNumTrees
best_maxDepth_w2v = best_model_w2v.getOrDefault('maxDepth')
print(best_numTrees_w2v);print(best_maxDepth_w2v)

200
10


In [75]:
predictions_w2v.filter(predictions_w2v['prediction'] == 0) \
    .select("text","probability","review_score","predictedScore") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------------+--------------+
|                          text|                   probability|review_score|predictedScore|
+------------------------------+------------------------------+------------+--------------+
|The Mueller Report Trump Su...|[0.9690421658765567,0.02146...|           5|             5|
|A Gentleman in Moscow: A No...|[0.9675726079180089,0.02606...|           5|             5|
|The Longevity Solution: Red...|[0.967295200535754,0.021602...|           5|             5|
|Clean & Lean: 30 Days, 30 F...|[0.9652177755134765,0.02572...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9646743305732678,0.02642...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9646743305732678,0.02642...|           5|             5|
|QAnon: An Invitation to The...|[0.9644163712868223,0.02289...|           5|             5|
|Lies My Doctor Told Me Seco...|[0.9639520820527009,0.02516...|           5|    

In [76]:
predictions_w2v.head(1)

[Row(review_score=1, text="A Gentleman in Moscow: A Novel Too slow This has to be one of the most boring books I've read.  It takes chapters upon chapters to move the story forward.  I stopped reading after then seventh chapter", words=['a', 'gentleman', 'in', 'moscow', 'a', 'novel', 'too', 'slow', 'this', 'has', 'to', 'be', 'one', 'of', 'the', 'most', 'boring', 'books', 'i', 've', 'read', 'it', 'takes', 'chapters', 'upon', 'chapters', 'to', 'move', 'the', 'story', 'forward', 'i', 'stopped', 'reading', 'after', 'then', 'seventh', 'chapter'], filtered=['gentleman', 'moscow', 'novel', 'slow', 'one', 'boring', 'books', 'read', 'takes', 'chapters', 'upon', 'chapters', 'move', 'story', 'forward', 'stopped', 'reading', 'seventh', 'chapter'], features=DenseVector([0.0249, 0.4134, 0.066]), label=4.0, rawPrediction=DenseVector([139.1004, 37.6634, 16.2856, 3.5282, 3.4224]), probability=DenseVector([0.6955, 0.1883, 0.0814, 0.0176, 0.0171]), prediction=0.0, predictedScore='5')]