<h3>Importing the data</h3>

In [132]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', 
                                                                  inferschema='true', 
                                                                  quote = '"', 
                                                                  escape = '"',
                                                                 multiline = 'true',
                                                                 ignoreTrailingWhiteSpace = 'true').load('Data\\data.csv')

# There were some problems reading the data, here I found the solutions
# https://stackoverflow.com/questions/40413526/reading-csv-files-with-quoted-fields-containing-embedded-commas
#https://stackoverflow.com/questions/50477857/spark-fails-to-read-csv-when-last-column-name-contains-spaces

In [133]:
#drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
#data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)

+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  X|          book_title|        review_title|   review_user|   book_id|     review_id| timestamp|         review_text|review_score|
+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+
|  1|A Gentleman in Mo...|Russian aristocra...|    Kansabelle|0143110438|R2UFCQ9WES7VFH|1555241537|A great read. In ...|           4|
|  2|A Gentleman in Mo...|Knowing nothing a...|  D.P. McHenry|0143110438|R24B1HA9J9I99G|1555241542|Great story, well...|           4|
|  3|Pet Sematary: A N...|One of King's fin...|Gordon Hoffman|198211598X|R1P137WFADSBYR|1555241649|Only the second n...|           4|
|  4|Less (Winner of t...|     Not my favorite|     R. Zocher|0316316121|R35533AKR5CBNS|1555242044|This book is t wh...|           4|
|  5|         Supermarket|      AMAZING, BOBBY|    D. Mahoney|

In [134]:
data.printSchema()

root
 |-- X: integer (nullable = true)
 |-- book_title: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- review_user: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- review_score: integer (nullable = true)



In [135]:
from pyspark.sql.functions import col

data.groupBy("review_score") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------------+-----+
|review_score|count|
+------------+-----+
|           5| 3236|
|           4|  598|
|           3|  187|
|           1|  110|
|           2|  108|
+------------+-----+



In [136]:
data.count()

4239

In [144]:
#data.filter("review_score is NULL").show() # No nulls anymore
#data.filter("X is NULL").show() 
#data.filter("book_title is NULL").show()
#data.filter("review_title is NULL").show()
#data.filter("review_user is NULL").show()
#data.filter("book_id is NULL").show()
#data.filter("review_id is NULL").show()
#data.filter("timestamp is NULL").show()
data.filter("review_text is NULL").show() # There is one null
#data.filter("review_score is NULL").show()

+---+----------+------------+-----------+-------+---------+---------+-----------+------------+
|  X|book_title|review_title|review_user|book_id|review_id|timestamp|review_text|review_score|
+---+----------+------------+-----------+-------+---------+---------+-----------+------------+
+---+----------+------------+-----------+-------+---------+---------+-----------+------------+



In [149]:
# Remove observation where review_text is null
data = data.na.drop(subset=["review_text"])

In [150]:
data.count() # Removed!

4238

<h3>Now the modelling pipeline starts</h3>
I got it from: https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [152]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="review_text", outputCol="words", pattern="\\W")
# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [156]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

label_stringIdx = StringIndexer(inputCol = "review_score", outputCol = "label") 
# In this line above, the encoding of the target variable changed:
# 5 is now 0
# 4 is now 1
# 3 is now 2
# 1 is now 3
# 2 is now 4
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+--------------------+--------------------+--------------------+-----+
|  X|          book_title|        review_title|   review_user|   book_id|     review_id| timestamp|         review_text|review_score|               words|            filtered|            features|label|
+---+--------------------+--------------------+--------------+----------+--------------+----------+--------------------+------------+--------------------+--------------------+--------------------+-----+
|  1|A Gentleman in Mo...|Russian aristocra...|    Kansabelle|0143110438|R2UFCQ9WES7VFH|1555241537|A great read. In ...|           4|[a, great, read, ...|[a, great, read, ...|(4185,[1,2,3,4,7,...|  1.0|
|  2|A Gentleman in Mo...|Knowing nothing a...|  D.P. McHenry|0143110438|R24B1HA9J9I99G|1555241542|Great story, well...|           4|[great, story, we...|[great, story, we...|(4185,[11,28,

In [158]:
# Show how the encoding changed
dataset.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show() 

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 3235|
|  1.0|  598|
|  2.0|  187|
|  3.0|  110|
|  4.0|  108|
+-----+-----+



In [159]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 12345)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 3333
Test Dataset Count: 905


<h3>Logistic regression using count vector features</h3>

In [162]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                   review_text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|A Gentleman in Moscow was a...|           5|[0.997549675457473,0.002126...|  0.0|       0.0|
|When We Left Cuba by Chanel...|           5|[0.9941138638357997,0.00423...|  0.0|       0.0|
|Reviewed by Sharon Thérèse ...|           5|[0.9941014451392723,0.00107...|  0.0|       0.0|
|If you're in need of cake, ...|           5|[0.9935674806273247,0.00349...|  0.0|       0.0|
|I've been following Stella ...|           5|[0.9924354394575713,0.00361...|  0.0|       0.0|
|If you don't find yourself ...|           5|[0.992276189543571,0.005088...|  0.0|       0.0|
|Received my copy yesterday,...|           5|[0.9890898400637897,0.00521...|  0.0|       0.0|
|I believe this book will be...|           5|[0.987431961763

In [166]:
predictions.head(1)

[Row(X=5, book_title='Supermarket', review_title='AMAZING, BOBBY', review_user='D. Mahoney', book_id='1982127139', review_id='R1D6LXSDR0CRMN', timestamp=1555242169, review_text="As someone who hates reading and never really got into any books in school, this book is a masterpiece. Don't think, just buy it asap. You won't regret it.", review_score=4, words=['as', 'someone', 'who', 'hates', 'reading', 'and', 'never', 'really', 'got', 'into', 'any', 'books', 'in', 'school', 'this', 'book', 'is', 'a', 'masterpiece', 'don', 't', 'think', 'just', 'buy', 'it', 'asap', 'you', 'won', 't', 'regret', 'it'], filtered=['as', 'someone', 'who', 'hates', 'reading', 'and', 'never', 'really', 'got', 'into', 'any', 'books', 'in', 'school', 'this', 'book', 'is', 'a', 'masterpiece', 'don', 'think', 'just', 'buy', 'it', 'asap', 'you', 'won', 'regret', 'it'], features=SparseVector(4185, {0: 1.0, 3: 1.0, 5: 2.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 14: 1.0, 18: 1.0, 43: 1.0, 50: 1.0, 59: 1.0, 65: 1.0, 69: 1.0, 70:

In [167]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) # This is the accuracy

0.7115821172137888

<h3>Logistic Regression using TF-IDF Features</h3>

In [168]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 12345)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                   review_text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|A Gentleman in Moscow was a...|           5|[0.9977097070609728,0.00180...|  0.0|       0.0|
|We humans like to think we ...|           5|[0.9962329444391925,0.00187...|  0.0|       0.0|
|When We Left Cuba by Chanel...|           5|[0.9944781428791521,0.00446...|  0.0|       0.0|
|If you're in need of cake, ...|           5|[0.9942286540178279,0.00231...|  0.0|       0.0|
|Burnout by Emily (PhD) & Am...|           5|[0.9939724071927272,0.00186...|  0.0|       0.0|
|What's great:* Expanding th...|           4|[0.9922029187079426,0.00246...|  1.0|       0.0|
|Rachel Hollis did, yet agai...|           5|[0.989122019507554,0.006504...|  0.0|       0.0|
|I really enjoyed this book ...|           5|[0.989101589600

In [169]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7117679410509941

<h3>Cross-validation</h3>

In [170]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 12345)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7221962047615811

<h3> Naive Bayes </h3>

In [171]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                   review_text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|When We Left Cuba by Chanel...|           5|[1.0,6.594524434807532E-20,...|  0.0|       0.0|
|I really enjoyed this book ...|           5|[1.0,3.4941127111910437E-20...|  0.0|       0.0|
|We humans like to think we ...|           5|[1.0,1.5877606440176648E-24...|  0.0|       0.0|
|I love reading about health...|           4|[0.9999999999999998,2.23135...|  1.0|       0.0|
|Let's be honest. There is N...|           5|[0.9999999999999993,6.58594...|  0.0|       0.0|
|The voice acting in this bo...|           5|[0.9999999999999984,1.51433...|  0.0|       0.0|
|I knew right away when I sa...|           5|[0.9999999999999918,8.32501...|  0.0|       0.0|
|If you enjoyed the historic...|           4|[0.999999999999

In [172]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7299769820931572

<h3>Random Forest</h3>

In [174]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_text","review_score","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------+------------------------------+-----+----------+
|                   review_text|review_score|                   probability|label|prediction|
+------------------------------+------------+------------------------------+-----+----------+
|Loved this story. Beautiful...|           4|[0.7836402391282297,0.12735...|  1.0|       0.0|
|Very informative about Keto...|           5|[0.7836402391282297,0.12735...|  0.0|       0.0|
|One of the most enjoyable b...|           5|[0.7836402391282297,0.12735...|  0.0|       0.0|
|                Wonderful book|           4|[0.7836402391282297,0.12735...|  1.0|       0.0|
|Love the easy to follow roa...|           5|[0.7830368095617698,0.12784...|  0.0|       0.0|
|Poignant and very satisfyin...|           5|[0.7828176595848821,0.12763...|  0.0|       0.0|
|Wonderful book. Beautiful p...|           5|[0.7827614778707869,0.12824...|  0.0|       0.0|
|Well written,wonderful char...|           5|[0.782761477870

In [175]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6777035552331478