In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.estointernet.in/apache/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf /content/spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


!curl -L http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz -o data.json.gz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2402k  100 2402k    0     0  5123k      0 --:--:-- --:--:-- --:--:-- 5112k


In [None]:
data = spark.read.json('data.json.gz')
data = data.select('overall','reviewText')
data = data.withColumnRenamed('overall', 'label')
data.show(5)


+-----+--------------------+
|label|          reviewText|
+-----+--------------------+
|  5.0|Not much to write...|
|  5.0|The product does ...|
|  5.0|The primary job o...|
|  5.0|Nice windscreen p...|
|  5.0|This pop filter i...|
+-----+--------------------+
only showing top 5 rows



In [None]:
from pyspark.sql.functions import col

# by top 20 categories
data.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-----+-----+
|label|count|
+-----+-----+
|  5.0| 6938|
|  4.0| 2084|
|  3.0|  772|
|  2.0|  250|
|  1.0|  217|
+-----+-----+



In [None]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

In [None]:
dataset.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|          reviewText|               words|            filtered|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  5.0|Not much to write...|[not, much, to, w...|[not, much, to, w...|(6241,[3,4,5,10,1...|
|  5.0|The product does ...|[the, product, do...|[product, does, e...|(6241,[0,1,2,3,4,...|
|  5.0|The primary job o...|[the, primary, jo...|[primary, job, of...|(6241,[1,2,3,4,5,...|
|  5.0|Nice windscreen p...|[nice, windscreen...|[nice, windscreen...|(6241,[2,4,5,9,10...|
|  5.0|This pop filter i...|[this, pop, filte...|[this, pop, filte...|(6241,[1,2,3,7,8,...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 8245
Test Dataset Count: 2016


### Logistic Regression using Count Vector Features

In [None]:
# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Train model with Training Data
lrModel = lr.fit(trainingData)

In [None]:
predictions = lrModel.transform(testData)
predictions.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|          reviewText|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|A dull, inept ver...|[a, dull, inept, ...|[a, dull, inept, ...|(6238,[0,1,2,5,16...|[-5.5048086389689...|[1.61112895811618...|       5.0|
|  1.0|As soon as I atta...|[as, soon, as, i,...|[as, soon, as, i,...|(6238,[0,1,2,3,4,...|[-5.5122729131854...|[1.15381763174392...|       5.0|
|  1.0|Behringer pedals ...|[behringer, pedal...|[behringer, pedal...|(6238,[0,1,2,3,4,...|[-5.5118898575513...|[1.58625180288567...|       5.0|
|  1.0|Cheap crappy stra...|[cheap, crappy, s...|[cheap, crappy, s...|(6238,[0,1,3,6,7,...|[-5.5064983128512...|[1.22545647953306.

In [None]:
predictions.groupBy("prediction") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+----------+-----+
|prediction|count|
+----------+-----+
|       5.0| 1854|
|       4.0|  138|
|       3.0|   17|
|       1.0|    4|
|       2.0|    3|
+----------+-----+



AnalysisException: ignored

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5850292335286155

### Logistic Regression using TF-IDF Features

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Add HashingTF and IDF to transformation
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

# Redo Pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf])

In [None]:
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Train model with Training Data
lrModel = lr.fit(trainingData)

In [None]:
predictions = lrModel.transform(testData)

predictions.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|          reviewText|               words|            filtered|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|A dull, inept ver...|[a, dull, inept, ...|[a, dull, inept, ...|(10000,[281,307,5...|(10000,[281,307,5...|[-5.3870118248319...|[1.02401779146080...|       5.0|
|  1.0|As soon as I atta...|[as, soon, as, i,...|[as, soon, as, i,...|(10000,[30,307,32...|(10000,[30,307,32...|[-5.3957233665731...|[1.53910898532990...|       5.0|
|  1.0|At the time I bou...|[at, the, time, i...|[at, time, i, bou...|(10000,[66,73,307...|(10000,[66,73,307...|[-5.4046419324577...|[9.39656181457770...|       5.0|
|  1

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5809867151843398

### Cross Validation

In [None]:
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])

pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
            #  .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
            #  .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)
# this will likely take a fair amount of time because of the amount of models that we're creating and testing

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6026977407505653

### Naive Bayes

In [None]:
from pyspark.ml.classification import NaiveBayes

# create the trainer and set its parameters
nb = NaiveBayes(smoothing=1)

# train the model
model = nb.fit(trainingData)

In [None]:
predictions = model.transform(testData)
predictions.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|          reviewText|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|A dull, inept ver...|[a, dull, inept, ...|[a, dull, inept, ...|(6238,[0,1,2,5,16...|[-168.93197062592...|[0.04871653546620...|       4.0|
|  1.0|As soon as I atta...|[as, soon, as, i,...|[as, soon, as, i,...|(6238,[0,1,2,3,4,...|[-484.07448249759...|[4.13149408129242...|       3.0|
|  1.0|At the time I bou...|[at, the, time, i...|[at, time, i, bou...|(6238,[0,1,2,3,4,...|[-740.75543516216...|[2.48428820288274...|       2.0|
|  1.0|Behringer pedals ...|[behringer, pedal...|[behringer, pedal...|(6238,[0,1,2,3,4,...|[-476.47381701631...|[8.29781727370592.

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.07800250135527137

### Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Create an initial RandomForest model.
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Training Data
rfModel = rf.fit(trainingData)

In [None]:
predictions = rfModel.transform(testData)

predictions.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|          reviewText|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  1.0|A dull, inept ver...|[a, dull, inept, ...|[a, dull, inept, ...|(6238,[0,1,2,5,16...|[0.0,2.1158318980...|[0.0,0.0211583189...|       5.0|
|  1.0|As soon as I atta...|[as, soon, as, i,...|[as, soon, as, i,...|(6238,[0,1,2,3,4,...|[0.0,2.0609565502...|[0.0,0.0206095655...|       5.0|
|  1.0|At the time I bou...|[at, the, time, i...|[at, time, i, bou...|(6238,[0,1,2,3,4,...|[0.0,2.2863737863...|[0.0,0.0228637378...|       5.0|
|  1.0|Behringer pedals ...|[behringer, pedal...|[behringer, pedal...|(6238,[0,1,2,3,4,...|[0.0,2.0961344066...|[0.0,0.0209613440.

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5428979485562693

### Cross Validation

In [None]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [50, 100, 200]) # number of trees
             .addGrid(rf.maxDepth, [3, 4, 5]) # maximum depth
#            .addGrid(rf.maxBins, [24, 32, 40]) #Number of bins
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

# Run cross validations
cvModel = cv.fit(trainingData)

# Use test set here so we can measure the accuracy of our model on new data
predictions = cvModel.transform(testData)

In [None]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5428979485562693