In [52]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz

!pip install -q findspark
!pip install pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from  pyspark.ml.feature import CountVectorizer

from pyspark.sql.functions import isnan, when, count, col

In [0]:
JSON_PATH = "/content/gdrive/My Drive/Colab Datasets/Amazon_Instant_Video_5.json"
APP_NAME = "Amazon Reviews Sentiment analysis"
SPARK_URL = "local[*]"
RANDOM_SEED = 141107
TRAINING_DATA_RATIO = 0.8
RF_NUM_TREES = 10
RF_MAX_DEPTH = 4
RF_NUM_BINS = 32

In [0]:
spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()
df = spark.read.options(inferschema = "true").json(JSON_PATH)

In [56]:
df_pandas = df.toPandas()
df_pandas.head(5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000H00VBQ,"[0, 0]",2.0,I had big expectations because I love English ...,"05 3, 2014",A11N155CW1UV02,AdrianaM,A little bit boring for me,1399075200
1,B000H00VBQ,"[0, 0]",5.0,I highly recommend this series. It is a must f...,"09 3, 2012",A3BC8O2KCL29V2,Carol T,Excellent Grown Up TV,1346630400
2,B000H00VBQ,"[0, 1]",1.0,This one is a real snoozer. Don't believe anyt...,"10 16, 2013",A60D5HQFOTSOM,"Daniel Cooper ""dancoopermedia""",Way too boring for me,1381881600
3,B000H00VBQ,"[0, 0]",4.0,Mysteries are interesting. The tension betwee...,"10 30, 2013",A1RJPIGRSNX4PW,"J. Kaplan ""JJ""",Robson Green is mesmerizing,1383091200
4,B000H00VBQ,"[1, 1]",5.0,"This show always is excellent, as far as briti...","02 11, 2009",A16XRPF40679KG,Michael Dobey,Robson green and great writing,1234310400


In [57]:
df_pandas.shape

(37126, 9)

In [58]:
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [59]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
asin,37126,,,B000H00VBQ,B00LPWPMCS
overall,37126,4.209529709637451,1.1185496668776904,1.0,5.0
reviewText,37126,,,"""13 Assassins"" is a remake of a 1963 film, upd...",~Why only ten episodes? This is the first fant...
reviewTime,37126,,,"01 1, 2007","12 9, 2013"
reviewerID,37126,,,A0705654XT5UCAYOY7TH,AZXS6P5QWNMLC
reviewerName,36797,,,"Leah ""Leah""",zzdb
summary,37126,1.4428786142857144E7,3.8174361192681804E7,!!!!,~HUGE FAN HERE!
unixReviewTime,37126,1.3767946516403599E9,3.054958450744007E7,975456000,1406073600


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn

In [61]:
#3and below --ve
from pyspark.sql import functions as F
df = df.withColumn("label", F.when(F.col("overall")>=3, 1).otherwise(0))
df.show()

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+-----+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|label|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+-----+
|B000H00VBQ|  [0, 0]|    2.0|I had big expecta...| 05 3, 2014|A11N155CW1UV02|            AdrianaM|A little bit bori...|    1399075200|    0|
|B000H00VBQ|  [0, 0]|    5.0|I highly recommen...| 09 3, 2012|A3BC8O2KCL29V2|             Carol T|Excellent Grown U...|    1346630400|    1|
|B000H00VBQ|  [0, 1]|    1.0|This one is a rea...|10 16, 2013| A60D5HQFOTSOM|Daniel Cooper "da...|Way too boring fo...|    1381881600|    0|
|B000H00VBQ|  [0, 0]|    4.0|Mysteries are int...|10 30, 2013|A1RJPIGRSNX4PW|      J. Kaplan "JJ"|Robson Green is m...|    1383091200|    1|
|B000H00VBQ| 

In [62]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StopWordsRemover

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filteredReviewText")
cv = CountVectorizer(inputCol="filteredReviewText", outputCol="features", vocabSize=300)

cv_transformer = Pipeline(stages=[tokenizer, remover, cv]).fit(df).transform(df)
cv_transformer.show()

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+-----+--------------------+--------------------+--------------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|label|               words|  filteredReviewText|            features|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+-----+--------------------+--------------------+--------------------+
|B000H00VBQ|  [0, 0]|    2.0|I had big expecta...| 05 3, 2014|A11N155CW1UV02|            AdrianaM|A little bit bori...|    1399075200|    0|[i, had, big, exp...|[big, expectation...|(300,[7,8,123,181...|
|B000H00VBQ|  [0, 0]|    5.0|I highly recommen...| 09 3, 2012|A3BC8O2KCL29V2|             Carol T|Excellent Grown U...|    1346630400|    1|[i, highly, recom...|[highly, recommen...|(3

In [63]:
(trainingData, testData) = cv_transformer.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 26063
Test Dataset Count: 11063


In [65]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Fit the model
lsvcModel = lsvc.fit(trainingData["features","label"])
predictions = lsvcModel.transform(testData["features","label"])
predictions.show()

+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|(300,[0,1,2,4,6,1...|    1|[-4.0173416654013...|       1.0|
|(300,[0,3,15,24,3...|    1|[-1.0375220163082...|       1.0|
|(300,[0,26,36,39,...|    1|[-0.1542706798043...|       1.0|
|(300,[1,5,6,40,63...|    1|[-1.3063160857211...|       1.0|
|(300,[35,108,118,...|    1|[-0.9258748141492...|       1.0|
|(300,[17,40,146,1...|    1|[-1.2499991501117...|       1.0|
|(300,[0,5,10,65,7...|    1|[-1.4395543838246...|       1.0|
|(300,[6,8,16,18,4...|    1|[-1.4510323109307...|       1.0|
|(300,[56,88,96,14...|    1|[-0.8534146386072...|       1.0|
|(300,[39,75,79,10...|    1|[-1.3863830729098...|       1.0|
|(300,[1,14,22,27,...|    1|[-1.0091194362847...|       1.0|
|(300,[0,5,60,112,...|    1|[-1.4031373523216...|       1.0|
|(300,[71,84,118],...|    1|[-0.9278216015808...|       1.0|
|(300,[0,1,2,3,4,6...|  

In [74]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
accuracy = evaluator.evaluate(predictions)
print("Model Accuracy: ", accuracy)

Model Accuracy:  0.800827638727323
