**Final project for Big Data: analysis on reviews of gourmet food purchases on Amazon**

Pyspark configuration

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
from bs4 import BeautifulSoup
import requests

In [0]:
url = 'https://downloads.apache.org/spark/' 
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc)

In [4]:
link_files = []
for link in soup.find_all('a'):
  link_files.append(link.get('href'))
spark_link = [x for x in link_files if 'spark' in x]  
print(spark_link)

['spark-2.3.4/', 'spark-2.4.5/', 'spark-3.0.0-preview2/']


In [5]:
ver_spark = spark_link[1][:-1]
print(ver_spark)

spark-2.4.5


In [0]:
import os
link = "https://www-us.apache.org/dist/spark/"
os.system(f"wget -q {link}{ver_spark}/{ver_spark}-bin-hadoop2.7.tgz")
os.system(f"tar xf {ver_spark}-bin-hadoop2.7.tgz")


!pip install -q pyspark

In [0]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{ver_spark}-bin-hadoop2.7"

In [0]:
from pyspark.sql import SparkSession
import os
MAX_MEMORY = "5g"
spark = SparkSession.builder \
                    .appName('Test_spark').master("local[*]")\
                    .config("spark.executor.memory", MAX_MEMORY) \
                    .config("spark.driver.memory", MAX_MEMORY) \
                    .getOrCreate()

Download Amazon Grocery and Gourmet Food Reviews

In [9]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz

--2020-05-29 00:24:35--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Grocery_and_Gourmet_Food_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 146631394 (140M) [application/octet-stream]
Saving to: ‘Grocery_and_Gourmet_Food_5.json.gz’


2020-05-29 00:24:43 (18.9 MB/s) - ‘Grocery_and_Gourmet_Food_5.json.gz’ saved [146631394/146631394]



In [0]:
df = spark.read.json("Grocery_and_Gourmet_Food_5.json.gz")

Printing the scheme and seeing the columns of the dataset

In [11]:
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Design:: string (nullable = true)
 |    |-- Edition:: string (nullable = true)
 |    |-- Flavor Name:: string (nullable = true)
 |    |-- Flavor:: string (nullable = true)
 |    |-- Format:: string (nullable = true)
 |    |-- Item Display Weight:: string (nullable = true)
 |    |-- Item Package Quantity:: string (nullable = true)
 |    |-- Number of Items:: string (nullable = true)
 |    |-- Package Quantity:: string (nullable = true)
 |    |-- Package Type:: string (nullable = true)
 |    |-- Product Packaging:: string (nullable = true)
 |    |-- Scent Name:: string (n

We can show the first rows

In [12]:
df.show()

+----------+-----+-------+--------------------+-----------+--------------+-----------------+-----+--------------------+--------------+--------+----+
|      asin|image|overall|          reviewText| reviewTime|    reviewerID|     reviewerName|style|             summary|unixReviewTime|verified|vote|
+----------+-----+-------+--------------------+-----------+--------------+-----------------+-----+--------------------+--------------+--------+----+
|4639725183| null|    5.0| No adverse comment.|11 19, 2014|A1QVBUH9E1V6I8|   Jamshed Mathur| null|          Five Stars|    1416355200|    true|null|
|4639725183| null|    5.0|Gift for college ...|10 13, 2016|A3GEOILWLK86XM|        itsjustme| null|      Great product.|    1476316800|    true|null|
|4639725183| null|    5.0|If you like stron...|11 21, 2015|A32RD6L701BIGP|  Krystal Clifton| null|              Strong|    1448064000|    true|null|
|4639725183| null|    5.0|Love the tea. The...|08 12, 2015|A2UY1O1FBGKIE6|          U. Kane| null|        

In [13]:
df_dataset = df.limit(3)
df_dataset.toPandas()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,4639725183,,5.0,No adverse comment.,"11 19, 2014",A1QVBUH9E1V6I8,Jamshed Mathur,,Five Stars,1416355200,True,
1,4639725183,,5.0,Gift for college student.,"10 13, 2016",A3GEOILWLK86XM,itsjustme,,Great product.,1476316800,True,
2,4639725183,,5.0,"If you like strong tea, this is for you. It mi...","11 21, 2015",A32RD6L701BIGP,Krystal Clifton,,Strong,1448064000,True,


Describing the columns

In [14]:
df_describe = df.describe().toPandas().transpose()
df_describe

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
asin,1143860,8.022473689795918E9,2.3749861105498543E9,4639725043,B01HJF6FRA
overall,1143860,4.442208836745755,1.0639007882174811,1.0,5.0
reviewText,1143470,2.7032967032967035,10.389625707614412,\n\n\n\n\n\nI,~Who doesn't like Oreos? Great product~
reviewTime,1143860,,,"01 1, 2007","12 9, 2017"
reviewerID,1143860,,,A0096681Y127OL1H8W3U,AZZZ5UJWUVCYZ
reviewerName,1143722,1.407770617721789E8,5.76922280597008E8,funfunfunfun,~~Trish~~
summary,1143641,36.583333333333336,148.125289701912,,~~~~~~~~~~~ The Stinking Rose
unixReviewTime,1143860,1.4417709420271711E9,5.969109372305015E7,965779200,1538438400
vote,158202,6.631209977685486,18.01094167062252,1060,99


We want to analize reviews and overall so we focus in this two columns

In [0]:
df_sort = df.sort("reviewText")

In [16]:
df_sort.head()

Row(asin='B0000CNU0C', image=None, overall=5.0, reviewText=None, reviewTime='12 12, 2016', reviewerID='A29IPYI7CGEM70', reviewerName='Amazon Customer', style=None, summary='Five Stars', unixReviewTime=1481500800, verified=True, vote=None)

In [17]:
df_dataset = df_sort.limit(20)
df_dataset.toPandas()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,B0006OCZ4E,,5.0,,"10 7, 2015",A2B9A7UJSX2SYA,Emma V. Aguilar,"(None, None, None, None, None, None, None, Non...",Five Stars,1444176000,True,
1,B0007LXU0Y,,5.0,,"03 29, 2015",APA3RAMHRYXAQ,Carolyn Mineo,"(None, None, None, None, Chocolate Almond and...",Five Stars,1427587200,True,
2,B0000CNU0C,,5.0,,"12 12, 2016",A29IPYI7CGEM70,Amazon Customer,,Five Stars,1481500800,True,
3,B00099XOVO,,5.0,,"07 7, 2015",A1JRZK0W6SILYS,Memete,,Five Stars,1436227200,True,
4,B00061EOVO,,5.0,,"11 11, 2016",AEFTYRHRFJFOF,Kindle Customer,,Five Stars,1478822400,True,
5,B00099XOQO,,5.0,,"06 24, 2015",A1JRZK0W6SILYS,Memete,,Five Stars,1435104000,True,
6,B0001DMTPU,,5.0,,"10 7, 2015",A2B9A7UJSX2SYA,Emma V. Aguilar,"(None, None, None, None, None, None, None, Non...",Five Stars,1444176000,True,
7,B0009F3PM6,,5.0,,"04 4, 2018",A2YA1ACC6QYBIP,TCSS,"(None, None, None, None, Linden Flower with H...",,1522800000,True,
8,B0001UXQ9Q,[https://images-na.ssl-images-amazon.com/image...,5.0,,"05 8, 2016",A2YKKMPOZSU08Y,Amazon Customer,"(None, None, None, None, None, None, None, Non...",Good,1462665600,True,4.0
9,B0009F3SC8,,5.0,,"05 31, 2016",A2RDG3NV6H0214,Dunia72,"(None, None, None, None, Green Tea Blueberry ...",Five Stars,1464652800,True,


At this point we realize that column reviewText have a lot of nulls and \\n 

First we count nulls in reviewText

In [18]:
df.where(df.reviewText.isNull()).count()

390

Dropping nulls from reviewText

In [0]:
df_new = df.na.drop(subset=["reviewText"])

In [20]:
df_new.where(df.reviewText.isNull()).count()

0

Replacing \\n for '  '

In [0]:
from pyspark.sql.functions import *
df_new = df_new.withColumn('reviewText', regexp_replace('reviewText', '\n', ' '))

Deleting empty left spaces and empty right spaces

In [0]:
import pyspark.sql.functions as func
df_new = df_new.withColumn('reviewText', func.ltrim(func.rtrim(df_new['reviewText'])))

Describing data after treatment

In [23]:
df_describe = df_new.describe().toPandas().transpose()
df_describe

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
asin,1143470,8.022473689795918E9,2.3749861105498543E9,4639725043,B01HJF6FRA
overall,1143470,4.442092927667538,1.063964040922202,1.0,5.0
reviewText,1143470,2.7032967032967035,10.389625707614412,! ordered in past single cherries sugar coated...,~Who doesn't like Oreos? Great product~
reviewTime,1143470,,,"01 1, 2007","12 9, 2017"
reviewerID,1143470,,,A0096681Y127OL1H8W3U,AZZZ5UJWUVCYZ
reviewerName,1143332,1.4158923709394148E8,5.784877074682183E8,funfunfunfun,~~Trish~~
summary,1143292,36.583333333333336,148.125289701912,,~~~~~~~~~~~ The Stinking Rose
unixReviewTime,1143470,1.4417567933507657E9,5.969365688469212E7,965779200,1538438400
vote,158184,6.631509404140983,18.011930890747053,1060,99


Selecting reviewText column and show first rows

In [24]:
df_new.select("reviewText").distinct().show()

+--------------------+
|          reviewText|
+--------------------+
|Sugar Free - 19 E...|
|These colors are ...|
|My friends had us...|
|Would rather have...|
|Very small jar.  ...|
|UPDATE:-  I purch...|
|Easy to use at a ...|
|I love peanuts bu...|
|            Good tea|
|             Amazing|
|The Bonsai tree i...|
|Beautiful on arri...|
|I had recently ju...|
|Entirely differen...|
|Taste great, make...|
|perfect! fast shi...|
|   Outrageously good|
|The best butter b...|
|This is the only ...|
|Great pepper. Lot...|
+--------------------+
only showing top 20 rows



Working with reviewText and overall columns

In [25]:
data = df_new.rdd\
    .map(lambda x: (x["overall"], x["reviewText"]))\
    .toDF(["overall", "reviewText"])

data.show()

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0| No adverse comment.|
|    5.0|Gift for college ...|
|    5.0|If you like stron...|
|    5.0|Love the tea. The...|
|    5.0|I have searched e...|
|    4.0|Tea made with Lip...|
|    5.0|I love this tea! ...|
|    5.0|Discovered this t...|
|    4.0|Well I bought thi...|
|    5.0|We really like th...|
|    5.0|Hard to find in t...|
|    5.0|I make the best b...|
|    3.0|I have recently s...|
|    5.0|I like pretty muc...|
|    5.0|I was watching a ...|
|    3.0|it was ok, but it...|
|    5.0|Great taste use i...|
|    5.0|Best tea for my s...|
|    4.0|Good tea. Way bet...|
|    1.0|This tea looks li...|
+-------+--------------------+
only showing top 20 rows



It's time to clean up the data. We search for numbers, punctuation, multiple space, etc.

In [26]:
!pip install gensim



In [0]:
import gensim.parsing.preprocessing as gsp
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from gensim import utils


filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(x):
    s = x[1]
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return (x[0],s)

In [28]:
data.take(1)[0][1]

'No adverse comment.'

In [29]:
clean_text(data.take(1)[0])[1]

'advers comment'

Applying gensim function

In [30]:
cleaned_rdd = data.rdd.map(lambda x : clean_text(x))
cleaned_df = cleaned_rdd.toDF()
cleaned_df.show()

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|5.0|      advers comment|
|5.0| gift colleg student|
|5.0|like strong tea l...|
|5.0|love tea flavor w...|
|5.0|search brows amaz...|
|4.0|tea lipton yellow...|
|5.0|love tea okai hig...|
|5.0|discov tea local ...|
|4.0|bought tea malays...|
|5.0|like tea definit ...|
|5.0|hard exactli desc...|
|5.0|best brew ic tea ...|
|3.0|recent start drin...|
|5.0|like pretti lipto...|
|5.0|watch youtub vide...|
|3.0|tast like lipton ...|
|5.0|great tast us col...|
|5.0|best tea singl cu...|
|4.0|good tea wai bett...|
|1.0|tea look like cof...|
+---+--------------------+
only showing top 20 rows



Dropping duplicates after cleaning data

In [0]:
cleaned_df = cleaned_df.dropDuplicates()

In [0]:
cleaned_df = cleaned_df.dropDuplicates(['_2'])

In [33]:
cleaned_df.show()

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|5.0|kid love last lon...|
|5.0|gummi delici tast...|
|5.0|great tast easi u...|
|5.0|great subtl lime ...|
|5.0|  student love candi|
|5.0|great help make l...|
|5.0|start sai think l...|
|4.0|perfect classroom...|
|5.0|love coffe rich b...|
|3.0|clump lot sure cl...|
|5.0|favorit brand keu...|
|4.0|daughter like nat...|
|1.0|order husband cra...|
|3.0|   littl pricei tast|
|5.0|larg hand size bo...|
|5.0|like review grew ...|
|5.0|delici fresh flav...|
|4.0|    nice sweet snack|
|1.0|                clog|
|2.0|bought box bake b...|
+---+--------------------+
only showing top 20 rows



Split data (train and test)

In [0]:
train_df, test_df = cleaned_df.randomSplit([0.7, 0.3])

## Working in ML pipeline

Apache Spark provides Word2Vec that transforms each document into a vector using the average of all words in the document

First working with training data

In [0]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="_2", outputCol="tokens")
word2Vec = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")
pipeline = Pipeline(stages=[tokenizer,word2Vec])

In [0]:
model = pipeline.fit(train_df)

In [0]:
doc_train_df = model.transform(train_df)

In [38]:
doc_train_df.show()

+---+--------------------+--------------------+--------------------+
| _1|                  _2|              tokens|            features|
+---+--------------------+--------------------+--------------------+
|1.0|absolut underwhel...|[absolut, underwh...|[-0.0516571965513...|
|1.0|acquir tast could...|[acquir, tast, co...|[-0.0368508747778...|
|1.0|amaz review read ...|[amaz, review, re...|[-0.0665831222237...|
|1.0|amazon return ite...|[amazon, return, ...|[-0.0779496141207...|
|1.0|arriv complet sta...|[arriv, complet, ...|[-0.0427331910891...|
|1.0|         arriv crumb|      [arriv, crumb]|[-0.1141674779355...|
|1.0|artifici color ho...|[artifici, color,...|[-0.0904802464480...|
|1.0|assum marmit supp...|[assum, marmit, s...|[-0.0936670112423...|
|1.0|         at got sick|     [at, got, sick]|[-0.0845449590124...|
|1.0|aw tast sage look...|[aw, tast, sage, ...|[-0.1058548859913...|
|1.0|bad tast like ras...|[bad, tast, like,...|[-0.0269948372927...|
|1.0|bewar contamin fo...|[bewar, 

Working with test data

In [0]:
tokenizerT = Tokenizer(inputCol="_2", outputCol="tokens")
word2VecT = Word2Vec(vectorSize=300, minCount=0, inputCol="tokens", outputCol="features")
pipelineT = Pipeline(stages=[tokenizerT,word2VecT])

In [0]:
modelT = pipelineT.fit(test_df)

In [0]:
doc_test_df = modelT.transform(test_df)

Random Forest Model

In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf_classifier=RandomForestClassifier(labelCol="_1", featuresCol="features")
rf_predictions=rf_classifier.fit(doc_train_df).transform(doc_test_df)

rf_model_evaluator=MulticlassClassificationEvaluator(labelCol="_1", predictionCol="prediction", metricName="accuracy")

In [48]:
accuracy = rf_model_evaluator.evaluate(rf_predictions)
print("Accuracy = %g" % (accuracy))

Accuracy = 0.691443


Logistic Regression Model

In [0]:
from pyspark.ml.classification import LogisticRegression
lr_classifier=LogisticRegression(family="multinomial", labelCol="_1")

lr_predictions=lr_classifier.fit(doc_train_df).transform(doc_test_df)
lr_model_evaluator=MulticlassClassificationEvaluator(labelCol="_1", predictionCol="prediction", metricName="accuracy")

In [50]:
accuracy = lr_model_evaluator.evaluate(lr_predictions)
print("Accuracy = %g" % (accuracy))

Accuracy = 0.627892
