In [6]:
# Cloud Storage (gs://) location of the reviews CSV that the next cell loads.
reviewPath = "gs://big_data_hw_zhl/project/verified_reviews.csv"

In [7]:
# Load the reviews CSV, taking column names from the first line of the file.
# NOTE(review): previews further down show body values that start with a stray
# double quote, which may mean quoted / multi-line fields are not parsed as
# intended — confirm whether multiLine / escape options are needed here.
itemDf = spark.read.csv(reviewPath, header=True)

In [8]:
# Total number of rows loaded from the reviews CSV.
itemDf.count()

75565

In [9]:
# How many rows carry a non-null helpfulVotes value?
itemDf.where(itemDf["helpfulVotes"].isNotNull()).count()

29562

In [10]:
# Inspect titles that mention "Star" but do not contain the phrase "One Star".
(
    itemDf
    .where(itemDf["title"].contains("Star") & ~itemDf["title"].contains("One Star"))
    .select("asin", "title", "body", "helpfulVotes")
    .show(500)
)

+----------+--------------------+--------------------+------------+
|      asin|               title|                body|helpfulVotes|
+----------+--------------------+--------------------+------------+
|B00HWEJJSQ|          Five Stars|Just as described...|        null|
|B00NKR9MJA|         Three Stars|       Piece of s***|        null|
|B00NMWYA36|            one Star|      me vino dañadp|        null|
|B00SIB3HS0|          Five Stars|me lo vendieron c...|        null|
|B00YD54GG2|         Three Stars|I used that phone...|        null|
|B00YD54J8W|          Five Stars|Phone keeps power...|        null|
|B01FJT7E0A|          Five Stars|El sony expiria x...|        null|
|B01FJT7E3M|          zero Stars|The cellphone did...|        null|
|B01LEL8ABY|          Five Stars|i just use like a...|        null|
|B01MRH0YND|          Four Stars|This iPhone is fu...|        null|
|B01MRH0YND|         Three Stars|Phones looks good...|        null|
|B071JP8XDJ|          Five Stars|Edited, had to 

In [11]:
# Inspect titles that contain the plural "Stars".
(
    itemDf
    .where(itemDf["title"].contains("Stars"))
    .select("asin", "title", "body", "helpfulVotes")
    .show(500)
)

+----------+-----------+--------------------+------------+
|      asin|      title|                body|helpfulVotes|
+----------+-----------+--------------------+------------+
|B00HWEJJSQ| Five Stars|Just as described...|        null|
|B00NKR9MJA|Three Stars|       Piece of s***|        null|
|B00SIB3HS0| Five Stars|me lo vendieron c...|        null|
|B00YD54GG2|Three Stars|I used that phone...|        null|
|B00YD54J8W| Five Stars|Phone keeps power...|        null|
|B01FJT7E0A| Five Stars|El sony expiria x...|        null|
|B01FJT7E3M| zero Stars|The cellphone did...|        null|
|B01LEL8ABY| Five Stars|i just use like a...|        null|
|B01MRH0YND| Four Stars|This iPhone is fu...|        null|
|B01MRH0YND|Three Stars|Phones looks good...|        null|
|B071JP8XDJ| Five Stars|Edited, had to ta...|        null|
|B002AS9WEA|  Two Stars|              to old|        null|
|B002UHS0UI|  Two Stars|IT DOES NOT KEEP ...|        null|
|B002UHS0UI|  Two Stars|Missing center bu...|        nul

In [12]:
# Number of rows whose body is null.
itemDf.where(itemDf["body"].isNull()).count()

13

In [13]:
# build the allReview column by combining title, body and helpfulVotes

from pyspark.sql.functions import udf
from pyspark.sql.types import *

def getAllReviews(title, body, helpfulVotes):
    """Combine the parts of one review row into a single tab-free string.

    The pieces are appended in a fixed order, each followed by its own
    separator: ``title + " "``, ``body + " "``, ``helpfulVotes + "; "``.

    Args:
        title: Review title, or None. Skipped when None or when it is exactly
            one of the system-generated "N Star(s)" titles listed below.
        body: Review body, or None. Skipped when None.
        helpfulVotes: Value of the helpfulVotes column, or None. Skipped when
            None. Non-string values are converted with ``str()``.

    Returns:
        The concatenated text with every tab replaced by a single space;
        an empty string when all three inputs are skipped.
    """
    # Placeholder titles generated by the system rather than typed by the reviewer.
    systemTitles = ("One Star", "Two Stars", "Three Stars", "Four Stars", "Five Stars")

    parts = []
    if title is not None and title not in systemTitles:
        parts.append(str(title) + " ")
    if body is not None:
        parts.append(str(body) + " ")
    if helpfulVotes is not None:
        # str() keeps this working if the column is ever read as a numeric type.
        parts.append(str(helpfulVotes) + "; ")

    # The result is saved to CSV with \t as the delimiter, so tabs must not
    # appear inside the text.
    return "".join(parts).replace("\t", " ")

# Wrap the Python function as a Spark UDF that returns a string.
udfGetAllReviews = udf(getAllReviews, returnType=StringType())

# Append the combined text as an "allReview" column and cache the result,
# since several later cells read from it.
allReviewDf = itemDf.withColumn(
    "allReview",
    udfGetAllReviews(itemDf["title"], itemDf["body"], itemDf["helpfulVotes"]),
).cache()

In [14]:
# Preview the first 10 rows, including the newly added allReview column.
allReviewDf.show(10)

+----------+-----------------+------+-----------------+--------+--------------------+--------------------+--------------------+--------------------+
|      asin|             name|rating|             date|verified|               title|                body|        helpfulVotes|           allReview|
+----------+-----------------+------+-----------------+--------+--------------------+--------------------+--------------------+--------------------+
|B0009N5L7K|    Marcel Thomas|     1|    March 5, 2016|    true|        Stupid phone|DON'T BUY OUT OF ...|                null|Stupid phone DON'...|
|B0009N5L7K|   Stephen Cahill|     1|December 20, 2016|    true|       Phones locked|1 star because th...|                null|Phones locked 1 s...|
|B000SKTZ0S|Kei, San Jose, CA|     1|     May 13, 2017|    true|It seems it doesn...|"I purchased this...|                null|It seems it doesn...|
|B000SKTZ0S|           Kristy|     1|   March 13, 2019|    true|   Supply are needed|The phone did not...|

In [15]:
# Keep only the rows with more review content, i.e. those whose helpfulVotes
# is non-null, and project down to the three columns of the
# sentiment-analysis dataset. Cached because it is shown, counted and written below.
allReview4SentimentalAnalysisDf = (
    allReviewDf
    .where(allReviewDf["helpfulVotes"].isNotNull())
    .select("asin", "rating", "allReview")
    .cache()
)

In [16]:
# Preview the first 10 rows of the sentiment-analysis subset.
allReview4SentimentalAnalysisDf.show(10)

+----------+------+--------------------+
|      asin|rating|           allReview|
+----------+------+--------------------+
|B000SKTZ0S|     1|This phone gave m...|
|B00B2BYU1Q|     1|and it is very di...|
|B00B2BYU1Q|     1|Sure seems like a...|
|B00BV1MVJ0|     1|BROKEN AND SEVERE...|
|B00BV1MVJ0|     1|What a piece of g...|
|B00BV1MVJ0|     1|the costumer serv...|
|B00D99ZBR6|     1|Horrible "1, when...|
|B00DUJ6TYY|     1|Great phone "I bo...|
|B00E6FGSHY|     1|Horrible phone. "...|
|B00E92B88I|     1|HORRIBLE PHONE "A...|
+----------+------+--------------------+
only showing top 10 rows



In [13]:
# Row count of the sentiment-analysis subset; it uses the same non-null
# helpfulVotes filter as the earlier count on itemDf, so the two should match.
allReview4SentimentalAnalysisDf.count()

29562

In [14]:
# Persist the subset as tab-delimited CSV under the project bucket.
# NOTE(review): no save mode is set, so the writer's default applies (per the
# DataFrameWriter docs it errors if the target already exists) — confirm
# whether re-runs of this cell are expected to overwrite.
(
    allReview4SentimentalAnalysisDf.write
    .format("csv")
    .option("delimiter", "\t")
    .save("gs://big_data_hw_zhl/project/allReview4SentimentalAnalysisDf")
)