In [1]:
# gs:// URI of the items CSV that the next cell loads.
itemPath = "gs://big_data_hw_zhl/project/20190928-items.csv"

In [2]:
# Read the items CSV, treating its first row as the column names.
# No schema is supplied, so every column is loaded as a string.
itemDf = spark.read.load(itemPath, format="csv", header="true")

In [3]:
# Preview the first 10 rows of the raw items table.
itemDf.show(n=10)

+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+------+
|      asin|   brand|               title|                 url|               image|rating|           reviewUrl|totalReviews|prices|
+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+------+
|B0000SX2UC|   Nokia|Dual-Band / Tri-M...|https://www.amazo...|https://m.media-a...|     3|https://www.amazo...|          14|  null|
|B0009N5L7K|Motorola| Motorola I265 phone|https://www.amazo...|https://m.media-a...|   2.9|https://www.amazo...|           7|$49.95|
|B000SKTZ0S|Motorola|MOTOROLA C168i AT...|https://www.amazo...|https://m.media-a...|   2.6|https://www.amazo...|          22|  null|
|B00198M12M|   Nokia|Nokia 6500 Slide ...|https://www.amazo...|https://m.media-a...|   2.4|https://www.amazo...|           5|  null|
|B001AO4OUC|Motorola|Motorola i335 Cel...|https://www.amazo...|https:

In [6]:
# One row per distinct brand found in the items table.
brandDf = itemDf.select(itemDf["brand"]).distinct()

In [7]:
# List the brands (at most 20 rows are printed).
brandDf.show(n=20)

+--------+
|   brand|
+--------+
|   Nokia|
|    Sony|
|Motorola|
|  Xiaomi|
| Samsung|
|  Google|
| OnePlus|
|    ASUS|
|   Apple|
|  HUAWEI|
+--------+



In [8]:
# price range
'''
How to get a good price range estimation?
We get rating criteria by walking the store.
Get all good phones -> rating > 3.8 and totalReviews > 98
'''
# Keep only the items whose prices cell is populated.
itemWithPriceDf = itemDf.filter("prices IS NOT NULL")

In [9]:
# Preview the priced items (20 rows, the default for show()).
itemWithPriceDf.show(n=20)

+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+-------+
|      asin|   brand|               title|                 url|               image|rating|           reviewUrl|totalReviews| prices|
+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+-------+
|B0009N5L7K|Motorola| Motorola I265 phone|https://www.amazo...|https://m.media-a...|   2.9|https://www.amazo...|           7| $49.95|
|B001DZY4KI|    Sony|Sony Ericsson G70...|https://www.amazo...|https://m.media-a...|     2|https://www.amazo...|           1| $78.99|
|B0027VKQPE|   Nokia|Nokia New 1100 fo...|https://www.amazo...|https://m.media-a...|   3.2|https://www.amazo...|           8| $99.99|
|B00280QJFU| Samsung|Samsung T301G Pre...|https://www.amazo...|https://m.media-a...|   3.5|https://www.amazo...|         133| $59.89|
|B0029X7UHC|Motorola|Motorola I205 cel...|https://www.amazo...

In [10]:
# The review-count criterion is the average totalReviews
# over priced phones rated above 3.8.
from pyspark.sql import functions as F
(
    itemWithPriceDf
    .filter(itemWithPriceDf["rating"] > 3.8)
    .agg(F.avg("totalReviews"))
    .show()
)

+-----------------+
|avg(totalReviews)|
+-----------------+
|        98.390625|
+-----------------+



In [11]:
# "Good" phones: rated above 3.8 and with more than 98 reviews
# (98 is the rounded-down average printed by the previous cell).
goodPhonesWithPrice = itemWithPriceDf.filter(
    (itemWithPriceDf["rating"] > 3.8) & (itemWithPriceDf["totalReviews"] > 98)
)

In [12]:
# Print up to 100 of the selected phones.
goodPhonesWithPrice.show(n=100)

+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+---------------+
|      asin|   brand|               title|                 url|               image|rating|           reviewUrl|totalReviews|         prices|
+----------+--------+--------------------+--------------------+--------------------+------+--------------------+------------+---------------+
|B006OU39QW| Samsung|Verizon Samsung C...|https://www.amazo...|https://m.media-a...|     4|https://www.amazo...|         248|        $109.99|
|B0147LDSG0| Samsung|Samsung Galaxy J5...|https://www.amazo...|https://m.media-a...|     4|https://www.amazo...|         157|        $198.94|
|B01CJU9126| Samsung|Samsung Galaxy S7...|https://www.amazo...|https://m.media-a...|   3.9|https://www.amazo...|         303|        $319.99|
|B01GXAT0CE|   Apple|Apple iPhone SE, ...|https://www.amazo...|https://m.media-a...|   3.9|https://www.amazo...|         742|        $114.99|
|B01MD

In [13]:
# Narrow to the columns needed for the price analysis and mark the result
# for caching, since several later cells reuse it.
priceRangeDf = goodPhonesWithPrice.select(
    "asin", "brand", "rating", "totalReviews", "prices"
).cache()

In [44]:
# Row count of the selected phones; count() is an action, so it also
# populates the cache requested when priceRangeDf was built.
priceRangeDf.count()

52

In [47]:
# transforming prices column to priceList

from pyspark.sql.functions import udf
from pyspark.sql.types import *

import re

def prices2priceList(prices):
    """Extract the whole-number part of every dollar amount in ``prices``.

    Args:
        prices: Text holding one or more dollar amounts, for example
            "$279.55,$399.99", or None when the cell is null.

    Returns:
        The ``str()`` of the list of captured amounts, one entry per "$"
        occurrence, or None when ``prices`` is None. The result is a string
        rather than a list because the UDF built from this function is
        declared with a string return type.
    """
    if prices is None:
        # Spark hands SQL NULL to a Python UDF as None; re.findall would
        # raise TypeError on it, so pass the null through instead.
        return None
    # Raw string: "\$" and "\w" are regex escapes, not string escapes.
    # "\w+" stops at the decimal point, so the cents are not captured.
    x = re.findall(r"\$(\w+)", prices)
    return str(x)

# Wrap the parser as a string-returning Spark UDF, then derive the
# priceList column from the prices column.
udfValueToCategory = udf(prices2priceList, returnType=StringType())
priceListDf = priceRangeDf.withColumn(
    "priceList", udfValueToCategory(priceRangeDf["prices"])
)

In [48]:
# Print up to 100 rows to check the derived priceList column.
priceListDf.show(n=100)

+----------+--------+------+------------+---------------+----------------+
|      asin|   brand|rating|totalReviews|         prices|       priceList|
+----------+--------+------+------------+---------------+----------------+
|B006OU39QW| Samsung|     4|         248|        $109.99|        [u'109']|
|B0147LDSG0| Samsung|     4|         157|        $198.94|        [u'198']|
|B01CJU9126| Samsung|   3.9|         303|        $319.99|        [u'319']|
|B01GXAT0CE|   Apple|   3.9|         742|        $114.99|        [u'114']|
|B01MDMJGYT|Motorola|   4.1|         150|        $189.00|        [u'189']|
|B0728HMXFD|   Apple|     4|         306|$279.55,$399.99|[u'279', u'399']|
|B072ZWCKP5|Motorola|   4.2|         328|        $198.98|        [u'198']|
|B0731JJCRZ|   Apple|   4.1|         165|        $152.00|        [u'152']|
|B0731XJ4FB| Samsung|   3.9|         109|        $188.95|        [u'188']|
|B0733FPPDG| Samsung|   3.9|         219|        $199.00|        [u'199']|
|B07536MYBQ| Samsung|   3

In [49]:
# coalesce(1) collapses the frame to one partition so a single part file is
# written under the target path. The default save mode raises if the path
# already exists, which made this cell fail on every re-run; "overwrite"
# replaces the previous export instead.
priceListDf.coalesce(1).write.mode("overwrite").csv("gs://big_data_hw_zhl/project/goodPhoneWithPriceList.csv")