In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext(master='local',appName='Ex5')
spark = SparkSession(sc)

In [3]:
from pyspark.sql.functions import col

## Data Exploring

In [4]:
data = spark.read.csv('./data/ratings_Office_Products.csv',inferSchema=True,header=False)

In [5]:
data.show()

+--------------+----------+---+----------+
|           _c0|       _c1|_c2|       _c3|
+--------------+----------+---+----------+
|A2UESEUCI73CBO|0078800242|5.0|1374192000|
|A3BBNK2R5TUYGV|0113000316|5.0|1359417600|
| A5J78T14FJ5DU|0113000316|3.0|1318723200|
|A2P462UH5L6T57|043928631X|5.0|1356912000|
|A2E0X1MWNRTQF4|0439340039|1.0|1379721600|
| AAYGDWCI3LDQP|0439394058|5.0|1405382400|
| AI7SARYVM8FGA|0439394058|4.0|1212624000|
|A1BUVOGGFTGMBN|0439394058|2.0|1389744000|
|A35UM9B38X7AYS|0439394058|5.0|1386201600|
| AM2U8KJROWYKR|0439394058|4.0|1346976000|
|A1V1VJ3W1SYJU0|0439394058|5.0|1374969600|
|A2GLL6RNXXGBCM|0439394058|5.0|1395014400|
|A1I4RQ7F65L5VZ|0439394058|4.0|1394841600|
|A39L54MWP4C86L|0439394058|5.0|1349308800|
| AYVV0R6U3HRIR|0439394058|2.0|1399593600|
|A2VBRY9MELAPYT|0439394058|5.0|1381795200|
|A3340VCHPJTEC8|0439394058|5.0|1372118400|
|A1U3WTFJZO6CJL|0439394058|5.0|1386806400|
|A1LSX9PO1Q2XF1|0439394058|5.0|1370390400|
|A33X18AKYV9GBB|0439394058|5.0|1386028800|
+----------

In [6]:
data = data.select(col('_c0').alias('reviewerID'),
            col('_c1').alias('asin'),
            col('_c2').alias('overall'),
            col('_c3').alias('unixReviewTime'))

In [7]:
data.printSchema()

root
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- unixReviewTime: integer (nullable = true)



## Cleaning Data

In [8]:
data.show()

+--------------+----------+-------+--------------+
|    reviewerID|      asin|overall|unixReviewTime|
+--------------+----------+-------+--------------+
|A2UESEUCI73CBO|0078800242|    5.0|    1374192000|
|A3BBNK2R5TUYGV|0113000316|    5.0|    1359417600|
| A5J78T14FJ5DU|0113000316|    3.0|    1318723200|
|A2P462UH5L6T57|043928631X|    5.0|    1356912000|
|A2E0X1MWNRTQF4|0439340039|    1.0|    1379721600|
| AAYGDWCI3LDQP|0439394058|    5.0|    1405382400|
| AI7SARYVM8FGA|0439394058|    4.0|    1212624000|
|A1BUVOGGFTGMBN|0439394058|    2.0|    1389744000|
|A35UM9B38X7AYS|0439394058|    5.0|    1386201600|
| AM2U8KJROWYKR|0439394058|    4.0|    1346976000|
|A1V1VJ3W1SYJU0|0439394058|    5.0|    1374969600|
|A2GLL6RNXXGBCM|0439394058|    5.0|    1395014400|
|A1I4RQ7F65L5VZ|0439394058|    4.0|    1394841600|
|A39L54MWP4C86L|0439394058|    5.0|    1349308800|
| AYVV0R6U3HRIR|0439394058|    2.0|    1399593600|
|A2VBRY9MELAPYT|0439394058|    5.0|    1381795200|
|A3340VCHPJTEC8|0439394058|    

### Check missing value:

In [9]:
from pyspark.sql.functions import when, count, isnan, isnull

In [10]:
data.select([count(when(isnan(col),col)).alias(col) for col in data.columns]).toPandas()

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,0,0,0,0


In [11]:
data.select([count(when(isnull(col),col)).alias(col) for col in data.columns]).toPandas()

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,0,0,0,0


**Nhận xét :**

- Dữ liệu tốt không có dữ liệu nào bị thiếu. 

### Check duplicate values:

In [12]:
duplicate_row = data.count() - data.distinct().count()
duplicate_row

0

**Nhận xét:**
- Dữ liệu không bị trùng lặp

### Encoder

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

In [14]:
data.columns

['reviewerID', 'asin', 'overall', 'unixReviewTime']

In [15]:
string_idx = StringIndexer(inputCols=['reviewerID', 'asin'], outputCols=['reviewerID_idx', 'asin_idx'])

In [16]:
final_data = string_idx.fit(data).transform(data)

In [17]:
final_data.show(3)

+--------------+----------+-------+--------------+--------------+--------+
|    reviewerID|      asin|overall|unixReviewTime|reviewerID_idx|asin_idx|
+--------------+----------+-------+--------------+--------------+--------+
|A2UESEUCI73CBO|0078800242|    5.0|    1374192000|      112059.0| 71702.0|
|A3BBNK2R5TUYGV|0113000316|    5.0|    1359417600|      621354.0| 51103.0|
| A5J78T14FJ5DU|0113000316|    3.0|    1318723200|       26236.0| 51103.0|
+--------------+----------+-------+--------------+--------------+--------+
only showing top 3 rows



In [18]:
train_data , test_data = final_data.randomSplit([0.8,0.2])

## Build model

In [19]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [20]:
als = ALS(maxIter=10, 
          regParam=0.01,
          userCol='reviewerID_idx', 
          itemCol='asin_idx', 
          ratingCol='overall',
         alpha=1)

In [21]:
model = als.fit(train_data)

### Evaluate the Result

In [22]:
prediction = model.transform(test_data)

In [23]:
prediction.show()

+--------------+----------+-------+--------------+--------------+--------+----------+
|    reviewerID|      asin|overall|unixReviewTime|reviewerID_idx|asin_idx|prediction|
+--------------+----------+-------+--------------+--------------+--------+----------+
|A37H473BZHX9P0|B003QR87I6|    5.0|    1370390400|        2142.0|   148.0| -3.506613|
|A1B4WOF6SC74PS|B003QR87I6|    5.0|    1392508800|      228359.0|   148.0|       NaN|
|A3ETZLY3N8VSZE|B003QR87I6|    4.0|    1368057600|      640481.0|   148.0|       NaN|
|A2B0UOUFIDIRE3|B003QR87I6|    4.0|    1395014400|       96591.0|   148.0| 0.3777519|
|A1CN0KDI3C0M1O|B003QR87I6|    5.0|    1378425600|       69896.0|   148.0|       NaN|
|A32BD3SXG903HT|B003QR87I6|    1.0|    1302998400|      572109.0|   148.0|       NaN|
|A32R7W1DRKEX3O|B003QR87I6|    4.0|    1364515200|      574472.0|   148.0|       NaN|
|A350ZGMBOU5A9N|B003QR87I6|    5.0|    1316390400|      587016.0|   148.0|       NaN|
|A2HD02N80BHG98|B003QR87I6|    4.0|    1396915200|    

In [24]:
prediction.where(col('reviewerID_idx')==1).show()

+--------------+----------+-------+--------------+--------------+--------+----------+
|    reviewerID|      asin|overall|unixReviewTime|reviewerID_idx|asin_idx|prediction|
+--------------+----------+-------+--------------+--------------+--------+----------+
|A104N0PWB1WMY6|B004ZIAP5I|    5.0|    1381881600|           1.0| 62484.0| 3.8169193|
|A104N0PWB1WMY6|B004ZI96U8|    5.0|    1392595200|           1.0| 38004.0|  3.974701|
|A104N0PWB1WMY6|B00C2ST59K|    5.0|    1383177600|           1.0| 49723.0| 4.2776594|
|A104N0PWB1WMY6|B005TA1M80|    5.0|    1392854400|           1.0| 63850.0| 3.8169193|
|A104N0PWB1WMY6|B004ZIANOQ|    5.0|    1381795200|           1.0| 28313.0| 3.7245648|
|A104N0PWB1WMY6|B00FY8UZ76|    5.0|    1403481600|           1.0|126785.0|       NaN|
|A104N0PWB1WMY6|B002TMMEVA|    5.0|    1393113600|           1.0| 58338.0| 3.8169193|
|A104N0PWB1WMY6|B00C2STDEM|    5.0|    1383523200|           1.0| 68801.0|       NaN|
|A104N0PWB1WMY6|B009MPAAFO|    5.0|    1381363200|    