# Xây dựng và đánh giá mô hình gợi ý phim bằng ALS

In [32]:
from pyspark.sql import SparkSession
# Entry point for all DataFrame work below: build a session named
# 'recommendation' via the builder's getOrCreate().
spark = SparkSession.builder.appName('recommendation').getOrCreate()

In [42]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Tập dữ liệu MovieLens

In [51]:
import os

# Location of the MovieLens "ml-latest-small" ratings file.
# The original notebook hard-coded an absolute path that only exists on the
# author's machine; it is kept as the default so existing runs are unchanged,
# but it can now be overridden without editing the notebook by setting the
# RATINGS_CSV environment variable.
RATINGS_CSV = os.environ.get(
    'RATINGS_CSV',
    'C:/MainProgramLearning/TLCN/ml-latest-small/ratings.csv',
)

# Load the ratings with a header row and let Spark infer the column types,
# then print the inferred schema as a sanity check.
data = spark.read.csv(RATINGS_CSV, inferSchema=True, header=True)
data.printSchema()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [52]:
# Count the null values in every column.
# Spark's `sum` is imported under an alias so that it does not shadow Python's
# built-in `sum` for the rest of the notebook session.
from pyspark.sql.functions import col, sum as spark_sum

# isNull() gives a boolean per row; casting to int and summing yields the
# number of nulls in that column.
data.select(*(spark_sum(col(c).isNull().cast("int")).alias(c) for c in data.columns)).show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     0|      0|     0|        0|
+------+-------+------+---------+



In [53]:
from pyspark.sql.functions import lit, col

# Second way to count missing values, as a cross-check of the null count:
# take the "count" row of describe() and subtract each column's count from
# the total number of rows. A result of 0 means no value is missing.
rows = data.count()
summary = data.describe().filter(col("summary") == "count")
summary.select(
    [(lit(rows) - col(name)).alias(name) for name in data.columns]
).show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|   0.0|    0.0|   0.0|      0.0|
+------+-------+------+---------+



In [54]:
## Count the number of rows in the dataset and preview the first 5 of them
print('No. of row: %d' % data.count())
data.show(5)

No. of row: 100836
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [55]:
# Summary statistics for every column: count, mean, stddev, min and max
data.describe().show()

+-------+------------------+----------------+------------------+--------------------+
|summary|            userId|         movieId|            rating|           timestamp|
+-------+------------------+----------------+------------------+--------------------+
|  count|            100836|          100836|            100836|              100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|1.2059460873684695E9|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|2.1626103599513078E8|
|    min|                 1|               1|               0.5|           828124615|
|    max|               610|          193609|               5.0|          1537799250|
+-------+------------------+----------------+------------------+--------------------+



In [57]:
## Randomly split the dataset: 80% for training and 20% for testing.
## A fixed seed makes the split repeatable, so the model and the metrics
## computed further down do not change on every "Restart & Run All".
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)


## Xây dựng mô hình trên tập dữ liệu train

In [58]:
## Build the recommendation model on the training split.
##   rank=8                    - number of latent factors per user / movie
##   maxIter=10, regParam=0.1  - ALS iterations and regularisation strength
##   nonnegative=True          - constrain the learned factors to be >= 0
##   coldStartStrategy="drop"  - drop rows whose prediction would be NaN
##                               (user or movie unseen during training) so
##                               that evaluation metrics are not NaN
##   seed=42                   - fix the random initialisation so the fitted
##                               model is repeatable between runs
als = ALS(
    maxIter=10, regParam=0.1, rank=8, nonnegative=True, coldStartStrategy="drop",
    userCol='userId', itemCol='movieId', ratingCol='rating', seed=42,
)
model = als.fit(train_data)

In [59]:
# Inspect the learned latent factors: one `features` vector of length
# model.rank per user id ...
print('Factorized user matrix with rank = %d' % model.rank)
model.userFactors.show(5)

print('-'*50)

# ... and one per item (movie) id.
print('Factorized item matrix with rank = %d' % model.rank)
model.itemFactors.show(5)

Factorized user matrix with rank = 8
+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.016918354, 0.1...|
| 20|[1.1431886, 1.172...|
| 30|[0.3461848, 1.051...|
| 40|[1.0088524, 0.923...|
| 50|[0.9413194, 0.862...|
+---+--------------------+
only showing top 5 rows

--------------------------------------------------
Factorized item matrix with rank = 8
+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.24432674, 0.65...|
| 20|[1.0726507, 0.567...|
| 30|[1.0779108, 0.872...|
| 50|[0.9589908, 1.095...|
| 60|[0.5851913, 0.483...|
+---+--------------------+
only showing top 5 rows



In [60]:
## For every movie, recommend the top user(s) (here: 1) with the predicted rating
print('Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:')
model.recommendForAllItems(1).show(5)

print('-'*50)
### For every user, recommend the top movie(s) (here: 1) with the predicted rating
print('Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:')
model.recommendForAllUsers(1).show(5)

Recommended top users (e.g. 1 top user) for all items with the corresponding predicted ratings:
+-------+------------------+
|movieId|   recommendations|
+-------+------------------+
|     12|[{406, 4.6147637}]|
|     26| [{43, 4.8257837}]|
|     27| [{492, 4.630498}]|
|     28| [{236, 5.897657}]|
|     31|[{543, 4.7701597}]|
+-------+------------------+
only showing top 5 rows

--------------------------------------------------
Recommended top items (e.g. 1 top item) for all users with the corresponding predicted ratings:
+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{148881, 5.98182...|
|     2|[{131724, 4.88258...|
|     3| [{86320, 5.719838}]|
|     4| [{6650, 5.0799174}]|
|     5| [{6201, 4.9934406}]|
+------+--------------------+
only showing top 5 rows



# Dự đoán trên tập test

In [61]:
# Score the held-out test split: transform() appends a `prediction` column
# holding the model's predicted rating for each (userId, movieId) pair.
predictions = model.transform(test_data)
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   580|   3175|   2.5|1167792674| 3.5045094|
|   580|  44022|   3.5|1167792560| 2.9032965|
|   368|   2122|   2.0| 971277319| 1.9476635|
|   115|   1645|   4.0| 957648208| 3.3740087|
|   385|    471|   4.0| 850766697| 2.6788354|
|   159|   1088|   4.0|1508641161| 3.5256314|
|   606|   1580|   2.5|1171310310| 3.1264918|
|   606|  44022|   4.0|1259446275|  2.721818|
|    91|   3175|   3.5|1112713784|  3.396537|
|   230|   1580|   3.5|1196304359| 2.9356136|
|   493|   2366|   2.0|1001562846| 3.2491574|
|   233|   1580|   3.0|1529334057| 2.7495682|
|   246|   1645|   4.0|1354125095| 3.8889596|
|   599|    833|   1.5|1519330029| 1.3695393|
|   599|   1088|   2.5|1498515232| 2.4884472|
|   599|   7340|   2.0|1519140628| 2.4902742|
|   111|   4900|   4.0|1516153877| 1.1239411|
|   140|   3175|   2.5|1075154390| 3.5865796|
|   177|   1580|   3.5|1435890494|

In [62]:
# Confirm the schema of the scored data, including the added `prediction` column.
predictions.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- prediction: float (nullable = false)



# Đánh giá mô hình

In [63]:
# Compute the root mean squared error (RMSE) between the actual `rating`
# and the model's `prediction` on the test split.
evaluator = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')
rmse = evaluator.evaluate(predictions)
print('Root mean squared error of the test_data: %.4f' % rmse)

Root mean squared error of the test_data: 0.8760


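RMSE (Root Mean Squared Error – căn bậc hai của sai số bình phương trung bình) đo độ lệch giữa điểm đánh giá dự đoán (`prediction`) và điểm đánh giá thực tế (`rating`) trên tập test; giá trị càng nhỏ thì mô hình dự đoán càng chính xác. Với thang điểm từ 0.5 đến 5.0, RMSE ≈ 0.876 ở trên nghĩa là dự đoán lệch so với điểm thực tế khoảng 0.88 điểm.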