In [6]:
from pyspark.sql import SparkSession

In [7]:
# Local mode
spark = SparkSession\
        .builder\
        .appName("movielens")\
        .getOrCreate()

In [8]:
# Check spark app name
spark.sparkContext.appName

'movielens'

In [9]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [10]:
ratings = spark.read.text("../data/sample_movielens_ratings.txt")\
  .rdd.toDF()\
  .selectExpr("split(value , '::') as col")\
  .selectExpr(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")

In [11]:
training, test = ratings.randomSplit([0.8, 0.2])

### ALS模型參數
* #### setMaxIter(5)預設10，資料迭代停止前的總次數
* #### setRegParam(0.01)控制正規化防止過度擬合
* #### setUserCol("userId")用戶欄位指定
* #### setItemCol("movieId")商品欄位指定
* #### setRatingCol("rating")評分明確偏好

In [23]:
als = ALS()\
  .setMaxIter(5)\
  .setRegParam(0.01)\
  .setUserCol("userId")\
  .setItemCol("movieId")\
  .setRatingCol("rating")

In [24]:
als.explainParams()

"alpha: alpha for implicit preference (default: 1.0)\ncheckpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)\ncoldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: 'nan', 'drop'. (default: nan)\nfinalStorageLevel: StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs: whether to use implicit preference (default: False)\nintermediateStorageLevel: StorageLevel for intermediate datasets. Cannot be 'NONE'. (default: MEMORY_AND_DISK)\nitemCol: column name for item ids. Ids must be within the integer value range. (default: item, current: movieId)\nmaxIter: max number of iteratio

In [20]:
alsModel = als.fit(training)

In [25]:
predictions = alsModel.transform(test)

#### 用戶 [電影 評分]

In [26]:
alsModel.recommendForAllUsers(10)\
  .selectExpr("userId", "explode(recommendations)").show()

+------+---------------+
|userId|            col|
+------+---------------+
|    28|[46, 5.0261135]|
|    28| [81, 4.941173]|
|    28|  [12, 4.75358]|
|    28| [16, 4.467726]|
|    28|[49, 4.0676527]|
|    28| [64, 3.987451]|
|    28|[89, 3.8712454]|
|    28| [2, 3.8694558]|
|    28|[55, 3.4491677]|
|    28|[82, 3.3782291]|
|    26|[75, 6.6787486]|
|    26|[30, 5.5597935]|
|    26|[22, 5.0792084]|
|    26|  [7, 5.051689]|
|    26| [24, 4.988696]|
|    26| [88, 4.919421]|
|    26|[23, 4.8587737]|
|    26|[83, 4.7085357]|
|    26| [51, 4.650466]|
|    26|[79, 4.5782394]|
+------+---------------+
only showing top 20 rows



#### 電影 [用戶 評分]

In [27]:
alsModel.recommendForAllItems(10)\
  .selectExpr("movieId", "explode(recommendations)").show()

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     31|  [9, 4.557351]|
|     31|[12, 3.9841657]|
|     31|[25, 3.6980495]|
|     31|[15, 2.8681436]|
|     31| [7, 2.8301177]|
|     31| [8, 2.7107399]|
|     31|[19, 2.4522147]|
|     31|[21, 2.1454089]|
|     31|[22, 2.0538256]|
|     31|[28, 1.5791872]|
|     85|[16, 4.8968735]|
|     85|  [3, 4.717561]|
|     85|[22, 4.0167265]|
|     85| [7, 3.8910358]|
|     85|[19, 3.3958688]|
|     85|[21, 3.1226978]|
|     85|   [1, 3.11516]|
|     85| [26, 3.106388]|
|     85| [17, 2.614169]|
|     85|[10, 2.6000056]|
+-------+---------------+
only showing top 20 rows

