In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.appName("Chapter4-4").getOrCreate()

In [8]:
ratings = (
    spark.read.csv(
        path="ratings.csv",
        sep=",",
        header=True,
        quote='"',
        schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    )
    # .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp")))
    .select("userId", "movieId", "rating")
    .cache()
)

The ALS class has this signature:

```python
class pyspark.ml.recommendation.ALS(
    rank=10,
    maxIter=10,
    regParam=0.1,
    numUserBlocks=10,
    numItemBlocks=10,
    implicitPrefs=False,
    alpha=1.0,
    userCol="user",
    itemCol="item",
    seed=None,
    ratingCol="rating",
    nonnegative=False,
    checkpointInterval=10,
    intermediateStorageLevel="MEMORY_AND_DISK",
    finalStorageLevel="MEMORY_AND_DISK",
    coldStartStrategy="nan",
)
```

In [3]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [17]:
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)

(training_data, validation_data) = ratings.randomSplit([8.0, 2.0])

evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

model = als.fit(training_data)
predictions = model.transform(validation_data)

In [18]:
predictions.show(10, False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|602   |471    |4.0   |3.557564  |
|91    |471    |1.0   |2.5484085 |
|217   |471    |2.0   |2.4393833 |
|411   |471    |4.0   |3.2672832 |
|608   |471    |1.5   |3.254493  |
|307   |833    |1.0   |1.405929  |
|132   |1088   |4.0   |2.9162722 |
|555   |1088   |4.0   |3.5491993 |
|391   |1088   |1.0   |3.2473285 |
|188   |1088   |4.0   |4.0344357 |
+------+-------+------+----------+
only showing top 10 rows



In [21]:
rmse = evaluator.evaluate(predictions.na.drop())

In [22]:
print(rmse)

0.8787766063190957


In [29]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

parameter_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [1, 5, 10])
    .addGrid(als.maxIter, [20])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)

In [24]:
type(parameter_grid)

list

In [30]:
from pprint import pprint

pprint(parameter_grid)

[{Param(parent='ALS_85406dbaac00', name='regParam', doc='regularization parameter (>= 0).'): 0.05,
  Param(parent='ALS_85406dbaac00', name='rank', doc='rank of the factorization'): 1,
  Param(parent='ALS_85406dbaac00', name='maxIter', doc='max number of iterations (>= 0).'): 20},
 {Param(parent='ALS_85406dbaac00', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='ALS_85406dbaac00', name='rank', doc='rank of the factorization'): 1,
  Param(parent='ALS_85406dbaac00', name='maxIter', doc='max number of iterations (>= 0).'): 20},
 {Param(parent='ALS_85406dbaac00', name='regParam', doc='regularization parameter (>= 0).'): 0.05,
  Param(parent='ALS_85406dbaac00', name='rank', doc='rank of the factorization'): 5,
  Param(parent='ALS_85406dbaac00', name='maxIter', doc='max number of iterations (>= 0).'): 20},
 {Param(parent='ALS_85406dbaac00', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='ALS_85406dbaac00', name='rank', doc='rank of

In [31]:
crossvalidator = CrossValidator(
    estimator=als,
    estimatorParamMaps=parameter_grid,
    evaluator=evaluator,
    numFolds=2,
)

crossval_model = crossvalidator.fit(training_data)
predictions = crossval_model.transform(validation_data)


In [32]:
rmse = evaluator.evaluate(predictions.na.drop())
print(rmse)

0.8786277604574072


In [33]:
model = crossval_model.bestModel