In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import  RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
spark = (
    SparkSession
    .builder
    .appName("Hands-on-2")
    .getOrCreate()
)

In [3]:
df_ratings = (
    spark
    .read
    .csv(
        path="../../data-sets/ml-latest-small/ratings.csv",
        encoding="UTF-8",
        header=True,
        sep=",",
        quote='"',
        schema="userId INT, movieId INT, rating DOUBLE",
    )
    .cache()
)

In [4]:
df_ratings.show(n=5)
df_ratings.printSchema()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [5]:
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

model = als.fit(df_ratings)

In [6]:
predictions = model.transform(df_ratings)

In [7]:
predictions.show(n=10)
predictions.printSchema()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   191|    148|   5.0| 4.9297347|
|   133|    471|   4.0| 3.2152045|
|   597|    471|   2.0| 4.0085406|
|   385|    471|   4.0| 3.3811128|
|   436|    471|   3.0| 3.5208173|
|   602|    471|   4.0| 3.5010448|
|    91|    471|   1.0|  2.445023|
|   409|    471|   3.0| 3.4316573|
|   372|    471|   3.0| 2.9174552|
|   599|    471|   2.5| 2.8954353|
+------+-------+------+----------+
only showing top 10 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- prediction: float (nullable = false)



In [8]:
model.userFactors.show(n=5, truncate=False)

+---+---------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                   |
+---+---------------------------------------------------------------------------------------------------------------------------+
|10 |[0.31201747, -0.17938858, 0.15557252, 0.78966224, -0.42821676, 1.352444, -0.070634305, -0.07513054, -0.026014643, 1.443979]|
|20 |[0.29125655, 0.27197567, -0.3590232, 0.90615726, -1.2176069, 0.22594482, -0.5948918, 0.7772299, -0.38546714, 1.0759174]    |
|30 |[0.22459081, -0.115073286, -0.6438237, 1.4620582, -0.9533095, 0.4615792, -0.42670465, -0.12914932, -0.16771397, 1.0337924] |
|40 |[-0.9375402, 0.17264089, -0.14390844, 1.2114497, -0.78587115, -0.33476546, -0.13498156, 0.73558533, -0.31722298, 1.0872016]|
|50 |[-0.28890848, 0.15933083, -0.18356502, 1.000538, -0.4172648, -0.40999547, -0.6172192,

In [9]:
model.itemFactors.show(n=5, truncate=False)

+---+----------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                    |
+---+----------------------------------------------------------------------------------------------------------------------------+
|10 |[-0.469974, -0.14568347, -0.9402919, 1.2925159, -0.73094994, 0.26085767, -0.14938031, -0.046538725, -0.50040627, 0.81097925]|
|20 |[0.11602994, -0.6213116, -0.15869027, 0.8187811, -1.153036, 0.2879887, -0.39606032, 0.14882378, -0.04251895, 0.58639055]    |
|30 |[-0.8230819, -0.026631294, 0.39107653, 1.3568213, -0.5299762, -0.84508294, -0.735634, 0.67914414, 0.055350915, -0.34147218] |
|40 |[-0.9704534, -0.66428685, -0.15510106, 1.572923, 0.091763034, 0.19297075, -0.18669344, -0.12169195, -0.012725148, 0.6942066]|
|50 |[-0.21576905, -0.4739233, -0.40830356, 1.5390491, -0.9294805, -0.19142205, -0.

In [10]:
rmse = evaluator.evaluate(predictions)

In [11]:
rmse

0.5926578722191046

### As we can see, the rmse of our model is about 0.59. It's not bad, but we can see that we are majorly overfitting on our data with our model. This happened, because we didn't held any testing data to test our model. So let's split the data we have into training and testing data

In [12]:
df_train, df_val = df_ratings.randomSplit([8.0, 2.0]) # the numbers inside the list should add up to 10 i.e; 100%. Here I am splitting into two sets with 80% in first and 20% in the second

In [13]:
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)

# let's fit our model on df_train instead of df_ratings
model = als.fit(df_train)

In [14]:
# let's make predictions on our validation data for evaluation
predictions = model.transform(df_val)

In [15]:
predictions.show(n=15, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|602   |471    |4.0   |3.3421779 |
|603   |471    |4.0   |2.621055  |
|57    |471    |3.0   |3.9584088 |
|217   |471    |2.0   |2.9968648 |
|32    |471    |3.0   |3.9091992 |
|260   |471    |4.5   |2.7593756 |
|373   |471    |5.0   |3.5842373 |
|609   |833    |3.0   |1.0989308 |
|492   |833    |4.0   |0.97820824|
|47    |1088   |4.0   |2.4766352 |
|474   |1088   |3.5   |3.1090312 |
|594   |1088   |4.5   |4.262006  |
|221   |1088   |3.0   |3.2366261 |
|425   |1342   |3.5   |2.4487479 |
|503   |1342   |1.0   |3.6643238 |
+------+-------+------+----------+
only showing top 15 rows



In [16]:
rmse = evaluator.evaluate(predictions)

In [17]:
rmse

nan

### So, why did this happened. This happened because our predictions on our validation data might have nan values. So we will have to filter out the nan rows before evaluating to get more realistic rmse value

In [18]:
rmse = evaluator.evaluate(predictions.na.drop())

In [19]:
rmse

0.883968225787856

### Let's use a cross-validator from pyspark package to do cross validation on our data

### This is the class definition of ALS model of pyspark

```python
class pyspark.ml.recommendation.ALS(
    rank=10, # Hyperparameter
    maxIter=10, # Hyperparameter
    regParam=0.1, # Hyperparameter
    numUserBlocks=10, # Performance
    numItemBlocks=10, # Performance
    implicitPrefs=False, # For Implicit recommendation
    alpha=1.0, # For Implicit recommendation
    userCol='user',
    itemCol='item',
    seed=None,
    ratingCol='rating',
    nonnegative=False,
    checkpointInterval=10,
    intermediateStorageLevel='MEMORY_AND_DISK',
    finalStorageLevel='MEMORY_AND_DISK',
    coldStartStrategy='nan',
    blockSize=4096,
)
```

In [20]:
# Let's build a parameter grid that will represents the possible values of hyperparameters we want to search for
parameter_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [2, 5, 7, 10])
    .addGrid(als.maxIter, [10, 20])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)

In [21]:
type(parameter_grid)

list

In [22]:
parameter_grid

[{Param(parent='ALS_bc3e75cb79ae', name='rank', doc='rank of the factorization'): 2,
  Param(parent='ALS_bc3e75cb79ae', name='maxIter', doc='max number of iterations (>= 0).'): 10,
  Param(parent='ALS_bc3e75cb79ae', name='regParam', doc='regularization parameter (>= 0).'): 0.05},
 {Param(parent='ALS_bc3e75cb79ae', name='rank', doc='rank of the factorization'): 2,
  Param(parent='ALS_bc3e75cb79ae', name='maxIter', doc='max number of iterations (>= 0).'): 10,
  Param(parent='ALS_bc3e75cb79ae', name='regParam', doc='regularization parameter (>= 0).'): 0.1},
 {Param(parent='ALS_bc3e75cb79ae', name='rank', doc='rank of the factorization'): 2,
  Param(parent='ALS_bc3e75cb79ae', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_bc3e75cb79ae', name='regParam', doc='regularization parameter (>= 0).'): 0.05},
 {Param(parent='ALS_bc3e75cb79ae', name='rank', doc='rank of the factorization'): 2,
  Param(parent='ALS_bc3e75cb79ae', name='maxIter', doc='max number of ite

In [23]:
cross_validator = CrossValidator(
    estimator=als,
    estimatorParamMaps=parameter_grid,
    evaluator=evaluator,
    numFolds=2,
)

cross_validator_model = cross_validator.fit(df_train)

In [24]:
predictions = cross_validator_model.transform(df_val)

In [25]:
rmse = evaluator.evaluate(predictions.na.drop())

In [26]:
rmse

0.9013639035785049

### The cross_validator_model has an attribute called .bestModel which reference to the best model found during cross validation tuning

In [27]:
best_model = cross_validator_model.bestModel

In [28]:
type(best_model)

pyspark.ml.recommendation.ALSModel

In [29]:
predictions = best_model.transform(df_val)

In [30]:
rmse = evaluator.evaluate(predictions.dropna())

In [31]:
rmse

0.9013639035785049

In [32]:
spark.stop()