In [1]:
from pyspark.sql import SparkSession, functions as f
from pyspark.ml.recommendation import ALS

In [2]:
# Create (or reuse) the notebook's Spark session, running locally via the
# "local[*]" master and registered under the application name "Hands-on-1".
spark = SparkSession.builder.master("local[*]").appName("Hands-on-1").getOrCreate()

In [3]:
# Load the ratings CSV from the "ml-latest-small" data set. The small variant
# is used to keep memory usage low (author's machine: 32 GB RAM, 16 cores).
# The explicit schema keeps only userId, movieId and rating; the file's
# timestamp column is intentionally left out because it is not needed.
df_ratings = (
    spark.read
    .schema("userId INT, movieId INT, rating DOUBLE")
    .options(encoding="UTF-8", header=True, sep=",", quote='"')
    .csv("../../data-sets/ml-latest-small/ratings.csv")
    .cache()  # cached so the later actions on this DataFrame reuse the parsed data
)

In [4]:
# Peek at the first five rows without truncating cell values, then print the
# schema that was applied when reading the CSV.
df_ratings.show(5, False)
df_ratings.printSchema()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|1     |1      |4.0   |
|1     |3      |4.0   |
|1     |6      |4.0   |
|1     |47     |5.0   |
|1     |50     |5.0   |
+------+-------+------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



### Let's look at summary statistics for the ratings DataFrame

In [5]:
# Per-column descriptive statistics: count, mean, stddev, min, the 25%/50%/75%
# percentiles and max (see the table printed below).
df_ratings.summary().show()

+-------+------------------+----------------+------------------+
|summary|            userId|         movieId|            rating|
+-------+------------------+----------------+------------------+
|  count|            100836|          100836|            100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|
|    min|                 1|               1|               0.5|
|    25%|               177|            1199|               3.0|
|    50%|               325|            2991|               3.5|
|    75%|               477|            8092|               4.0|
|    max|               610|          193609|               5.0|
+-------+------------------+----------------+------------------+



```python
class pyspark.ml.recommendation.ALS(
    rank=10,
    maxIter=10,
    regParam=0.1,
    numUserBlocks=10,
    numItemBlocks=10,
    implicitPrefs=False,
    alpha=1.0,
    userCol='user',
    itemCol='item',
    seed=None,
    ratingCol='rating',
    nonnegative=False,
    checkpointInterval=10,
    intermediateStorageLevel='MEMORY_AND_DISK',
    finalStorageLevel='MEMORY_AND_DISK',
    coldStartStrategy='nan',
    blockSize=4096,
)
```

### Let's instantiate the ALS estimator and fit our model

In [6]:
# Fit an ALS collaborative-filtering model on the full ratings DataFrame,
# mapping the estimator's user/item/rating columns onto this data set's names.
# ALS starts from randomly initialised factors, so a fixed seed is supplied to
# make "Restart Kernel -> Run All" reproduce the same model and predictions
# (with the default seed=None the results can differ between runs).
model = (
    ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        seed=42,
    )
    .fit(df_ratings)
)

In [7]:
# Inspect what `.fit` returned: a fitted ALSModel rather than the ALS estimator.
type(model)

pyspark.ml.recommendation.ALSModel

### Making predictions on `df_ratings` using the `.transform` method of the class
```python
pyspark.ml.recommendation.ALSModel
```

In [9]:
# Score the same ratings the model was fitted on; `transform` returns the input
# columns plus a `prediction` column.
# NOTE(review): these are in-sample predictions, so they do not show how well
# the model generalises to unseen ratings.
predictions = model.transform(df_ratings)

In [12]:
# `predictions` is a DataFrame: the original userId, movieId and rating columns
# plus a float `prediction` column added by the model.
predictions

DataFrame[userId: int, movieId: int, rating: double, prediction: float]

In [13]:
# Display the first 30 predictions next to the actual ratings, untruncated.
predictions.show(30, False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|191   |148    |5.0   |4.9106784 |
|133   |471    |4.0   |3.1113226 |
|597   |471    |2.0   |3.9401305 |
|385   |471    |4.0   |3.0269897 |
|436   |471    |3.0   |3.4239593 |
|602   |471    |4.0   |3.4478495 |
|91    |471    |1.0   |2.5758777 |
|409   |471    |3.0   |3.6621046 |
|372   |471    |3.0   |2.969416  |
|599   |471    |2.5   |2.6618292 |
|603   |471    |4.0   |3.5522776 |
|182   |471    |4.5   |3.7826753 |
|218   |471    |4.0   |3.3839726 |
|474   |471    |3.0   |3.478273  |
|500   |471    |1.0   |2.1239042 |
|57    |471    |3.0   |3.651469  |
|462   |471    |2.5   |3.3151805 |
|387   |471    |3.0   |2.952231  |
|610   |471    |4.0   |3.4498801 |
|217   |471    |2.0   |2.9447467 |
|555   |471    |3.0   |3.1891325 |
|176   |471    |5.0   |3.9437947 |
|520   |471    |5.0   |3.7705052 |
|136   |471    |4.0   |3.7829053 |
|171   |471    |3.0   |3.9314106 |
|273   |471    |5.0 

In [14]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()