In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 60.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=4f1ecafe8708b2c012cb6d56ae803b7dbae9a0d353b298d00e48e904afa25649
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [34]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rec').getOrCreate()

In [35]:
data = spark.read.csv('/content/movielens_ratings.csv', inferSchema=True, header=True)

In [36]:
data.printSchema()
data.show(10)

root
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- userId: integer (nullable = true)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
|      9|   4.0|     0|
|     11|   1.0|     0|
|     12|   2.0|     0|
|     15|   1.0|     0|
|     17|   1.0|     0|
|     19|   1.0|     0|
|     21|   1.0|     0|
+-------+------+------+
only showing top 10 rows



In [37]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

train, test = data.randomSplit([0.7,0.3])
als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating')

model = als.fit(train)

In [38]:
predictions = model.transform(test)
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|      1|   1.0|     3|-0.14773405|
|      1|   1.0|     4| -0.2824202|
|      1|   1.0|     5| -0.5702088|
|      1|   1.0|     6| 0.74194396|
|      1|   3.0|    25|  2.2893977|
|      3|   1.0|     1| 0.90007734|
|      3|   2.0|     8|  2.7005637|
|      3|   1.0|     9| 0.33832836|
|      3|   1.0|    13| 0.44174924|
|      3|   1.0|    21|  1.8836802|
|      3|   1.0|    29|-0.17699172|
|      2|   4.0|    10|  2.6191254|
|      2|   2.0|    20|  1.9174312|
|      2|   1.0|    25|  0.8812808|
|      0|   1.0|     3|  1.2546744|
|      0|   1.0|     5|-0.51088625|
|      0|   1.0|     8|  4.6085067|
|      0|   1.0|    19| 0.15383244|
|      0|   1.0|    21|  2.3582363|
|      0|   1.0|    22|  1.3118157|
+-------+------+------+-----------+
only showing top 20 rows



In [40]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse}")

RMSE: 1.873044425633317


In [41]:
single_user = test.filter(test['userId']==11).select(['movieId','userId'])
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|      9|    11|
|     11|    11|
|     12|    11|
|     18|    11|
|     22|    11|
|     32|    11|
|     37|    11|
|     40|    11|
|     43|    11|
|     47|    11|
|     48|    11|
|     51|    11|
|     67|    11|
|     70|    11|
|     75|    11|
|     78|    11|
|     89|    11|
+-------+------+



In [42]:
recommendations = model.transform(single_user)
recommendations.orderBy('prediction', ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|     40|    11|  4.042645|
|     37|    11| 3.9410903|
|     32|    11|  3.100253|
|     89|    11| 2.9255598|
|     12|    11| 2.8602285|
|     75|    11| 2.2314317|
|     22|    11| 2.0400825|
|     70|    11| 1.9973627|
|     18|    11| 1.8517807|
|      9|    11| 1.7544389|
|     48|    11| 1.4507219|
|     11|    11| 1.1938767|
|     67|    11| 1.1186584|
|     47|    11| 1.0907099|
|     51|    11|0.27295026|
|     78|    11|0.14152023|
|     43|    11|-0.8675617|
+-------+------+----------+

