In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext,SparkConf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import *
from pyspark.sql.functions import *

In [2]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Load the ratings file. The first row is used as the header and column types
# are inferred from the data. Note: despite its name, `rdd` holds the object
# returned by spark.read.csv (a DataFrame), not a raw RDD.
rdd = spark.read.csv(
    path="ratings.csv",
    header='True',
    inferSchema='True',
)

In [3]:
# Preview the loaded ratings (first 20 rows, which is also show()'s default).
rdd.show(n=20)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [4]:
# Split the ratings into roughly 80% training and 20% test rows.
# A fixed seed keeps the split repeatable across kernel restarts, so the RMSE
# and recommendations computed further down can be reproduced.
# NOTE(review): repeatability also assumes the input rows are read in a stable
# order -- confirm if the source file is ever partitioned differently.
# The (misspelt) name `traning` is kept because the model-fitting cell uses it.
traning, test = rdd.randomSplit([0.8, 0.2], seed=42)

In [5]:
# Configure the ALS collaborative-filtering estimator on the
# (userId, movieId, rating) columns.
# - coldStartStrategy="drop" removes rows whose prediction would be NaN (users or
#   movies unseen during training), so the evaluation metric stays well-defined.
# - seed fixes the random initialisation of the factor matrices; without it every
#   run produces a different model even on an identical training split.
# NOTE(review): regParam=0.01 is fairly weak regularisation; consider tuning it
# if predicted ratings drift outside the rating scale.
als = ALS(
    maxIter=20,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(traning)

In [6]:
# Score the held-out ratings with the fitted model; this adds a "prediction" column.
predictions = model.transform(test)

# Root-mean-square error between the true "rating" and the model's "prediction".
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"root-mean-square error: {rmse}")

                                                                                

root-mean-square error: 1.125133496097397


In [7]:
# Ask the fitted model for the 5 highest-scoring movies for every user.
recommends = model.recommendForAllUsers(numItems=5)

In [8]:
# Show each user's recommendation list alongside the number of entries in it
# (`size` here is the array-length column function from pyspark.sql.functions).
(
    recommends
    .select(
        "userId",
        "recommendations",
        size("recommendations").alias("size"),
    )
    .show()
)



+------+--------------------+----+
|userId|     recommendations|size|
+------+--------------------+----+
|     1|[{56145, 7.753439...|   5|
|     2|[{2879, 8.573591}...|   5|
|     3|[{6857, 6.910298}...|   5|
|     4|[{102903, 9.76545...|   5|
|     5|[{52435, 9.968112...|   5|
|     6|[{3036, 7.4586525...|   5|
|     7|[{1194, 9.910393}...|   5|
|     8|[{61240, 6.889530...|   5|
|     9|[{179819, 8.71759...|   5|
|    10|[{3089, 7.846598}...|   5|
|    11|[{103141, 7.21155...|   5|
|    12|[{37384, 8.921238...|   5|
|    13|[{95182, 9.114754...|   5|
|    14|[{5784, 8.107646}...|   5|
|    15|[{37727, 8.379441...|   5|
|    16|[{2517, 6.7077703...|   5|
|    17|[{1957, 6.042229}...|   5|
|    18|[{1217, 4.9526772...|   5|
|    19|[{7360, 5.2648497...|   5|
|    20|[{3477, 7.593833}...|   5|
+------+--------------------+----+
only showing top 20 rows



                                                                                

In [9]:
# Pull one full row to the driver to inspect the untruncated recommendation structs.
recommends.take(num=1)

                                                                                

[Row(userId=1, recommendations=[Row(movieId=56145, rating=7.753439426422119), Row(movieId=1262, rating=6.679389476776123), Row(movieId=1203, rating=6.51816463470459), Row(movieId=3089, rating=6.423007011413574), Row(movieId=599, rating=6.377112865447998)])]