In [1]:
import os

import findspark

# Point findspark at the Spark installation before pyspark is imported.
# SPARK_HOME takes precedence when it is set, so the notebook is not tied to one
# machine's directory layout; the original hard-coded location is the fallback.
findspark.init(os.environ.get("SPARK_HOME", "/home/sak/spark-2.4.3-bin-hadoop2.7"))

import pyspark

from pyspark import SparkContext, SparkConf

# Single-threaded local master, application name "sajad".
conf = SparkConf().setAppName("sajad").setMaster("local")

# getOrCreate() rather than the bare constructor: re-running this cell in a live
# kernel would otherwise fail because a SparkContext is already active.
sc = SparkContext.getOrCreate(conf=conf)


In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the DataFrame / ML cells below.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# carried over from an example - confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row


In [5]:
# Seed shared by the train/test split and the ALS factor initialisation, so that
# re-running this cell on the same input is intended to give the same split,
# RMSE and recommendations instead of different numbers on every run.
SEED = 42

# Load the ratings file: each line is split on "::" into
# userId, movieId, rating, timestamp and converted to a typed Row.
lines = spark.read.text("sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(
    lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=int(p[3]))
)
ratings = spark.createDataFrame(ratingsRDD)

# 80/20 train/test split (seeded, see SEED above).
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data.
# Cold start strategy is set to 'drop' to ensure we don't get NaN evaluation metrics.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=SEED,
)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

# Display the four recommendation tables (show() prints the first 20 rows).
userRecs.show()
movieRecs.show()
userSubsetRecs.show()
movieSubSetRecs.show()

    

Root-mean-square error = 1.6638709815338888
+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[[34, 6.2745895],...|
|    26|[[94, 5.3556576],...|
|    27|[[32, 4.5518517],...|
|    12|[[19, 5.7531867],...|
|    22|[[44, 5.652228], ...|
|     1|[[52, 4.6947684],...|
|    13|[[93, 3.8497524],...|
|     6|[[25, 4.4996967],...|
|    16|[[51, 4.749304], ...|
|     3|[[62, 5.261743], ...|
|    20|[[90, 6.108018], ...|
|     5|[[55, 4.80991], [...|
|    19|[[32, 3.8459191],...|
|    15|[[46, 4.9723687],...|
|    17|[[46, 5.135776], ...|
|     9|[[49, 5.1126733],...|
|     4|[[52, 3.8264365],...|
|     8|[[52, 5.0733185],...|
|    23|[[90, 6.6885557],...|
|     7|[[25, 5.151322], ...|
+------+--------------------+
only showing top 20 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[12, 3.7749054],...|
|     85|[[8, 4.8402247], ...|
|     65|[[23, 4.866437], ...|
|     53|[[8, 4.883639],