In [1]:
from pyspark.sql import SparkSession  
# Create (or reuse) the Spark session shared by every cell below.
#
# * The master URL is set with `.master(...)`, which writes the `spark.master`
#   property. A bare "master" config key is not the property Spark reads for
#   the master URL, so the previous `.config("master", ...)` did not select it.
# * `spark.driver.memory` is set once; the former second, identical line was
#   redundant.
# NOTE(review): driver memory is presumably only honoured when this call is the
# one that launches the JVM (fresh kernel) - confirm if a session already exists.
spark = (
    SparkSession.builder
    .appName("recommendation")
    .master("local[*]")
    .config("spark.driver.memory", "5G")
    .getOrCreate()
)


In [2]:
# Load the ratings CSV (header row, inferred column types) and rename the id
# columns to "user" / "item" for the recommender fitted further down.
book_rating = (
    spark.read
    .csv("ratings.csv", header=True, inferSchema=True)
    .withColumnRenamed("user_id", "user")
    .withColumnRenamed("book_id", "item")
)

In [3]:
# 70/30 train/test split of the ratings; the fixed seed keeps the split stable.
train, test = book_rating.randomSplit(weights=[0.7, 0.3], seed=12345)

In [4]:
# Show the first ten ratings as a pandas frame (rich notebook display).
(
    book_rating
    .limit(10)
    .toPandas()
)

Unnamed: 0,item,user,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
5,1,2077,4
6,1,2487,4
7,1,2900,5
8,1,3662,4
9,1,3922,5


In [5]:
from pyspark.ml.recommendation import ALS
# Fit an ALS matrix-factorisation recommender on the training split.
#
# * userCol / itemCol / ratingCol are spelled out; they equal the ALS defaults,
#   so the fitted model is unchanged, but the link to the renamed columns of
#   `book_rating` is now explicit.
# * coldStartStrategy="drop" discards predictions for users/items unseen during
#   fitting, which would otherwise be NaN and break the evaluation metric.
# * seed is fixed: without it PySpark falls back to a default derived from
#   Python's hash(), which can differ between kernel runs, so the reported
#   scores would not be reproducible.
als = ALS(
    userCol="user",
    itemCol="item",
    ratingCol="rating",
    rank=100,
    maxIter=5,
    regParam=0.09,
    nonnegative=True,
    coldStartStrategy="drop",
    seed=12345,
)
model = als.fit(train)

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
# R^2 between the observed "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="rating",
    predictionCol="prediction",
)

In [7]:
# Score the held-out split with the baseline model and report its R^2
# (the bare last expression is what the notebook displays).
predictions = model.transform(test)
evaluator.evaluate(predictions)

0.19279127574383292

In [8]:
# Top-1 recommendation per user; only ten rows are pulled to pandas for display.
(
    model
    .recommendForAllUsers(1)
    .limit(10)
    .toPandas()
)

Unnamed: 0,user,recommendations
0,148,"[(7639, 4.965629577636719)]"
1,463,"[(2975, 4.518317699432373)]"
2,471,"[(8946, 3.1113173961639404)]"
3,496,"[(7639, 5.565012454986572)]"
4,833,"[(8946, 5.095363140106201)]"
5,1088,"[(8831, 4.919075012207031)]"
6,1342,"[(9553, 5.1459197998046875)]"
7,1580,"[(8831, 5.183160781860352)]"
8,1645,"[(9553, 4.6910176277160645)]"
9,1829,"[(7639, 4.932900428771973)]"


In [9]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Hyper-parameter search for the ALS estimator defined earlier.
# Grid: 2 regParam x 2 maxIter x 3 rank = 12 candidate settings, each fitted
# once per fold, so this cell is expensive to run.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.regParam, [0.1, 0.09])
    .addGrid(als.maxIter, [10, 12])
    .addGrid(als.rank, [20, 50, 100])
    .build()
)

# Reuse the notebook's existing `evaluator` (R^2 on rating vs prediction)
# instead of building a second, identical one: model selection and the final
# test score are then guaranteed to use the same metric.
# The seed pins the fold assignment so the selected model is reproducible.
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=10,
    seed=12345,
)

cvModel = crossval.fit(train)



In [10]:
# Keep a handle on the model that cross-validation selected as best.
bestModel = cvModel.bestModel

In [11]:
# Score the held-out split with the cross-validated model and print its R^2.
predictions = bestModel.transform(test)
best_r2 = evaluator.evaluate(predictions)
print("R2 for the best model using ALS {}".format(best_r2))

R2 for the best model using ALS 0.20448434720694908
