In [19]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session; getOrCreate() reuses one if it already exists.
sp = (
    SparkSession.builder
    .appName('a')
    .getOrCreate()
)

# Read movies.json into a DataFrame and show the column schema Spark derived for it.
data = sp.read.json('movies.json')
data.printSchema()

[Stage 1374:===>                                                  (1 + 13) / 14]

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)



                                                                                

In [21]:
import pyspark.sql.functions as F
from pyspark.sql.types import LongType

# Size of the integer id space that hashed string ids are folded into.
# NOTE(review): distinct strings can collide in this space; if exact identity
# matters, pyspark.ml.feature.StringIndexer is an alternative — confirm against
# the number of distinct users/products in the dataset.
ID_MODULUS = 10 ** 8


def _hex_digest_to_id(hex_digest):
    """Fold a hexadecimal digest string into an integer in [0, ID_MODULUS).

    Args:
        hex_digest: hexadecimal string (here, the output of F.sha1), or None.

    Returns:
        The digest interpreted as a base-16 integer, modulo ID_MODULUS, or
        None when the digest is missing. F.sha1 yields null for a null input,
        and int(None, 16) would otherwise fail the whole job with a TypeError.
    """
    if hex_digest is None:
        return None
    return int(hex_digest, 16) % ID_MODULUS


getHash = F.udf(_hex_digest_to_id, LongType())

# Keep only the three columns used from here on.
data = data.select('product_id', 'user_id', 'score')

# Replace the string ids with numeric hashes and cast the score to int.
ndata = (
    data
    .withColumn('product_id', getHash(F.sha1(F.col('product_id').cast('string'))))
    .withColumn('user_id', getHash(F.sha1(F.col('user_id').cast('string'))))
    .withColumn('score', F.col('score').cast('int'))
)

# Fixed seed so the 90/10 split is reproducible across kernel restarts.
train, test = ndata.randomSplit([0.9, 0.1], seed=42)
train.cache()
test.cache()

DataFrame[product_id: bigint, user_id: bigint, score: int]

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# coldStartStrategy='drop' discards predictions for users/items that were not
# present when the model was fitted. With the default ('nan') such rows get a
# NaN prediction, which turns the RMSE of the whole split into NaN.
als = ALS(
    userCol='user_id',
    itemCol='product_id',
    ratingCol='score',
    coldStartStrategy='drop',
)

# Baseline: fit with the estimator's default hyperparameters and produce
# predictions for the held-out split.
model = als.fit(train)
pred = model.transform(test)

# RMSE between the 'score' label and the model's 'prediction' column.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='score',
    predictionCol='prediction',
)

# Hyperparameter grid. Every addGrid call must be chained with '.'; the
# previous ', addGrid(...)' built a tuple and raised NameError on 'addGrid'.
# NOTE(review): per the Spark docs, alpha applies to the implicit-feedback
# variant of ALS; implicitPrefs is not set here, so this entry is likely a no-op.
param = (
    ParamGridBuilder()
    .addGrid(als.rank, [1, 5])
    .addGrid(als.alpha, [1])
    .addGrid(als.regParam, [0.05])
    .addGrid(als.maxIter, [10])
    .build()
)
cross = CrossValidator(estimator = als, evaluator = evaluator, estimatorParamMaps = param, numFolds