In [None]:
# IPython magic: set the Completer's `use_jedi` option to False for this kernel.
%config Completer.use_jedi = False

from pyspark.sql import SparkSession
import numpy
import pandas

import os

# Both PySpark interpreter variables are pointed at the same virtualenv binary.
venv_python = '/var/www/py_spark_ccf/PY_SPARK_CCF_ENV/bin/python3'
for env_key in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_key] = venv_python

# Last expression of the cell: shows the kernel's working directory, which the
# relative data path used later in the notebook is resolved against.
os.getcwd()

In [None]:
spark_session = SparkSession.builder.master("spark://costrategix-pc:7077")\
    .appName('movie_reccomendation_system').getOrCreate()

In [None]:
spark_session.sparkContext.getConf().getAll()

In [None]:
ratings_data_frame = spark_session.read.csv('../data/ratings.csv', inferSchema=True, header=True)

In [None]:
ratings_data_frame.count()

In [None]:
ratings_data_frame.printSchema()

In [None]:
ratings_data_frame.show(vertical=True, n=5)

In [None]:
ratings_data_frame = ratings_data_frame.dropna()
ratings_data_frame.count()

In [None]:
ratings_data_frame.select('userId').distinct().count()

In [None]:
ratings_data_frame.select('movieId').distinct().count()

In [None]:
ratings_data_frame.createOrReplaceTempView("table1")
spark_session.sql("""
select movieId from table1
group by movieId having count(*) > 10000;
""").count()

In [None]:
movie_row_list = spark_session.sql("""
select movieId from table1
group by movieId having count(*) > 10000;
""").collect()

movie_list = [row['movieId'] for row in movie_row_list]

In [None]:
ratings_data_frame = ratings_data_frame.filter(ratings_data_frame['movieId'].isin(movie_list))

In [None]:
ratings_data_frame.count()

In [None]:
spark_session.sql("""
select userId from table1
group by userId having count(*) > 1000;
""").count()

In [None]:
user_row_list = spark_session.sql("""
select userId from table1
group by userId having count(*) > 1000;
""").collect()

user_list = [row['userId'] for row in user_row_list]

In [None]:
ratings_data_frame = ratings_data_frame.filter(ratings_data_frame['userId'].isin(user_list))

In [None]:
ratings_data_frame.count()

In [None]:
# 70/30 train/test split. A fixed seed is passed so that the split (and every
# metric computed from it further down) does not change on each
# Restart Kernel -> Run All; without it randomSplit picks a random seed per call.
train_data, test_data = ratings_data_frame.randomSplit([0.7, 0.3], seed=42)

In [None]:
from pyspark.ml.recommendation import ALS
model = ALS(maxIter=10, userCol="userId", itemCol="movieId", ratingCol="rating")

In [None]:
model = model.fit(train_data)

In [None]:
test_data.head(1)

In [None]:
test_user_data = test_data.filter(test_data['userId'] == 229)

In [None]:
test_user_data.collect()

In [None]:
single_user = test_user_data.select(['movieId','userId'])

In [None]:
reccomendations = model.transform(single_user)
reccomendations.orderBy('movieId').collect()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
test_data.count()

In [None]:
test_results = model.transform(test_data)

In [None]:
test_results.head(5)

In [None]:
evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction')
print('RMSE')
evaluator.evaluate(test_results)

In [None]:
print('R_sqr')
evaluator.evaluate(test_results, {evaluator.metricName: "r2"})

In [None]:
print('MAE')
evaluator.evaluate(test_results, {evaluator.metricName: "mae"})

In [None]:
# Summary statistics of the held-out ratings, as context for the error metrics.
held_out_ratings = test_data.select('rating')
held_out_ratings.describe().show()