In [1]:
# Switch off Jedi in IPython's tab completer for this kernel session.
%config Completer.use_jedi = False

from pyspark.sql import SparkSession
import numpy
import pandas

import os
# PySpark reads these two variables to decide which Python executable to
# launch; both are pointed at the same interpreter path.
os.environ.update(
    dict.fromkeys(
        ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'),
        '/var/www/py_spark_ccf/PY_SPARK_CCF_ENV/bin/python3',
    )
)
# Displayed as the cell output: relative data paths used later in the
# notebook (e.g. '../data/...') resolve against this directory.
os.getcwd()

'/var/www/py_spark_ccf/notebooks'

In [2]:
# Connect to the standalone Spark master and create the session for this
# notebook, or reuse one that already exists.
spark_session = (
    SparkSession.builder
    .master("spark://costrategix-pc:7077")
    .appName('movie_reccomendation_system')
    .getOrCreate()
)

In [3]:
# List every (key, value) pair of the active SparkContext configuration.
spark_session.sparkContext.getConf().getAll()

[('spark.driver.memory', '18g'),
 ('spark.app.startTime', '1617509746582'),
 ('spark.app.name', 'movie_reccomendation_system'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '39077'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'spark://costrategix-pc:7077'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'app-20210404094547-0002'),
 ('spark.driver.host', '192.168.0.7'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
# Load the ratings CSV with an explicit schema instead of inferSchema=True.
# Schema inference needs an extra full pass over the file, which is costly
# for a file of this size; the declared types are the ones inference
# produced for this data (int, int, double, int).
# Values that do not parse as the declared type are read as null.
ratings_data_frame = spark_session.read.csv(
    '../data/ratings.csv',
    schema='userId INT, movieId INT, rating DOUBLE, timestamp INT',
    header=True,
)

In [5]:
# Number of rows in the ratings frame as loaded from the CSV.
ratings_data_frame.count()

26024289

In [6]:
# Print the column names, types and nullability of the ratings frame.
ratings_data_frame.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [7]:
# Preview the first five ratings, one field per line.
ratings_data_frame.show(5, vertical=True)

-RECORD 0---------------
 userId    | 1          
 movieId   | 110        
 rating    | 1.0        
 timestamp | 1425941529 
-RECORD 1---------------
 userId    | 1          
 movieId   | 147        
 rating    | 4.5        
 timestamp | 1425942435 
-RECORD 2---------------
 userId    | 1          
 movieId   | 858        
 rating    | 5.0        
 timestamp | 1425941523 
-RECORD 3---------------
 userId    | 1          
 movieId   | 1221       
 rating    | 5.0        
 timestamp | 1425941546 
-RECORD 4---------------
 userId    | 1          
 movieId   | 1246       
 rating    | 5.0        
 timestamp | 1425941556 
only showing top 5 rows



In [8]:
# Discard every row that has a null in any column, then re-count to see
# how many rows were removed.
ratings_data_frame = ratings_data_frame.na.drop()
ratings_data_frame.count()

26024289

In [9]:
# Number of distinct users that appear in the ratings.
ratings_data_frame.select('userId').dropDuplicates().count()

270896

In [10]:
# Number of distinct movies that appear in the ratings.
ratings_data_frame.select('movieId').dropDuplicates().count()

45115

In [11]:
# Register the ratings frame, as it stands at this point, under the SQL
# view name "table1" so it can be queried with spark_session.sql.
ratings_data_frame.createOrReplaceTempView("table1")
# Count the movies that have more than 10000 rating rows in that view.
spark_session.sql("""
select movieId from table1
group by movieId having count(*) > 10000;
""").count()

636

In [12]:
# Fetch to the driver the ids of the movies with more than 10000 rating rows
# in the "table1" view.
movie_row_list = spark_session.sql("""
select movieId from table1
group by movieId having count(*) > 10000;
""").collect()

# Unwrap the Row objects into a plain Python list of movie ids.
movie_list = [record.movieId for record in movie_row_list]

In [13]:
# Keep only the ratings whose movie id is in movie_list.
ratings_data_frame = ratings_data_frame.where(
    ratings_data_frame.movieId.isin(movie_list)
)

In [14]:
# Rows remaining after restricting the ratings to the movies in movie_list.
ratings_data_frame.count()

13202956

In [15]:
# Count the users with more than 1000 rating rows in the "table1" view.
# NOTE: no createOrReplaceTempView call appears between the movie filter and
# this query, so "table1" is still the view registered before that filter;
# the per-user counts here are therefore over the ratings as they were at
# registration time, not over the movie-filtered frame.
spark_session.sql("""
select userId from table1
group by userId having count(*) > 1000;
""").count()

2502

In [16]:
# Fetch to the driver the ids of the users with more than 1000 rating rows
# in the "table1" view. The view is queried as it was registered; rebinding
# the Python name ratings_data_frame afterwards does not change it.
user_row_list = spark_session.sql("""
select userId from table1
group by userId having count(*) > 1000;
""").collect()

# Unwrap the Row objects into a plain Python list of user ids.
user_list = [record.userId for record in user_row_list]

In [17]:
# Keep only the ratings whose user id is in user_list.
ratings_data_frame = ratings_data_frame.where(
    ratings_data_frame.userId.isin(user_list)
)

In [18]:
# Rows remaining after also restricting the ratings to the users in user_list.
ratings_data_frame.count()

995218

In [19]:
# 70/30 train/test split. The seed is fixed so that a Restart & Run All
# reproduces the same split, and with it the same test rows and metrics;
# without it every run samples a different partition of the ratings.
# Outputs captured from an earlier unseeded run will not match exactly.
train_data, test_data = ratings_data_frame.randomSplit([0.7, 0.3], seed=42)

In [20]:
from pyspark.ml.recommendation import ALS

# ALS collaborative-filtering estimator over (userId, movieId, rating).
# - coldStartStrategy="drop": a random split can put a user or movie in the
#   test set that never occurs in the training set; with the default ("nan")
#   such rows get a NaN prediction, which turns RMSE / R^2 / MAE into NaN.
#   Dropping them keeps the evaluation metrics well defined.
# - seed: fixes the random initialisation of the factor matrices so that
#   refitting on the same data gives the same model.
model = ALS(
    maxIter=10,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [21]:
# Fit ALS on the training split.
# NOTE(review): this rebinds `model` from the ALS estimator to the fitted
# result, so running this cell a second time without re-running the cell that
# constructs ALS would call .fit on the fitted model — presumably an
# AttributeError; consider a separate name for the estimator.
model = model.fit(train_data)

In [22]:
# Look at one held-out row (returned as a one-element list of Row).
test_data.take(1)

[Row(userId=229, movieId=2, rating=3.0, timestamp=1037827385)]

In [23]:
# Held-out ratings of one example user (id 229, hard-coded).
test_user_data = test_data.where(test_data.userId == 229)

In [24]:
# Bring all held-out ratings of the selected user to the driver for display.
test_user_data.collect()

[Row(userId=229, movieId=2, rating=3.0, timestamp=1037827385),
 Row(userId=229, movieId=16, rating=3.0, timestamp=1037829658),
 Row(userId=229, movieId=47, rating=2.0, timestamp=1037136477),
 Row(userId=229, movieId=50, rating=4.0, timestamp=1037825980),
 Row(userId=229, movieId=104, rating=2.0, timestamp=1037827898),
 Row(userId=229, movieId=172, rating=3.0, timestamp=1039723754),
 Row(userId=229, movieId=173, rating=2.0, timestamp=1037831665),
 Row(userId=229, movieId=185, rating=2.0, timestamp=1037828727),
 Row(userId=229, movieId=231, rating=2.0, timestamp=1037037105),
 Row(userId=229, movieId=235, rating=3.0, timestamp=1037829509),
 Row(userId=229, movieId=253, rating=2.0, timestamp=1037827711),
 Row(userId=229, movieId=260, rating=3.0, timestamp=1037128254),
 Row(userId=229, movieId=277, rating=1.0, timestamp=1037831280),
 Row(userId=229, movieId=292, rating=3.0, timestamp=1037828596),
 Row(userId=229, movieId=293, rating=5.0, timestamp=1037826148),
 Row(userId=229, movieId=296, 

In [25]:
# Keep only the two id columns; the actual rating is left out of the frame
# that will be passed to the model for scoring.
single_user = test_user_data.select('movieId', 'userId')

In [26]:
# Score the selected user's (movieId, userId) pairs with the fitted model,
# then list the predictions in ascending movieId order.
reccomendations = model.transform(single_user)
reccomendations.sort('movieId').collect()

[Row(movieId=2, userId=229, prediction=1.9873344898223877),
 Row(movieId=16, userId=229, prediction=2.994056463241577),
 Row(movieId=47, userId=229, prediction=3.1423003673553467),
 Row(movieId=50, userId=229, prediction=3.239253044128418),
 Row(movieId=104, userId=229, prediction=2.208995819091797),
 Row(movieId=172, userId=229, prediction=1.7231072187423706),
 Row(movieId=173, userId=229, prediction=1.449049949645996),
 Row(movieId=185, userId=229, prediction=1.670841932296753),
 Row(movieId=231, userId=229, prediction=2.25760555267334),
 Row(movieId=235, userId=229, prediction=3.266620397567749),
 Row(movieId=253, userId=229, prediction=2.5180413722991943),
 Row(movieId=260, userId=229, prediction=3.2347428798675537),
 Row(movieId=277, userId=229, prediction=1.9386329650878906),
 Row(movieId=292, userId=229, prediction=2.044633150100708),
 Row(movieId=293, userId=229, prediction=3.1543996334075928),
 Row(movieId=296, userId=229, prediction=3.5510425567626953),
 Row(movieId=300, user

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
# Number of rows in the held-out split.
test_data.count()

298859

In [29]:
# Score every held-out row; the result carries the original columns plus
# the model's `prediction` column.
test_results = model.transform(test_data)

In [30]:
# Inspect a handful of scored rows (actual rating next to the prediction).
test_results.take(5)

[Row(userId=5300, movieId=471, rating=4.5, timestamp=1181189253, prediction=3.4351301193237305),
 Row(userId=156296, movieId=471, rating=4.0, timestamp=1498958617, prediction=3.395777702331543),
 Row(userId=193640, movieId=471, rating=4.0, timestamp=1323546219, prediction=3.223991632461548),
 Row(userId=7417, movieId=471, rating=4.0, timestamp=1077029821, prediction=3.284681797027588),
 Row(userId=82966, movieId=471, rating=4.0, timestamp=1113240514, prediction=3.6921091079711914)]

In [31]:
# Evaluator comparing the `prediction` column against the true `rating`.
# No metricName is passed, so the evaluator's default metric is what gets
# reported under the 'RMSE' label below.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
)
print('RMSE')
evaluator.evaluate(test_results)

RMSE


0.7862512331487455

In [32]:
# Same evaluator, with the metric overridden to R^2 for this call only
# (the override is passed as a param map, the evaluator itself is unchanged).
print('R_sqr')
evaluator.evaluate(test_results, params={evaluator.metricName: "r2"})

R_sqr


0.39399185926999203

In [33]:
# Same evaluator, with the metric overridden to mean absolute error for this
# call only (passed as a param map, the evaluator itself is unchanged).
print('MAE')
evaluator.evaluate(test_results, params={evaluator.metricName: "mae"})

MAE


0.614926330628965

In [34]:
# Summary statistics of the held-out ratings, for putting the error metrics
# above in context (e.g. RMSE versus the ratings' standard deviation).
test_data.describe('rating').show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|            298859|
|   mean|3.4966271720108812|
| stddev|1.0100034030299874|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+

