In [1]:
import findspark
findspark.init()
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from dotenv import load_dotenv

load_dotenv()

# Create a Spark session
spark = SparkSession.builder.appName("MovieRatings").getOrCreate()

def get_rating_data():
    schema = StructType([
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", IntegerType(), True)
    ])
    data = spark.read.csv(os.getenv('BASE_PROJECT_PATH') + 'data/u.data', sep='\t', schema=schema, header=False)
    return data

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Load the data
data = get_rating_data()

# Split the data into training and testing sets
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

# Build the ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

# Define a parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [20, 30, 40]) \
    .addGrid(als.maxIter, [10, 15, 20]) \
    .addGrid(als.regParam, [0.05, 0.1, 0.2]) \
    .addGrid(als.alpha, [1.0, 5.0, 10.0]) \
    .build()

# Define an evaluators 
evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
evaluator_mse = RegressionEvaluator(metricName="mse", labelCol="rating", predictionCol="prediction")
evaluator_accuracy = RegressionEvaluator(metricName="accuracy", labelCol="rating", predictionCol="prediction")

# Use TrainValidationSplit to choose the best combination of parameters
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=param_grid,
                           evaluator=evaluator_rmse,
                           trainRatio=0.8)

# Train the model
model = tvs.fit(training)

# Make predictions on the test set
predictions = model.transform(test)

# Evaluate the model
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
mse = evaluator_mse.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data =", rmse)
print("R Squared (R2) on test data =", r2)
print("Mean Squared Error (MSE) on test data =", mse)

Root Mean Squared Error (RMSE) on test data = 0.9157990063912332
R Squared (R2) on test data = 0.33969955154459863
Mean Squared Error (MSE) on test data = 0.8386878201071701


In [5]:
# Save the model to a new path
model.bestModel.write().save(os.getenv('BASE_PROJECT_PATH') + 'recommendation/best_model_als')

In [6]:
# load the model
from pyspark.ml.recommendation import ALSModel

model1 = ALSModel.load(os.getenv('BASE_PROJECT_PATH') + 'recommendation/best_model_als')

# Make recommendations for users
userRecs = model1.recommendForAllUsers(10)

print(userRecs.show(10, False))



+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                      |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{114, 5.0185556}, {408, 4.9068847}, {169, 4.83205}, {1449, 4.8184934}, {285, 4.8016534}, {50, 4.7903686}, {647, 4.7859}, {223, 4.7706037}, {963, 4.738607}, {48, 4.724719}]         |
|2     |[{1643, 4.866243}, {1449, 4.8260517}, {483, 4.7573133}, {694, 4.691554}, {127, 4.680705}, {1194, 4.6368113}, {169, 4.6052384}, {285, 4.5930123}, {50, 4.5888834}, {178, 4.5639105}]  |
|3     |[{320, 4.6748524}, {340, 4.049389}, {