In [15]:
import findspark
findspark.init()
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from dotenv import load_dotenv

load_dotenv()

def get_rating_data():
    spark.close()
    # Create a Spark session
    spark = SparkSession.builder.appName("MovieRatings").getOrCreate()
    schema = StructType([
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", IntegerType(), True)
    ])
    data = spark.read.csv(os.getenv('BASE_PROJECT_PATH') + 'data/u.data', sep='\t', schema=schema, header=False)
    return data

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Load the data
data = get_rating_data()

# Create a smaller subset for demonstration (you can skip this if you want to use the entire dataset)
data = data.limit(1000)

# Split the data into training and testing sets
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

# Build the ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

# Define a parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.maxIter, [5, 10, 15]) \
    .addGrid(als.regParam, [0.01, 0.1, 1.0]) \
    .build()

# Define an evaluators 
evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
evaluator_mse = RegressionEvaluator(metricName="mse", labelCol="rating", predictionCol="prediction")
evaluator_accuracy = RegressionEvaluator(metricName="accuracy", labelCol="rating", predictionCol="prediction")

# Use TrainValidationSplit to choose the best combination of parameters
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=param_grid,
                           evaluator=evaluator,
                           trainRatio=0.8)

# Train the model
model = tvs.fit(training)

# Make predictions on the test set
predictions = model.transform(test)

# Evaluate the model
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
mse = evaluator_mse.evaluate(predictions)
accuracy = evaluator_accuracy.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data =", rmse)
print("R Squared (R2) on test data =", r2)
print("Mean Squared Error (MSE) on test data =", mse)
print("Accuracy on test data =", accuracy)

UnboundLocalError: local variable 'spark' referenced before assignment

In [6]:

from pyspark.ml.recommendation import ALSModel

# Specify the path where the model is saved
local_model_path = "c:/Users/YouCode/Desktop/saad/2023-12-04_evaluation_JayZZ_movies_recommendation_spark_EK/Jay-Z_Entertainment_data_analysis_and_live_recommendation/recommendation/model"

# Load the saved ALS model
loaded_model = ALSModel.load(local_model_path)


Py4JJavaError: An error occurred while calling o55.load.
: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/c:/Users/YouCode/Desktop/saad/2023-12-04_evaluation_JayZZ_movies_recommendation_spark_EK/Jay-Z_Entertainment_data_analysis_and_live_recommendation/recommendation/model/metadata
	at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287)
	at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229)
	at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315)
	at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:205)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:300)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:296)
	at org.apache.spark.rdd.RDD.$anonfun$take$1(RDD.scala:1428)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1422)
	at org.apache.spark.rdd.RDD.$anonfun$first$1(RDD.scala:1463)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.first(RDD.scala:1463)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:587)
	at org.apache.spark.ml.recommendation.ALSModel$ALSModelReader.load(ALS.scala:553)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)


In [5]:
import os
print(os.getcwd())

c:\Users\YouCode\Desktop\saad\2023-12-04_evaluation_JayZZ_movies_recommendation_spark_EK\Jay-Z_Entertainment_data_analysis_and_live_recommendation\recommendation
