In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col
import matplotlib.pyplot as plt

def create_spark_session(app_name="MovieLensALS"):
    """Create and return a Spark session with optimized memory configuration"""
    return SparkSession.builder \
        .appName(app_name) \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .config("spark.memory.offHeap.enabled", "true") \
        .config("spark.memory.offHeap.size", "4g") \
        .config("spark.sql.shuffle.partitions", "100") \
        .config("spark.default.parallelism", "100") \
        .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
        .config("spark.executor.cores", "4") \
        .config("spark.driver.maxResultSize", "2g") \
        .config("spark.kryoserializer.buffer.max", "1024m") \
        .config("spark.rdd.compress", "true") \
        .config("spark.shuffle.compress", "true") \
        .config("LogLevel", "ERROR") \
        .getOrCreate()

def load_ratings_data(spark, filepath):
    """Load and preprocess ratings data with caching"""
    ratings_df = spark.read.csv(
        filepath,
        header=True,
        inferSchema=True
    ).repartition(100)  # Increased partitions
    
    # Convert columns to appropriate types and cache the result
    processed_df = ratings_df.select(
        col("userId").cast("integer"),
        col("movieId").cast("integer"),
        col("rating").cast("float")
    ).cache()  # Cache the DataFrame
    
    # Force cache computation
    processed_df.count()
    
    return processed_df

def train_als_model(training_data, validation_data):
    """Train ALS model with cross-validation for hyperparameter tuning"""
    
    # Initialize ALS model
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        nonnegative=True,
        coldStartStrategy="drop",
        intermediateStorageLevel="MEMORY_AND_DISK",
        finalStorageLevel="MEMORY_AND_DISK"
    )
    
    # Create parameter grid for cross-validation
    param_grid = ParamGridBuilder() \
        .addGrid(als.rank, [10,20]) \
        .addGrid(als.maxIter, [5,15]) \
        .addGrid(als.regParam, [0.01,0.1]) \
        .build()
    
    # Define evaluator
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )
    
    # Create CrossValidator
    cv = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=2,
        parallelism=2
    )
    
    # Fit the model using cross-validation
    print("Starting model training...")
    cv_model = cv.fit(training_data)
    
    # Get the best model
    best_model = cv_model.bestModel
    
    # Calculate validation metrics
    predictions = best_model.transform(validation_data)
    rmse = evaluator.evaluate(predictions)
    
    # Print best parameters and validation RMSE
    print("\nBest Model Parameters:")
    # print(f"Rank: {best_model.getRank()}")
    # print(f"MaxIter: {best_model.getMaxIter()}")
    # print(f"RegParam: {best_model.getRegParam()}")
    print(f"Rank: {best_model._java_obj.parent().getRank()}")
    print(f"MaxIter: {best_model._java_obj.parent().getMaxIter()}")
    print(f"RegParam: {best_model._java_obj.parent().getRegParam()}")
    print(f"Validation RMSE: {rmse}")
    
    return best_model, rmse

In [2]:
# Create Spark session
spark = create_spark_session()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/22 22:39:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load ratings data
ratings_df = load_ratings_data(spark, "data/ratings.csv")

# # Take a smaller sample for initial testing
# sampled_df = ratings_df.sample(fraction=0.1, seed=42)

# Split data into training (80%) and validation (20%) sets
training_data, validation_data = ratings_df.randomSplit([0.8, 0.2], seed=42)

print("Training data count:", training_data.count())
print("Validation data count:", validation_data.count())

 # Cache the datasets
training_data.cache()
validation_data.cache()

                                                                                

Training data count: 19999186


                                                                                

Validation data count: 5000909


DataFrame[userId: int, movieId: int, rating: float]

In [4]:
# Train model and get validation results
best_model, validation_rmse = train_als_model(training_data, validation_data)

# Save the model for later use
# best_model.save("models/best_als_model")
best_model.write().overwrite().save("models/best_als_model")

print("\nModel training completed and saved successfully!")


Starting model training...


24/11/22 22:40:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                


Best Model Parameters:
Rank: 20
MaxIter: 15
RegParam: 0.1
Validation RMSE: 0.8071295973063569


                                                                                


Model training completed and saved successfully!


In [5]:
# Unpersist cached DataFrames
if 'training_data' in locals():
    training_data.unpersist()
if 'validation_data' in locals():
    validation_data.unpersist()
# Stop Spark session
spark.stop()

In [None]:

# def main():
#     # Create Spark session
#     spark = create_spark_session()
    
#     try:
#         # Load ratings data
#         ratings_df = load_ratings_data(spark, "data/ratings.csv")
        
#         # Split data into training (80%) and validation (20%) sets
#         training_data, validation_data = ratings_df.randomSplit([0.8, 0.2], seed=42)
        
#         print("Training data count:", training_data.count())
#         print("Validation data count:", validation_data.count())
        
#         # Train model and get validation results
#         best_model, validation_rmse = train_als_model(training_data, validation_data)
        
#         # Save the model for later use
#         best_model.save("models/best_als_model")
        
#         print("\nModel training completed and saved successfully!")
        
#     finally:
#         # Stop Spark session
#         spark.stop()

# if __name__ == "__main__":
#     main()