In [1]:
pip install findspark pandas pyspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=aa5f236a2202e1a698fbeaee740923d07e341ba9a01a31fbc8739099e35c1ede
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: findspark, pyspark
Successfully installed findspark-2.0.1 pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

# Create a Spark session
spark = SparkSession.builder.appName("MovieRatings").getOrCreate()

def get_rating_data():
    schema = StructType([
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", FloatType(), True),
        StructField("timestamp", IntegerType(), True)
    ])
    data = spark.read.csv('u.data', sep='\t', schema=schema, header=False)
    return data

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Load the data
data = get_rating_data()

# Create a smaller subset for demonstration (you can skip this if you want to use the entire dataset)
data = data.limit(1000)

# Split the data into training and testing sets
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

# Build the ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

# Define a parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.maxIter, [5, 10, 15]) \
    .addGrid(als.regParam, [0.01, 0.1, 1.0]) \
    .build()

# Define an evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Use TrainValidationSplit to choose the best combination of parameters
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=param_grid,
                           evaluator=evaluator,
                           trainRatio=0.8)

# Train the model
model = tvs.fit(training)

# Make predictions on the test set
predictions = model.transform(test)

# Evaluate the model
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data =", rmse)

Root Mean Squared Error (RMSE) on test data = 2.4760566242459876


In [4]:
import os
print(os.getcwd())

/content


In [5]:
model_path = "/content/models"
model.save(model_path)

In [6]:
from google.colab import files

# Zip the model directory
!zip -r my_saved_model.zip {model_path}

# Download the zip file
files.download("my_saved_model.zip")


  adding: content/models/ (stored 0%)
  adding: content/models/estimator/ (stored 0%)
  adding: content/models/estimator/metadata/ (stored 0%)
  adding: content/models/estimator/metadata/._SUCCESS.crc (stored 0%)
  adding: content/models/estimator/metadata/_SUCCESS (stored 0%)
  adding: content/models/estimator/metadata/part-00000 (deflated 42%)
  adding: content/models/estimator/metadata/.part-00000.crc (stored 0%)
  adding: content/models/evaluator/ (stored 0%)
  adding: content/models/evaluator/metadata/ (stored 0%)
  adding: content/models/evaluator/metadata/._SUCCESS.crc (stored 0%)
  adding: content/models/evaluator/metadata/_SUCCESS (stored 0%)
  adding: content/models/evaluator/metadata/part-00000 (deflated 42%)
  adding: content/models/evaluator/metadata/.part-00000.crc (stored 0%)
  adding: content/models/metadata/ (stored 0%)
  adding: content/models/metadata/._SUCCESS.crc (stored 0%)
  adding: content/models/metadata/_SUCCESS (stored 0%)
  adding: content/models/metadata/pa

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Assuming you have a trained ALS model named 'model'
model_path = "/content/model"

# Save the model in Parquet format
model.write().overwrite().save(model_path)


In [8]:
import shutil

# Zip the model directory
shutil.make_archive("/content/model_archive", 'zip', model_path)

'/content/model_archive.zip'

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Load the data
data = get_rating_data()

# Create a smaller subset for demonstration (you can skip this if you want to use the entire dataset)
data = data.limit(1000)

# Split the data into training and testing sets
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

# Build the ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")

# Define a parameter grid for hyperparameter tuning
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.maxIter, [5, 10, 15]) \
    .addGrid(als.regParam, [0.01, 0.1, 1.0]) \
    .build()

# Define an evaluators
evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
evaluator_mse = RegressionEvaluator(metricName="mse", labelCol="rating", predictionCol="prediction")
evaluator_accuracy = RegressionEvaluator(metricName="accuracy", labelCol="rating", predictionCol="prediction")

# Use TrainValidationSplit to choose the best combination of parameters
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=param_grid,
                           evaluator=evaluator,
                           trainRatio=0.8)

# Train the model
model = tvs.fit(training)

# Make predictions on the test set
predictions = model.transform(test)


In [10]:
# Evaluate the model
rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)
mse = evaluator_mse.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data =", rmse)
print("R Squared (R2) on test data =", r2)
print("Mean Squared Error (MSE) on test data =", mse)

Root Mean Squared Error (RMSE) on test data = 2.4760566242459876
R Squared (R2) on test data = -4.119053385468169
Mean Squared Error (MSE) on test data = 6.130856406472435
