In [0]:
# Show which files are available in the ml-1m dataset directory on DBFS.
ml_1m_dir_listing = dbutils.fs.ls('/databricks-datasets/cs110x/ml-1m/data-001/')
display(ml_1m_dir_listing)

In [0]:
# Show which files are available in the ml-20m dataset directory on DBFS.
ml_20m_dir_listing = dbutils.fs.ls('/databricks-datasets/cs110x/ml-20m/data-001/')
display(ml_20m_dir_listing)

In [0]:

%fs head /databricks-datasets/cs110x/ml-20m/data-001/ratings.csv

In [0]:
%fs head /databricks-datasets/cs110x/ml-1m/data-001/movies.dat

In [0]:
from pyspark.sql.types import *

# Explicit schemas for the '::'-separated .dat files loaded below.
# Only the leading columns that the notebook uses are declared.

# movies.dat -> movie id, title, genre string
movies_schema = (
    StructType()
    .add('movieId', IntegerType())
    .add('title', StringType())
    .add('genres', StringType())
)

# ratings.dat -> user id, movie id, rating value
ratings_schema = (
    StructType()
    .add('userId', IntegerType())
    .add('movieId', IntegerType())
    .add('ratings', FloatType())
)

In [0]:
# Load movies.dat into df_movies using the explicit movies_schema.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/movies.dat"
file_type = "csv"

# CSV options
first_row_is_header = "false"
# NOTE(review): multi-character separators need Spark 3.0 or later - confirm
# the cluster runtime.
delimiter = "::"

# No "inferSchema" option here: an explicit schema is supplied, so the reader
# never infers types and the option would only mislead the reader.
df_movies = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(movies_schema)
    .load(file_location)
)

display(df_movies)

In [0]:
# Load ratings.dat into df_ratings using the explicit ratings_schema.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/ratings.dat"
file_type = "csv"

# CSV options
first_row_is_header = "false"
# NOTE(review): multi-character separators need Spark 3.0 or later - confirm
# the cluster runtime.
delimiter = "::"

# No "inferSchema" option here: an explicit schema is supplied, so the reader
# never infers types and the option would only mislead the reader.
df_ratings = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(ratings_schema)
    .load(file_location)
)

display(df_ratings)

In [0]:
# Summary statistics (count, mean, stddev, min, max) of the rating values.
df_ratings.describe('ratings').show()

In [0]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
df_ratings_train, df_ratings_test = df_ratings.randomSplit(weights=[0.8, 0.2], seed=42)

# Report the size of each split (train first, then test).
for ratings_split in (df_ratings_train, df_ratings_test):
    print(ratings_split.count())

In [0]:
from pyspark.ml.recommendation import ALS

# Train an explicit-feedback ALS recommender on the training split.
# implicitPrefs is left at its default (False).
# The seed pins the random factor initialisation so the fitted model, and
# therefore the RMSE computed later, is reproducible across runs.
als = ALS(
    rank=5,
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="ratings",
    seed=42,
)

model = als.fit(df_ratings_train)

In [0]:
# NOTE: this import shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import round, col

# Score the held-out ratings with the trained model.
scored_test_ratings = model.transform(df_ratings_test)

# Drop rows whose prediction is NaN. Spark SQL treats NaN as equal to itself,
# so comparing against float('nan') really does remove those rows.
df_predicted_ratings = scored_test_ratings.filter(col('prediction') != float('nan'))

# Compare the distribution of actual ratings with the predicted ones.
df_predicted_ratings.describe('ratings', 'prediction').show()

display(df_predicted_ratings)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error between the actual ratings and the model predictions.
evaluator = RegressionEvaluator(
    labelCol="ratings",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(df_predicted_ratings)
print(f"Root-mean-square error = {rmse}")

In [0]:
# Look for existing ratings under user id 0 (the id used for the custom
# ratings later in the notebook).
check_user_id = df_ratings.where(df_ratings['userId'] == 0)
check_user_id.show()

In [0]:
# User id under which the personal ratings below are recorded.
my_user_id = 0

# Personal ratings as (movie ID, rating) pairs.
# For example, to give the movie "Star Wars: Episode IV - A New Hope (1977)"
# a five rating, you would add the following line:
#   (260, 5),
my_movie_scores = [
    (318, 3),   # Shawshank redemption
    (908, 4),   # North by Northwest (1959)
    (858, 5),   # Godfather, The (1972)
    (2019, 4),  # Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
    (912, 4),   # Casablanca (1942)
    (1250, 5),  # Bridge on the River Kwai, The (1957)
    (2324, 5),  # Life Is Beautiful (La Vita è bella) (1997)
    (1233, 5),  # Boat, The (Das Boot) (1981)
    (593, 4),   # Silence of the Lambs, The (1991)
    (1262, 4),  # Great Escape, The (1963)
]

# Expand to (my_user_id, movie ID, rating) tuples, the row format used downstream.
my_rated_movies = [
    (my_user_id, movie_id, score) for movie_id, score in my_movie_scores
]
print(my_rated_movies)

In [0]:
# Turn the personal ratings into a DataFrame with the same column names AND
# types as the loaded ratings (ratings_schema: int userId, int movieId, float
# ratings). Letting Spark infer the types from Python ints would yield bigint
# columns, which silently widens the schema when combined with the training data.
# The rating is converted to float explicitly because schema verification
# rejects a Python int for a FloatType column.
df_custom_ratings = spark.createDataFrame(
    [
        (int(user_id), int(movie_id), float(rating))
        for user_id, movie_id, rating in my_rated_movies
    ],
    schema=ratings_schema,
)
display(df_custom_ratings)

In [0]:
# Append the personal ratings to the training split.
# unionByName matches columns by name rather than by position, so a change in
# column order on either side cannot silently mix user ids, movie ids and ratings.
df_all_ratings = df_ratings_train.unionByName(df_custom_ratings)
print(df_all_ratings.count())
display(df_all_ratings)

In [0]:
from pyspark.ml.recommendation import ALS

# Retrain the explicit-feedback ALS recommender on the training split plus the
# personal ratings. implicitPrefs is left at its default (False).
# The seed pins the random factor initialisation so the recommendations are
# reproducible across runs.
als = ALS(
    rank=5,
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="ratings",
    seed=42,
)

custom_model = als.fit(df_all_ratings)

In [0]:
#custom_movies_ids

# Exclude the movies I have already rated (the custom ratings) from df_movies,
# then run the custom model on the remaining movies to predict my ratings for them.

In [0]:
from pyspark.sql.functions import lit, col, desc

# Recommend movies for the custom user: score every movie the user has not
# rated yet and rank the results by predicted rating.

print(f'movies before: {df_movies.count()}')
# left_anti keeps only the movies that have no matching custom rating.
df_movies_unrated = df_movies.join(df_custom_ratings, on="movieId", how="left_anti")
print(f'movies after: {df_movies_unrated.count()}')

# Use my_user_id rather than a hard-coded 0 so the predictions always target
# the same user the custom ratings were recorded under.
df_for_prediction = df_movies_unrated.withColumn("userId", lit(my_user_id))
df_predictions = custom_model.transform(df_for_prediction)

# Drop NaN predictions. Spark SQL treats NaN as equal to itself, so comparing
# against float('nan') removes those rows. Then sort best-first.
df_recommendations = (
    df_predictions
    .filter(df_predictions.prediction != float('nan'))
    .orderBy(desc("prediction"))
)
display(df_recommendations.select("title", "genres", "prediction"))

In [0]:
# Summary statistics of the predicted ratings for the custom user.
df_recommendations.describe("prediction").show()