# MovieLens: Spark-based Big Data Recommendation Analysis

# Task 4, Part 1: ALS Model
---

# 1. Session Creation and Data Loading

In [1]:
# Importing the libraries
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("Ratings Analysis")
    .getOrCreate()
)

# Read the ratings CSV from the gs:// bucket as an RDD of raw text lines
small_ratings_raw_data = spark.sparkContext.textFile(
    "gs://dataproc-staging-asia-southeast2-933547737015-zijhgarf/ratings.csv"
)
# The first line of the file is kept so that it can be filtered out below
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]


def _to_rating_row(line):
    """Parse one comma-separated line into a Row(userId, movieId, rating, timestamp)."""
    fields = line.split(",")
    return Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )


# Drop every line equal to the header line, parse the remainder, and cache the
# parsed RDD because it is reused by later cells
small_ratings_data = (
    small_ratings_raw_data
    .filter(lambda line: line != small_ratings_raw_data_header)
    .map(_to_rating_row)
    .cache()
)

# Peek at the first 3 parsed rows
small_ratings_data.take(3)

# 2. Model Training and Testing

In [49]:
# Fixed seed shared by the train/test split and by ALS, so that re-running this
# cell yields the same split, the same model and the same RMSE.
SEED = 42

# Creating the dataframe from the parsed ratings RDD
ratings = spark.createDataFrame(small_ratings_data)
# Splitting the dataset into training (70%) and test (30%) sets; seeded for repeatability
(training, test) = ratings.randomSplit([0.7, 0.3], seed=SEED)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# The seed is passed so the fitted model is repeatable across runs
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
# (here: 3 distinct user ids taken from the ratings)
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
# (here: 3 distinct movie ids taken from the ratings)
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

                                                                                

Root-mean-square error = 0.8217963309120487


In [50]:
# Stop the SparkSession held in `spark`; cells that use `spark` cannot be
# re-run after this without first re-creating the session.
spark.stop()