<a href="https://colab.research.google.com/github/OliverRevilla/BigData_Pyspark/blob/main/Building_Recommendation_Engines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preparation for Spark ALS**

In [None]:
# Illustrative snippet: tag every row of an existing `users` dataframe with a
# unique integer id.
# NOTE(review): `users` must already be defined when this runs - confirm.
from pyspark.sql.functions import monotonically_increasing_id

# Collapse to a single partition before generating ids, because the ids
# produced by monotonically_increasing_id() depend on the partition layout.
users = users.coalesce(1)
users = users.withColumn(
    'userId', monotonically_increasing_id()).persist()

# --------------------------------------------------------------
# Bring in the id generator and preview R
from pyspark.sql.functions import monotonically_increasing_id
R.show()


# Convert R to the "long" format with to_long() and preview the result
ratings = to_long(R)
ratings.show()

# Distinct users, squeezed into a single partition
users = ratings.select("User").distinct()
users = users.coalesce(1)

# Attach an integer "userId" column to the users dataframe and cache it
users = users.withColumn("userId", monotonically_increasing_id())
users = users.persist()
users.show()

# Distinct movies on a single partition, each tagged with an integer
# "movieId" column, cached
movies = (
    ratings.select("Movie")
    .distinct()
    .coalesce(1)
    .withColumn("movieId", monotonically_increasing_id())
    .persist()
)

# Left-join both lookup dataframes back onto the ratings
movie_ratings = (
    ratings
    .join(users, "User", "left")
    .join(movies, "Movie", "left")
)
movie_ratings.show()


## **ALS parameters and hyperparameters**

In [None]:
# Example of ALS model code
# NOTE(review): `training_data` and `test_data` are not created in this
# snippet; they must already be in scope (e.g. from a randomSplit) - confirm.
from pyspark.ml.recommendation import ALS

als_model = ALS(
    userCol='userId',          # Name of the column that contains user ids
    itemCol='movieId',         # Name of the column that contains item ids
    ratingCol='rating',        # Name of the column that contains ratings
    rank=25,                   # Number of latent features
    maxIter=100,               # How many times ALS should iterate
    regParam=.05,              # Lambda (regularization strength)
    alpha=40,                  # Only used with implicit ratings
    nonnegative=True,          # Apply the nonnegative constraint when solving
    coldStartStrategy='drop',  # Drop rows whose prediction is NaN, i.e. users
                               # or items that were not seen during fitting
    implicitPrefs=False,       # Tell Spark whether the ratings are implicit
                               # (True) or explicit (False)
)

# Fit ALS to the training dataset (the estimator above is `als_model`)
model = als_model.fit(training_data)

# Generate predictions on the test dataset
predictions = model.transform(test_data)

#--------------------------------------------------------------------------------------------------
# Split the ratings dataframe into training and test data (seeded 80/20)
(training_data, test_data) = ratings.randomSplit([0.8, 0.2], seed=42)

# Set the ALS hyperparameters
from pyspark.ml.recommendation import ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          rank=10, maxIter=15, regParam=.1,
          coldStartStrategy="drop", nonnegative=True, implicitPrefs=False)

# Fit the model to the training_data
model = als.fit(training_data)

# Generate predictions on the test_data
test_predictions = model.transform(test_data)
test_predictions.show()

# Import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator

# Build the evaluator. The metric name must be the lowercase "rmse":
# the supported names are lowercase and "RMSE" is not one of them.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Show the 3 parameters the evaluator was configured with
print(evaluator.getMetricName())
print(evaluator.getLabelCol())
print(evaluator.getPredictionCol())

# Evaluate the "test_predictions" dataframe
RMSE = evaluator.evaluate(test_predictions)

# Print the RMSE
print(RMSE)


## **Recommending algorithm**

### **Sparsity**

In [None]:
# Number of ratings actually present in the matrix
numerator = ratings.count()

# Distinct users and movies
# NOTE: this rebinds `users` and `movies` to plain integer counts.
users = ratings.select("userId").distinct().count()
movies = ratings.select('movieId').distinct().count()

# Number of ratings the matrix would contain if it had no empty cells
denominator = users * movies

# Sparsity is the share of empty cells: 1 - filled / possible.
# The 1.0 factor forces floating-point division.
sparsity = 1 - (numerator * 1.0 / denominator)

# Print label and value together (the value was previously never printed)
print("Sparsity:", sparsity)

### **ALS model buildout**

In [None]:
# Useful when you want to try other hyperparameter values to improve the
# model's metrics.
# NOTE(review): `als` and `evaluator` are not defined in this cell; they must
# already be in scope - confirm.

# ParamGridBuilder(): lists the values to try for each hyperparameter.
# The chained calls are wrapped in parentheses so the expression can span
# several lines.
from pyspark.ml.tuning import ParamGridBuilder
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 40, 80, 120])
    .addGrid(als.maxIter, [5, 100, 250, 500])
    .addGrid(als.regParam, [.05, .1, 1.5])
    .build()
)

# CrossValidator(): ties the estimator, the grid and the evaluator together
# using 5 folds.
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5)



In [None]:
# Complete Code

# Create training and test set (80/20 split)
(training, test) = movie_ratings.randomSplit([0.8, 0.2])

# Build generic ALS model without hyperparameters
from pyspark.ml.recommendation import ALS
als = ALS(userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True,
          implicitPrefs=False)

# Tell Spark what values to try for each hyperparameter.
# The chain is parenthesized so it can legally span several lines.
from pyspark.ml.tuning import ParamGridBuilder
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 40, 80, 120])
    .addGrid(als.maxIter, [5, 100, 250, 500])
    .addGrid(als.regParam, [.05, .1, 1.5])
    .build()
)

# Tell Spark how to evaluate model performance.
# The prediction column is named "prediction" with no trailing space.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Build cross validation step using CrossValidator
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5)

# Run the cv on the training data
model = cv.fit(training)

# Extract best combination of values from cross validation
best_model = model.bestModel

# Generate test set predictions and evaluate using RMSE
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

# Print evaluation metrics and model parameters. Each value is passed to
# print() itself so it is actually shown.
print("**Best Model**")
print("RMSE =", rmse)
print(" Rank:", best_model.rank)
# maxIter and regParam are read from the parent of the underlying Java model.
print(" MaxIter:", best_model._java_obj.parent().getMaxIter())
print(" RegParam:", best_model._java_obj.parent().getRegParam())



In [None]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create test and train set (seeded 80/20 split)
(train, test) = ratings.randomSplit([0.8, 0.2], seed=1234)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          nonnegative=True, implicitPrefs=False)

# Confirm that a model called "als" was created. A bare expression in the
# middle of a cell displays nothing, so print it.
print(type(als))

# Add hyperparameters and their respective values to param_grid
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.maxIter, [5, 50, 100, 200])
    .addGrid(als.regParam, [.01, .05, .1, .15])
    .build()
)

# Define evaluator as RMSE and print the number of entries in param_grid
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
print("Num models to be tested: ", len(param_grid))

# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

# Print best_model
# NOTE(review): `cv` is never fitted in this cell, so `best_model` must
# already exist from an earlier fit - confirm.
print(type(best_model))

# Extract the ALS model parameters
print("**Best Model**")

# Print "Rank" (exposed on the fitted model as the `rank` attribute)
print("  Rank:", best_model.rank)

# Print "MaxIter" (read from the parent of the underlying Java model)
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam" (read from the parent of the underlying Java model)
print("  RegParam:", best_model._java_obj.parent().getRegParam())


### **ALS model without ratings**

In [None]:
def add_zeros(df):
    """Return every userId/songId pairing from ``df`` with nulls set to 0.

    The distinct "userId" values are cross-joined with the distinct "songId"
    values, the rows of ``df`` are left-joined onto that grid on both keys,
    and ``fillna(0)`` is applied to the joined result.
    """
    distinct_users = df.select("userId").distinct()
    distinct_songs = df.select("songId").distinct()

    # Every (userId, songId) combination, whether or not it occurs in df
    every_pair = distinct_users.crossJoin(distinct_songs)

    # Bring the existing rows back in; combinations absent from df get nulls
    with_existing_rows = every_pair.join(df, ['userId', 'songId'], "left")
    return with_existing_rows.fillna(0)


In [None]:
#  Building several ROEM models
from itertools import product

(train, test) = implicit_ratings.randomSplit([.8, .2])

# Complete each of the hyperparameter value lists
ranks = [10, 20, 30, 40]
maxIters = [10, 20, 30, 40]
regParams = [.05, .1, .15]
alphas = [20, 40, 60, 80]

# One untrained ALS estimator per hyperparameter combination. product()
# yields the combinations in the same order as four nested for loops
# (ranks outermost, alphas innermost).
model_list = [
    ALS(userCol="userId", itemCol="songId", ratingCol="num_plays",
        rank=r, maxIter=mi, regParam=rp, alpha=a,
        coldStartStrategy="drop", nonnegative=True, implicitPrefs=True)
    for r, mi, rp, a in product(ranks, maxIters, regParams, alphas)
]

# Error output
for model in model_list:
    # Fits each model to the training data
    trained_model = model.fit(train)

    # Generates test predictions
    predictions = trained_model.transform(test)

    # Evaluates each model's performance
    ROEM(predictions)


### -----------------------------------------------------------------------------
from itertools import product

# Create and store one ALS model per hyperparameter combination. The list is
# built from scratch rather than appended to, so entries already present in
# model_list are not duplicated and the length check below holds.
model_list = [
    ALS(userCol="userId", itemCol="songId", ratingCol="num_plays",
        rank=r, maxIter=mi, regParam=rp, alpha=a,
        coldStartStrategy="drop", nonnegative=True, implicitPrefs=True)
    for r, mi, rp, a in product(ranks, maxIters, regParams, alphas)
]

# Print the model list, and the length of model_list
print(model_list, "Length of model_list: ", len(model_list))

# Validate: one model per combination of the four hyperparameter lists
len(model_list) == (len(ranks)*len(maxIters)*len(regParams)*len(alphas))

In [None]:
# `col` is needed to build the filter expressions below.
from pyspark.sql.functions import col

# For users 26 and 99, show the original ratings followed by the
# recommendations.
# NOTE(review): `original_ratings` and `binary_recs` are not defined in this
# cell; they must already be in scope - confirm.
for user_id in (26, 99):
    # View the user's original ratings
    print(f"User {user_id} Original Ratings:")
    original_ratings.filter(col("userId") == user_id).show()

    # View the user's recommendations
    print(f"User {user_id} Recommendations:")
    binary_recs.filter(col("userId") == user_id).show()