In [None]:
pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 69kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 22.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=bb08cb9253c4c9cda364cc20fcbde62f7688b97feb4325b8229c6b261c918aa4
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession
# Single session shared by every cell below; driver and executor memory are
# both requested at 15g.
spark = (
    SparkSession.builder
    .appName("Movies Recommendation (Collaborative Filtering)")
    .config("spark.driver.memory", "15g")
    .config("spark.executor.memory", "15g")
    .getOrCreate()
)

In [None]:
# Read the "full" copy of the dataset. A second copy of the same two files sits
# one directory up (".../Colab Notebooks/movie/") -- presumably a smaller sample.
# No schema is inferred, so every column arrives as a string.
movies = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/movie/full/movies.csv", header=True)
ratings = spark.read.csv("/content/drive/MyDrive/Colab Notebooks/movie/full/ratings.csv", header=True)

In [None]:
# movies.show()

In [None]:
# Keep just the three columns used for training; anything else in the CSV is dropped.
ratings = ratings.select(["userId", "movieId", "rating"])

In [None]:
# ratings.printSchema()

In [None]:
# Cast the id columns to int and the rating to float, replacing each column in place.
df = (
    ratings
    .withColumn('userId', ratings['userId'].cast('int'))
    .withColumn('movieId', ratings['movieId'].cast('int'))
    .withColumn('rating', ratings['rating'].cast('float'))
)

In [None]:
# 60 / 20 / 20 split into train, validation and test, with a fixed seed.
train, validation, test = df.randomSplit(weights=[0.6, 0.2, 0.2], seed=0)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# One evaluator reused by every scoring call: RMSE of "prediction" against "rating".
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)


def RMSE(predictions):
    """Return the root-mean-square error of a predictions DataFrame.

    ``predictions`` is handed to the module-level ``evaluator``, which compares
    its "rating" column with its "prediction" column.
    """
    score = evaluator.evaluate(predictions)
    return score


In [None]:
from pyspark.ml.recommendation import ALS

def GridSearch(train, valid, num_iterations, reg_param, n_factors):
    """Fit one ALS model per (rank, regParam) pair and return the best one.

    Every combination of ``n_factors`` x ``reg_param`` is fitted on ``train``
    and scored with ``RMSE`` on ``valid``; the model with the lowest
    validation RMSE wins.

    Args:
        train: DataFrame passed to ``ALS.fit``; read through the "userId",
            "movieId" and "rating" columns.
        valid: DataFrame each fitted candidate is scored on.
        num_iterations: ``maxIter`` used for every candidate.
        reg_param: iterable of ``regParam`` values to try.
        n_factors: iterable of ``rank`` values to try.

    Returns:
        The fitted model with the lowest validation RMSE.

    Raises:
        ValueError: if either grid is empty, or if no candidate produced an
            RMSE that compares below infinity (e.g. every score was NaN).
    """
    # Materialise both grids: lets one-shot iterables be reused by the inner
    # loop and allows the emptiness check before any Spark work starts.
    ranks = list(n_factors)
    regs = list(reg_param)
    if not ranks or not regs:
        raise ValueError(
            "GridSearch needs at least one value in both n_factors and reg_param"
        )

    min_rmse = float('inf')
    best_model = None
    for n in ranks:
        for reg in regs:
            als = ALS(rank=n,
                      maxIter=num_iterations,
                      seed=0,
                      regParam=reg,
                      userCol="userId",
                      itemCol="movieId",
                      ratingCol="rating",
                      coldStartStrategy="drop")
            model = als.fit(train)
            rmse = RMSE(model.transform(valid))
            if rmse < min_rmse:
                min_rmse = rmse
                best_model = model

    if best_model is None:
        raise ValueError("no ALS candidate produced a comparable validation RMSE")
    return best_model

In [None]:
# Search space handed to GridSearch: regularisation strengths, latent-factor
# counts, and the iteration budget shared by every candidate.
reg_params = [0.1, 0.2, 0.3]
ranks = [7, 8, 9]
num_iterations = 15

# The search itself is left switched off; un-comment to run it.
# final_model = GridSearch(train, validation, num_iterations, reg_params, ranks)

In [None]:
from pyspark.ml.recommendation import ALS

# Train a single ALS model with fixed hyper-parameters instead of running the grid search.
als = ALS(
    # column mapping
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # model size and regularisation
    rank=9,
    regParam=0.2,
    # optimisation
    maxIter=15,
    seed=0,
    # rows whose prediction would be NaN are dropped rather than kept
    coldStartStrategy="drop",
)
final_model = als.fit(train)

In [None]:
# Score the held-out test split with the final model and report its RMSE.
pred_test = final_model.transform(test)
test_rmse = RMSE(pred_test)
print('The testing RMSE is ' + str(test_rmse))

The testing RMSE is 0.8692651003430641


In [None]:
user_id = [[17]]
num_rec = 10
# recommendForUserSubset expects a DataFrame of user ids, so wrap the id in a
# one-row, one-column ("userId") frame.
functiondf = spark.sparkContext.parallelize(user_id).toDF(['userId'])
recommendations = final_model.recommendForUserSubset(functiondf, num_rec)
# Collect once: every collect() launches a Spark job.
rec_rows = recommendations.collect()
# Keep only the movie ids. Iterating over what actually came back (rather than
# indexing 0..num_rec-1) copes with fewer than num_rec items, and with no row
# at all -- presumably what happens for a user id the model never saw.
# The name recommended_ISBN is historical (these are movie ids); it is kept
# because later cells read it.
recommended_ISBN = (
    [rec['movieId'] for rec in rec_rows[0]['recommendations']] if rec_rows else []
)

In [None]:
# movies.show()

In [None]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col
# One-column frame (column "value") holding the recommended movie ids.
rec_df = spark.createDataFrame(recommended_ISBN, IntegerType())
# Join against the movie catalogue to attach title and genres, then print the result.
# NOTE(review): the join is not guaranteed to keep the ids in ranking order.
(
    movies
    .join(rec_df, on=(rec_df.value == movies.movieId))
    .select('movieId', 'title', 'genres')
    .show()
)

+-------+--------------------+------------------+
|movieId|               title|            genres|
+-------+--------------------+------------------+
| 185659|Macho Madness - T...|(no genres listed)|
| 159761|         Loot (1970)|      Comedy|Crime|
| 187157|Cooking with Love...|    Comedy|Romance|
| 182781|Amori che non san...|      Comedy|Drama|
| 182489|12 Men of Christm...|    Comedy|Romance|
| 151989|    The Thorn (1971)|            Comedy|
| 107434|Diplomatic Immuni...|            Comedy|
| 107252|Island at War (2004)|         Drama|War|
| 192261|Don't Laugh at My...|      Comedy|Drama|
|   8394| Hi-Line, The (1999)|             Drama|
+-------+--------------------+------------------+

