--------------------------------
# <center>- Spark Project - SD 701 - Data Mining-</center>

* https://spark.apache.org/docs/2.1.0/ml-classification-regression.html
* https://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
* https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html

----------------------------
* https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3
* https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-2-alternating-least-square-als-matrix-4a76c58714a1

----------------------------------

In [2]:
import pyspark
#from pyspark import SparkConf, SparkContext

#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.regression import LinearRegression

from pyspark.sql import Row, SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


### 1. Initialisation de la session spark

In [3]:
# set up environment
#conf = SparkConf() \
#    .setAppName("MovieLensALS") \
#    .set("spark.executor.memory", "2g")
#sc = SparkContext(conf=conf)

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

### 2. Chargement et préparation des données d'évaluations des films

In [4]:
path_data = "/home/p5hngk/Downloads/GitHub/SD_701---Data_Mining/ml-latest-small"

df_ratings = spark.read.format("csv").option("header", "true").load(path_data+"/ratings.csv")
df_ratings.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



In [5]:
df_ratings1 = df_ratings.select(df_ratings['userId'], df_ratings['movieId'], df_ratings['rating'])
df_ratings1.show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
+------+-------+------+
only showing top 10 rows



In [6]:
df_ratings1.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)



Avant de réaliser un modèle ALS, il faut que toutes nos données soient au format integer pour pouvoir réaliser les calculs, ce qui n'est pas le cas actuellement. Occupons-nous donc dans un premier temps de changer cela .

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [8]:
from pyspark.sql.types import IntegerType

df_ratings1 = df_ratings1.withColumn("userId", df_ratings1["userId"].cast(IntegerType())) \
                .withColumn("movieId", df_ratings1["movieId"].cast(IntegerType())) \
                .withColumn("rating", df_ratings1["rating"].cast(IntegerType()))

df_ratings1.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)



### 3. Création des jeux d'entraînement et de test

In [9]:
(training,test) = df_ratings1.randomSplit([0.8, 0.2])

### 4. Création du modèle ALS et entraînement du modèle

In [83]:
als = ALS(maxIter=5, regParam=0.15, rank=25, userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop", nonnegative=True)
model = als.fit(training)

### 5. Evaluation du modèle

In [86]:
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

In [87]:
print(f"RMSE = {round(rmse,3)}")
predictions.show(10)

RMSE = 0.912
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   602|    471|     4| 3.1934433|
|   462|    471|     2| 2.7162755|
|   217|    471|     2| 2.7824652|
|   171|    471|     3|  4.175868|
|   287|    471|     4| 2.2260783|
|   469|    471|     5| 3.3332288|
|   307|    833|     1| 0.8265029|
|   177|   1088|     3|   3.22892|
|   554|   1088|     5| 3.3106422|
|   286|   1088|     3| 2.8980076|
+------+-------+------+----------+
only showing top 10 rows



In [25]:
predictions.describe().show()

+-------+------------------+------------------+-----------------+------------------+
|summary|            userId|           movieId|           rating|        prediction|
+-------+------------------+------------------+-----------------+------------------+
|  count|             19335|             19335|            19335|             19335|
|   mean|   324.51874838376|17180.324385828808|3.369847426945953|3.1990105139020804|
| stddev|181.40932579600593|32340.785112503912|1.085566007162708|0.7425755555118295|
|    min|                 1|                 1|                0|               0.0|
|    max|               610|            187595|                5|          5.506267|
+-------+------------------+------------------+-----------------+------------------+



In [32]:
predictions.orderBy(predictions.userId.asc()).show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|   3273|     5| 2.7050326|
|     1|    441|     4| 4.5551343|
|     1|    163|     5| 4.0466523|
|     1|    590|     4| 4.2134485|
|     1|   3740|     4| 4.7146983|
|     1|   3489|     4| 3.5443125|
|     1|   2492|     4| 3.2470827|
|     1|   1089|     5|  4.778551|
|     1|   2193|     4| 4.6384244|
|     1|   1240|     5| 4.3902125|
|     1|   2985|     4| 3.9428837|
|     1|   2414|     3| 3.9190807|
|     1|   3439|     4| 3.1636713|
|     1|   1500|     4| 4.3516064|
|     1|   2640|     4|  4.639884|
|     1|   1777|     4| 4.0192113|
|     1|   2542|     5|  4.635328|
|     1|   1927|     5| 4.4290094|
|     1|    367|     4|  3.847385|
|     1|      3|     4|  3.852513|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
predictions.registerTempTable("predictions_table")

In [56]:
sqlContext.sql('SELECT * FROM predictions_table WHERE userId = 1 ORDER BY movieId ASC').show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|      3|     4|  3.852513|
|     1|     50|     5| 4.5171766|
|     1|    163|     5| 4.0466523|
|     1|    367|     4|  3.847385|
|     1|    441|     4| 4.5551343|
|     1|    457|     5| 4.4773464|
|     1|    480|     4|  4.376313|
|     1|    527|     5|  4.645906|
|     1|    590|     4| 4.2134485|
|     1|   1042|     4| 3.8467429|
|     1|   1080|     5|  4.726887|
|     1|   1089|     5|  4.778551|
|     1|   1097|     5| 4.4196105|
|     1|   1213|     5| 4.7464733|
|     1|   1240|     5| 4.3902125|
|     1|   1282|     5| 4.4678564|
|     1|   1408|     3| 4.8702917|
|     1|   1500|     4| 4.3516064|
|     1|   1777|     4| 4.0192113|
|     1|   1805|     4| 3.6425023|
+------+-------+------+----------+
only showing top 20 rows



### 6. Amélioration du modèle

Pouvons-nous améliorer notre modèle avec de meilleurs hyperparamètres ? Nous allons regarder ici l'influence des paramètres de régularisations (`regParams`) et le nombre de features (`ranks`) à utiliser pour notre modèle. Nous chercherons ici à minimiser le risque moyen quadratique (***RMSE***).

In [78]:
def tune_ALS(train_data, validation_data, maxIter, regParams, ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    validation_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    maxIter: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of float, one dimension of hyper-param tuning grid
    
    Return
    ------
    The best fitted ALS model with lowest RMSE score on validation data
    """
    # initial
    min_error = float('inf')
    best_rank = -1
    best_regularization = 0
    best_model = None
    for rank in ranks:
        for reg in regParams:
            # get ALS model
            als = ALS(userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop", nonnegative=True).setMaxIter(maxIter).setRank(rank).setRegParam(reg)
            # train ALS model
            model = als.fit(train_data)
            # evaluate the model by computing the RMSE on the validation data
            predictions = model.transform(validation_data)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print('{} latent factors and regularization = {}: '
                  'validation RMSE is {}'.format(rank, reg, rmse))
            if rmse < min_error:
                min_error = rmse
                best_rank = rank
                best_regularization = reg
                best_model = model
    print('\nThe best model has {} latent factors and '
          'regularization = {}'.format(best_rank, best_regularization))
    return best_model

In [88]:
regParams = [0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2]
ranks = [15, 20, 25, 30]

tune_ALS(training, test, 10, regParams, ranks)

15 latent factors and regularization = 0.1: validation RMSE is 0.9194962266285418
15 latent factors and regularization = 0.11: validation RMSE is 0.9158210040273436
15 latent factors and regularization = 0.12: validation RMSE is 0.9135584134461238
15 latent factors and regularization = 0.13: validation RMSE is 0.9122796084347585
15 latent factors and regularization = 0.14: validation RMSE is 0.9118776751844955
15 latent factors and regularization = 0.15: validation RMSE is 0.9121690831477112
15 latent factors and regularization = 0.16: validation RMSE is 0.9130609447437331
15 latent factors and regularization = 0.17: validation RMSE is 0.9144633840148606
15 latent factors and regularization = 0.18: validation RMSE is 0.916332021519337
15 latent factors and regularization = 0.19: validation RMSE is 0.9185967659831803
15 latent factors and regularization = 0.2: validation RMSE is 0.9212117989211674
20 latent factors and regularization = 0.1: validation RMSE is 0.9218943653846164
20 laten

ALS_9818eab060a0

### 7. Création du modèle avec les meilleurs hyperparamètres

In [10]:
best_als = ALS(maxIter=10, regParam=0.14, rank=25, userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop", nonnegative=True)
best_model = best_als.fit(training)

In [54]:
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

### 8. Evaluation du nouveau modèle

In [55]:
print(f"RMSE = {round(rmse,3)}")
predictions.show(10)

RMSE = 0.918
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    57|    471|     3| 3.0805037|
|   610|    471|     4| 3.0505595|
|   555|    471|     3|  3.372868|
|   176|    471|     5| 3.6114209|
|   312|    471|     4| 3.4076772|
|   414|    471|     5|  3.238184|
|   426|    471|     5| 3.0732331|
|   373|    471|     5| 3.4007857|
|   357|    471|     3|  3.180214|
|   159|   1088|     4|  2.875018|
+------+-------+------+----------+
only showing top 10 rows



In [56]:
predictions = predictions.withColumn('rating-prediction', predictions.rating - predictions.prediction)
predictions.show(5)

+------+-------+------+----------+-----------------+
|userId|movieId|rating|prediction|rating-prediction|
+------+-------+------+----------+-----------------+
|    57|    471|     3| 3.0805037|       -0.0805037|
|   610|    471|     4| 3.0505595|        0.9494405|
|   555|    471|     3|  3.372868|      -0.37286806|
|   176|    471|     5| 3.6114209|        1.3885791|
|   312|    471|     4| 3.4076772|        0.5923228|
+------+-------+------+----------+-----------------+
only showing top 5 rows



In [57]:
type(predictions)

pyspark.sql.dataframe.DataFrame

In [58]:
#from pyspark.sql.functions import udf
#import pyspark.sql.functions as F
#from pyspark.sql.types import *

from pyspark.sql.functions import when

predictions = predictions.withColumn("rating-prediction", when(0 <= predictions['rating-prediction'], predictions['rating-prediction']).otherwise(predictions['rating-prediction']*(-1)))
predictions = predictions.withColumn("good_pred", when(predictions['rating-prediction'] <= 0.5, 1).otherwise(0))

predictions.show(5)


+------+-------+------+----------+-----------------+---------+
|userId|movieId|rating|prediction|rating-prediction|good_pred|
+------+-------+------+----------+-----------------+---------+
|    57|    471|     3| 3.0805037|        0.0805037|        1|
|   610|    471|     4| 3.0505595|        0.9494405|        0|
|   555|    471|     3|  3.372868|       0.37286806|        1|
|   176|    471|     5| 3.6114209|        1.3885791|        0|
|   312|    471|     4| 3.4076772|        0.5923228|        0|
+------+-------+------+----------+-----------------+---------+
only showing top 5 rows



In [59]:
predictions.show(30)

+------+-------+------+----------+-----------------+---------+
|userId|movieId|rating|prediction|rating-prediction|good_pred|
+------+-------+------+----------+-----------------+---------+
|    57|    471|     3| 3.0805037|        0.0805037|        1|
|   610|    471|     4| 3.0505595|        0.9494405|        0|
|   555|    471|     3|  3.372868|       0.37286806|        1|
|   176|    471|     5| 3.6114209|        1.3885791|        0|
|   312|    471|     4| 3.4076772|        0.5923228|        0|
|   414|    471|     5|  3.238184|         1.761816|        0|
|   426|    471|     5| 3.0732331|        1.9267669|        0|
|   373|    471|     5| 3.4007857|        1.5992143|        0|
|   357|    471|     3|  3.180214|       0.18021393|        1|
|   159|   1088|     4|  2.875018|        1.1249821|        0|
|   606|   1088|     3| 2.9591813|       0.04081869|        1|
|    47|   1088|     4| 2.9026241|        1.0973759|        0|
|   169|   1088|     4|  3.768289|       0.23171091|   

In [60]:
predictions.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|        prediction| rating-prediction|         good_pred|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|             19507|             19507|             19507|             19507|             19507|             19507|
|   mean| 320.7330701799354|17852.724252832315|3.3786333111190854| 3.161560874994356|0.7160302898277231|0.4368688163223458|
| stddev|183.01299415268582| 33082.92588926088|1.0898667880579735|0.7004750676669234|0.5747069787353398|0.4960111550082274|
|    min|                 1|                 1|                 0|               0.0|      3.1232834E-5|                 0|
|    max|               610|            188301|                 5|         5.3274026|               5.0|                 1|
+-------

In [66]:
predictions.groupBy().avg('good_pred').show()

+------------------+
|    avg(good_pred)|
+------------------+
|0.4368688163223458|
+------------------+



Cela signifie que nous notre modèle nous permet de réaliser environ **44 %** de prédictions correctes, *i.e.* à plus ou moins 0.5 étoile près, sur le dataset d'entraînement.

### 11. Jointure avec les données des titres des films

In [68]:
path_data = "/home/p5hngk/Downloads/GitHub/SD_701---Data_Mining/ml-latest-small"

df_movies = spark.read.format("csv").option("header", "true").load(path_data+"/movies.csv")
df_movies.show(10)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
+-------+--------------------+--------------------+
only showing top 10 rows



In [88]:
predictions2 = predictions.join(df_movies, on=['movieId'], how='left_outer')

In [89]:
predictions2.orderBy(new.movieId.asc()).show(50)

+-------+------+------+----------+-----------------+---------+----------------+--------------------+
|movieId|userId|rating|prediction|rating-prediction|good_pred|           title|              genres|
+-------+------+------+----------+-----------------+---------+----------------+--------------------+
|      1|   596|     4| 3.2742593|        0.7257407|        0|Toy Story (1995)|Adventure|Animati...|
|      1|   525|     4| 3.2849152|        0.7150848|        0|Toy Story (1995)|Adventure|Animati...|
|      1|   332|     4| 3.4803617|        0.5196383|        0|Toy Story (1995)|Adventure|Animati...|
|      1|   156|     4| 3.4410477|       0.55895233|        0|Toy Story (1995)|Adventure|Animati...|
|      1|   534|     4|  3.994771|     0.0052289963|        1|Toy Story (1995)|Adventure|Animati...|
|      1|    71|     5| 3.7085998|        1.2914002|        0|Toy Story (1995)|Adventure|Animati...|
|      1|   541|     3| 3.7809315|        0.7809315|        0|Toy Story (1995)|Adventure|An

### 10. Réalisation de recommandations

In [67]:
def make_recommendations(self, fav_movie, n_recommendations):
    """
    make top n movie recommendations
    Parameters
    ----------
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations
    """
    # get data
    movie_user_mat_sparse, hashmap = self._prep_data()
    # get recommendations
    raw_recommends = self._inference(
        self.model, movie_user_mat_sparse, hashmap,
        fav_movie, n_recommendations)
    # print results
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance '
              'of {2}'.format(i+1, reverse_hashmap[idx], dist))

In [101]:
# Generate top 10 movie recommendations for each user
userRecs = best_model.recommendForAllUsers(10).show(10)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[7842, 4.5132], ...|
|   463|[[7842, 5.144905]...|
|   496|[[7842, 4.533911]...|
|   148|[[8477, 4.655421]...|
|   540|[[7842, 5.5218005...|
|   392|[[8477, 4.842426]...|
|   243|[[67618, 5.862327...|
|    31|[[33649, 5.740833...|
|   516|[[4429, 4.7700167...|
|   580|[[7842, 5.212381]...|
+------+--------------------+
only showing top 10 rows



In [102]:
# Generate top 10 user recommendations for each movie
movieRecs = best_model.recommendForAllItems(10).show(10)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[53, 4.7668843],...|
|   4900|[[53, 4.532554], ...|
|   5300|[[53, 4.2717924],...|
|   6620|[[191, 4.574413],...|
|   7340|[[53, 3.979438], ...|
|  32460|[[53, 5.301957], ...|
|  54190|[[53, 5.637684], ...|
|    471|[[53, 4.8591766],...|
|   1591|[[37, 3.6290343],...|
|   1342|[[171, 3.5377822]...|
+-------+--------------------+
only showing top 10 rows



In [None]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = best_model.recommendForUserSubset(users, 10)

In [None]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = best_model.recommendForItemSubset(movies, 10)

------------------------------------
-----------------------------------