--------------------------------
# <center>- Spark Project - SD 701 - Data Mining-</center>

* https://spark.apache.org/docs/2.1.0/ml-classification-regression.html
* https://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
* https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html

----------------------------
* https://medium.com/@patelneha1495/recommendation-system-in-python-using-als-algorithm-and-apache-spark-27aca08eaab3
* https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-2-alternating-least-square-als-matrix-4a76c58714a1

----------------------------------

In [41]:
import pyspark
#from pyspark import SparkConf, SparkContext

#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.regression import LinearRegression

from pyspark.sql import Row, SparkSession

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


### 1. Initialisation de la session spark

In [2]:
# set up environment
#conf = SparkConf() \
#    .setAppName("MovieLensALS") \
#    .set("spark.executor.memory", "2g")
#sc = SparkContext(conf=conf)

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

### 2. Chargement et préparation des données d'évaluations des films

In [3]:
path_data = "/home/p5hngk/Downloads/GitHub/SD_701---Data_Mining/ml-latest-small"

df_ratings = spark.read.format("csv").option("header", "true").load(path_data+"/ratings.csv")
df_ratings.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



In [6]:
df_ratings1 = df_ratings.select(df_ratings['userId'], df_ratings['movieId'], df_ratings['rating'])
df_ratings1.show(10)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
+------+-------+------+
only showing top 10 rows



In [7]:
df_ratings1.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)



Avant de réaliser un modèle ALS, il faut que toutes nos données soient au format integer pour pouvoir réaliser les calculs, ce qui n'est pas le cas actuellement. Occupons-nous donc dans un premier temps de changer cela .

In [8]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [11]:
from pyspark.sql.types import IntegerType

df_ratings1 = df_ratings1.withColumn("userId", df_ratings1["userId"].cast(IntegerType())) \
                .withColumn("movieId", df_ratings1["movieId"].cast(IntegerType())) \
                .withColumn("rating", df_ratings1["rating"].cast(IntegerType()))

df_ratings1.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)



### 3. Création des jeux d'entraînement et de test

In [13]:
(training,test) = df_ratings1.randomSplit([0.8, 0.2])

### 4. Création du modèle ALS et entraînement du modèle

In [14]:
als = ALS(maxIter=5, regParam=0.09, rank=25, userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop", nonnegative=True)
model = als.fit(training)

### 5. Evaluation du modèle

In [16]:
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

RMSE = 0.923
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   602|    471|     4| 3.3281708|
|   462|    471|     2|  2.720675|
|   217|    471|     2| 2.6050868|
|   171|    471|     3| 4.2653756|
|   287|    471|     4|  2.048118|
|   469|    471|     5| 3.3836212|
|   307|    833|     1| 0.9147462|
|   177|   1088|     3| 3.4987407|
|   554|   1088|     5| 3.5295796|
|   286|   1088|     3| 2.9365282|
+------+-------+------+----------+
only showing top 10 rows



In [21]:
print(f"RMSE = {round(rmse,3)}")
predictions.show(10)

RMSE = 0.923
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   602|    471|     4| 3.3281708|
|   462|    471|     2|  2.720675|
|   217|    471|     2| 2.6050868|
|   171|    471|     3| 4.2653756|
|   287|    471|     4|  2.048118|
|   469|    471|     5| 3.3836212|
|   307|    833|     1| 0.9147462|
|   177|   1088|     3| 3.4987407|
|   554|   1088|     5| 3.5295796|
|   286|   1088|     3| 2.9365282|
+------+-------+------+----------+
only showing top 10 rows



In [25]:
predictions.describe().show()

+-------+------------------+------------------+-----------------+------------------+
|summary|            userId|           movieId|           rating|        prediction|
+-------+------------------+------------------+-----------------+------------------+
|  count|             19335|             19335|            19335|             19335|
|   mean|   324.51874838376|17180.324385828808|3.369847426945953|3.1990105139020804|
| stddev|181.40932579600593|32340.785112503912|1.085566007162708|0.7425755555118295|
|    min|                 1|                 1|                0|               0.0|
|    max|               610|            187595|                5|          5.506267|
+-------+------------------+------------------+-----------------+------------------+



In [32]:
predictions.orderBy(predictions.userId.asc()).show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|   3273|     5| 2.7050326|
|     1|    441|     4| 4.5551343|
|     1|    163|     5| 4.0466523|
|     1|    590|     4| 4.2134485|
|     1|   3740|     4| 4.7146983|
|     1|   3489|     4| 3.5443125|
|     1|   2492|     4| 3.2470827|
|     1|   1089|     5|  4.778551|
|     1|   2193|     4| 4.6384244|
|     1|   1240|     5| 4.3902125|
|     1|   2985|     4| 3.9428837|
|     1|   2414|     3| 3.9190807|
|     1|   3439|     4| 3.1636713|
|     1|   1500|     4| 4.3516064|
|     1|   2640|     4|  4.639884|
|     1|   1777|     4| 4.0192113|
|     1|   2542|     5|  4.635328|
|     1|   1927|     5| 4.4290094|
|     1|    367|     4|  3.847385|
|     1|      3|     4|  3.852513|
+------+-------+------+----------+
only showing top 20 rows



In [None]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
predictions.registerTempTable("predictions_table")

In [56]:
sqlContext.sql('SELECT * FROM predictions_table WHERE userId = 1 ORDER BY movieId ASC').show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|      3|     4|  3.852513|
|     1|     50|     5| 4.5171766|
|     1|    163|     5| 4.0466523|
|     1|    367|     4|  3.847385|
|     1|    441|     4| 4.5551343|
|     1|    457|     5| 4.4773464|
|     1|    480|     4|  4.376313|
|     1|    527|     5|  4.645906|
|     1|    590|     4| 4.2134485|
|     1|   1042|     4| 3.8467429|
|     1|   1080|     5|  4.726887|
|     1|   1089|     5|  4.778551|
|     1|   1097|     5| 4.4196105|
|     1|   1213|     5| 4.7464733|
|     1|   1240|     5| 4.3902125|
|     1|   1282|     5| 4.4678564|
|     1|   1408|     3| 4.8702917|
|     1|   1500|     4| 4.3516064|
|     1|   1777|     4| 4.0192113|
|     1|   1805|     4| 3.6425023|
+------+-------+------+----------+
only showing top 20 rows



### 6. Amélioration du modèle

Pouvons-nous améliorer notre modèle avec de meilleurs hyperparamètres ?

In [61]:
def tune_ALS(training, test, 10, regParams, ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    validation_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    maxIter: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of float, one dimension of hyper-param tuning grid
    
    Return
    ------
    The best fitted ALS model with lowest RMSE score on validation data
    """
    # initial
    min_error = float('inf')
    best_rank = -1
    best_regularization = 0
    best_model = None
    for rank in ranks:
        for reg in regParams:
            # get ALS model
            als = ALS().setMaxIter(maxIter).setRank(rank).setRegParam(reg)
            # train ALS model
            model = als.fit(train_data)
            # evaluate the model by computing the RMSE on the validation data
            predictions = model.transform(validation_data)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print('{} latent factors and regularization = {}: '
                  'validation RMSE is {}'.format(rank, reg, rmse))
            if rmse < min_error:
                min_error = rmse
                best_rank = rank
                best_regularization = reg
                best_model = model
    print('\nThe best model has {} latent factors and '
          'regularization = {}'.format(best_rank, best_regularization))
    return best_model

In [None]:
tune_ALS(train_data, validation_data, maxIter, regParams, ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    validation_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    maxIter: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of float, one dimension of hyper-param tuning grid
    
    Return

### 7. Réalisation de recommandations

In [58]:
# Generate top 10 movie recommendations for each user
userRecs = best_model.recommendForAllUsers(10).show(10)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[8477, 4.6271734...|
|   463|[[7842, 4.6472483...|
|   496|[[4794, 4.539608]...|
|   148|[[8477, 4.8804374...|
|   540|[[171495, 5.48740...|
|   392|[[8477, 5.045897]...|
|   243|[[945, 5.8989525]...|
|    31|[[3200, 5.7575874...|
|   516|[[4429, 4.8906627...|
|   580|[[6300, 4.9238358...|
+------+--------------------+
only showing top 10 rows



In [60]:
# Generate top 10 user recommendations for each movie
movieRecs = best_model.recommendForAllItems(10).show(10)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[389, 4.588832],...|
|   4900|[[441, 4.5939], [...|
|   5300|[[74, 3.9770067],...|
|   6620|[[191, 4.949128],...|
|   7340|[[544, 4.2060995]...|
|  32460|[[543, 5.238191],...|
|  54190|[[544, 5.898651],...|
|    471|[[53, 4.7354355],...|
|   1591|[[37, 4.157173], ...|
|   1342|[[258, 3.9858599]...|
+-------+--------------------+
only showing top 10 rows



In [None]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = best_model.recommendForUserSubset(users, 10)

In [None]:
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = best_model.recommendForItemSubset(movies, 10)

------------------------------------
-----------------------------------