In [1]:
from pyspark.sql import SparkSession

In [2]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

In [4]:
ratings_file = "/home/jovyan/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema=True, header=True)

In [5]:
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [6]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])

In [7]:
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [8]:
ratings_df.select("rating").describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423535|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [9]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [10]:
from pyspark.ml.recommendation import ALS

In [11]:
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"   # 학습하지 못한 데이터를 만났을 때 어떻게 대처해야하는가
)

In [12]:
model = als.fit(train_df)

In [13]:
predictions = model.transform(test_df)   # 학습 모델에 대한 test

In [14]:
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    12|    471|   4.0|  3.600957|
|    76|   1959|   5.0|  3.605424|
|    85|   1088|   2.0|  3.745204|
|    91|   8638|   3.0| 3.6412697|
|   132|   1238|   5.0|  3.384017|
|   233|   1580|   5.0| 3.7534525|
|   243|   1580|   3.0|  2.705996|
|   296|   1580|   3.0|  2.717266|
|   321|   3175|   3.0| 3.2876143|
|   321|   6620|   3.5|   3.72178|
|   346|    471|   5.0|  3.736539|
|   368|  54190|   4.0|  3.542121|
|   409|   8638|   5.0|  4.075141|
|   416|   1580|   3.0| 3.1203487|
|   416|   1645|   4.0| 3.2283409|
|   472|   3918|   3.0| 2.4157932|
|   497|   2366|   4.0| 3.8563902|
|   501|   1580|   5.0| 3.9781423|
|   513|  44022|   5.0| 4.3278885|
|   548|   5803|   2.5| 2.7255204|
+------+-------+------+----------+
only showing top 20 rows



In [15]:
predictions.select('rating', 'prediction').describe().show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4996925|           4996925|
|   mean|3.5341806210819655|3.4304302104571276|
| stddev|1.0608390722194045|0.6447696311975811|
|    min|               0.5|        -0.8891341|
|    max|               5.0|          6.814935|
+-------+------------------+------------------+



In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol='rating', predictionCol='prediction')

In [17]:
rmse = evaluator.evaluate(predictions)

In [18]:
print(rmse)

0.8083234516279967


In [19]:
model.recommendForAllUsers(3).show()   # 유저 별 top3개의 아이템 추천

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    12|[{194434, 5.39732...|
|    22|[{199187, 7.49332...|
|    26|[{203882, 5.33347...|
|    27|[{203882, 6.17983...|
|    28|[{194434, 7.68741...|
|    31|[{203882, 3.99617...|
|    34|[{77344, 6.42055}...|
|    44|[{194434, 6.53651...|
|    47|[{203882, 5.61025...|
|    53|[{192089, 6.41560...|
|    65|[{126737, 5.70968...|
|    76|[{194434, 6.02048...|
|    78|[{77344, 6.704458...|
|    81|[{203882, 4.43484...|
|    85|[{203882, 6.10054...|
|    91|[{203882, 5.38347...|
|    93|[{120821, 6.19903...|
|   101|[{203882, 5.30982...|
|   103|[{194434, 6.31353...|
|   108|[{194434, 5.08309...|
+------+--------------------+
only showing top 20 rows



In [20]:
model.recommendForAllItems(3).show()   # 아이템 별 top3명의 유저 추천

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     12|[{87426, 5.216188...|
|     22|[{87426, 5.230874...|
|     26|[{105801, 5.03825...|
|     27|[{87426, 5.576849...|
|     28|[{105801, 5.46533...|
|     31|[{87426, 5.328863...|
|     34|[{32202, 5.170934...|
|     44|[{87426, 5.272593...|
|     47|[{156318, 5.57046...|
|     53|[{103920, 5.32841...|
|     65|[{87426, 5.049478...|
|     76|[{149507, 5.05941...|
|     78|[{142811, 4.73541...|
|     81|[{142811, 4.89259...|
|     85|[{142811, 4.98602...|
|     93|[{87426, 5.017465...|
|    101|[{142811, 5.08284...|
|    103|[{96471, 5.084518...|
|    108|[{142811, 5.30106...|
|    111|[{84907, 5.439813...|
+-------+--------------------+
only showing top 20 rows



But, 실제 서비스에서는 보통 특정 유저를 위한 추천을 불러오는 api가 쓰임

In [21]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')

users_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [22]:
user_recs = model.recommendForUserSubset(users_df, 5)

In [23]:
movies_list = user_recs.collect()[0].recommendations

In [24]:
recs_df = spark.createDataFrame(movies_list)
recs_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 126737|5.709682464599609|
| 205277|5.689651012420654|
| 188733|5.618760108947754|
| 152986|5.603871822357178|
| 194434|5.516491413116455|
+-------+-----------------+



In [25]:
movies_file = "/home/jovyan/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movies_file}", inferSchema=True, header=True)

In [26]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [27]:
recs_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [28]:
query = """
SELECT *
FROM
    movies JOIN recommendations
    ON movies.movieId = recommendations.movieId
ORDER BY
    rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+--------------------+-------+-----------------+
|movieId|               title|              genres|movieId|           rating|
+-------+--------------------+--------------------+-------+-----------------+
| 126737|Dark Dungeons (2014)|       Drama|Fantasy| 126737|5.709682464599609|
| 205277|   Inside Out (1991)|Comedy|Drama|Romance| 205277|5.689651012420654|
| 188733|Five Cartridges (...|           Drama|War| 188733|5.618760108947754|
| 152986|Bose: The Forgott...|        Action|Drama| 152986|5.603871822357178|
| 194434|   Adrenaline (1990)|  (no genres listed)| 194434|5.516491413116455|
+-------+--------------------+--------------------+-------+-----------------+



In [29]:
def get_recommendations(user_id, num_recs):
    users_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_recs_df = model.recommendForUserSubset(users_df, num_recs)
    
    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    recommended_movies = spark.sql(query)
    return recommended_movies

In [30]:
recs = get_recommendations(456, 10)

In [31]:
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,126737,Dark Dungeons (2014),Drama|Fantasy,126737,5.709682
1,205277,Inside Out (1991),Comedy|Drama|Romance,205277,5.689651
2,188733,Five Cartridges (1960),Drama|War,188733,5.61876
3,152986,Bose: The Forgotten Hero (2005),Action|Drama,152986,5.603872
4,194434,Adrenaline (1990),(no genres listed),194434,5.516491


In [32]:
spark.stop()