In [1]:
from pyspark import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.sql import SparkSession, functions, types
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# Start (or reuse) a Spark session on the "local" master for this notebook.
sessionBuilder = SparkSession.builder.master("local").appName("Recommendation")
spark = sessionBuilder.getOrCreate()

# Both CSV files have a header row; let Spark infer the column types.
csvOptions = {"inferSchema": True, "header": True}
movies = spark.read.csv("./movies.csv", **csvOptions)
ratings = spark.read.csv("./ratings.csv", **csvOptions)

2022-02-04 09:35:40,077 WARN util.Utils: Your hostname, DESKTOP-LR48F7J resolves to a loopback address: 127.0.1.1; using 172.30.172.147 instead (on interface eth0)
2022-02-04 09:35:40,078 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-02-04 09:35:41,529 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Peek at the first five rows of each table; movie columns are printed untruncated.
movies.show(n=5, truncate=False)
ratings.show(n=5)

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|   

First, on the ratings table, I use **groupBy()** to group the rows by movieId and **count()** to count the number of ratings for each movie, which produces a new column "count". I then use **orderBy()** with **functions.desc()** to sort by "count" in descending order, and **limit()** to keep the ten rows with the highest counts. This gives the ids of the top-10 movies with the largest number of ratings.

Finally, I left-join the ids of the top-10 movies with the movies table on the "movieId" column, and then use **select()** to pick the "title" column, which gives the names of the top-10 movies.

In [4]:
# Rating count per movie, keeping the ten most-rated movies.
# movieId is a secondary sort key so that ties in "count" are broken deterministically.
topCounts = (
    ratings.groupBy("movieId")
    .count()
    .orderBy(functions.desc("count"), "movieId")
    .limit(10)
)
topIds = topCounts.select("movieId")

# A join gives no guarantee about the order of its output rows, so the ranking
# is re-applied after the titles have been attached.
topNames = (
    topCounts.join(movies, on="movieId", how="left")
    .orderBy(functions.desc("count"), "movieId")
    .select("title")
)
topNames.show(truncate = False)

+-----------------------------------------+
|title                                    |
+-----------------------------------------+
|Forrest Gump (1994)                      |
|Shawshank Redemption, The (1994)         |
|Pulp Fiction (1994)                      |
|Silence of the Lambs, The (1991)         |
|Matrix, The (1999)                       |
|Star Wars: Episode IV - A New Hope (1977)|
|Jurassic Park (1993)                     |
|Braveheart (1995)                        |
|Terminator 2: Judgment Day (1991)        |
|Schindler's List (1993)                  |
+-----------------------------------------+



Since each movie may belong to multiple genres, I first use **functions.split()** and **functions.explode()** to split every movie that belongs to multiple genres into multiple rows, each row containing exactly one genre.

Then I used **groupBy()** and **avg()** on the ratings table to average all ratings for each movie.

After this, I use **join()** to combine the two previous results, so that each row contains the movie's title, one of its genres, and the average rating users gave it.

Finally, for each genre, I use **filter()** to keep only the movies of that genre, sort them in descending order of average rating (as in the previous question), and take the titles of the top 10.

For simplicity, only the results for the first three genres are printed.

In [5]:
# One row per (movie, genre): split the pipe-delimited genres string and explode it.
genreList = functions.split("genres", "\\|")
moviesG = movies.withColumn("genres", functions.explode(genreList))

# Mean rating of every rated movie; the aggregate column is named "avg(rating)".
ratingsAvg = ratings.groupBy("movieId").avg("rating")

# Left join: every (movie, genre) row is kept even when the movie has no ratings.
sameMovie = moviesG["movieId"] == ratingsAvg["movieId"]
moviesGR = moviesG.join(ratingsAvg, sameMovie, "left").select("title", "genres", "avg(rating)")

# Print the ten best-rated titles for the first three genres returned by distinct().
genreRows = moviesGR.select("genres").distinct().collect()
for genreRow in genreRows[:3]:
    genreName = genreRow[0]
    print(genreName)
    topOfGenre = (
        moviesGR.filter(moviesGR["genres"] == genreName)
        .orderBy(functions.desc("avg(rating)"))
        .limit(10)
        .select("title")
    )
    topOfGenre.show(truncate=False)

Crime
+-------------------------------------------------------+
|title                                                  |
+-------------------------------------------------------+
|Ex Drummer (2007)                                      |
|Villain (1971)                                         |
|Mother (Madeo) (2009)                                  |
|Going Places (Valseuses, Les) (1974)                   |
|12 Angry Men (1997)                                    |
|American Friend, The (Amerikanische Freund, Der) (1977)|
|Sisters (Syostry) (2001)                               |
|Little Murders (1971)                                  |
|Faster (2010)                                          |
|Decalogue, The (Dekalog) (1989)                        |
+-------------------------------------------------------+

Romance
+----------------------------------------------------------------+
|title                                                           |
+--------------------------------------

I first select the "movieId" and "userId" columns of the ratings table, and the ids of the first 100 movies in the movies table.

I use two nested for loops to iterate over all movie pairs; the inner loop starts just after the position reached by the outer loop, so that no pair is counted twice. For each pair, I use **filter()** to find the ratings of the first and of the second movie, an inner join on the user id to find the users who rated both movies, and **count()** to count those users.

For simplicity, only the first 5 results are printed.

In [6]:
movieUserId = ratings.select("movieId", "userId")
frist100MovieId = movies.select("movieId").limit(100).collect()

# Number of movie pairs to print. The full sweep over 100 movies is 4950 pairs,
# each needing its own Spark count, so only a handful are shown.
maxPairsShown = 5

count = 0
for i, movieId1 in enumerate(frist100MovieId):
    # Stop the whole sweep once enough pairs were printed (instead of always
    # leaving the outer loop after the first movie).
    if count >= maxPairsShown:
        break
    # Ratings of the first movie do not change inside the inner loop: build them once.
    userMovieId1 = movieUserId.filter(movieUserId["movieId"] == movieId1[0])
    for movieId2 in frist100MovieId[i + 1:]:
        if count >= maxPairsShown:
            break
        userMovieId2 = movieUserId.filter(movieUserId["movieId"] == movieId2[0])
        # Both sides derive from the same DataFrame; joining on the column name
        # avoids an ambiguous self-join condition.
        commonSupport = userMovieId1.join(userMovieId2, on="userId", how="inner").count()
        print("(" + str(movieId1[0]) + "," + str(movieId2[0]) + ") : " + str(commonSupport))
        count += 1

(1,2) : 68
(1,3) : 32
(1,4) : 2
(1,5) : 32
(1,6) : 58


I use **randomSplit()** to split the data into a training set and a test set.
I build the recommendation model with **ALS()**, using explicit feedback.
The model is trained on the training set and then used to predict the users' ratings for the movies in the test set.

In [7]:
# Fixed seed so that the train/test split and the ALS factor initialisation
# (and therefore the predictions and the RMSE) are the same on every run.
SEED = 42

# 80% / 20% split of the ratings.
trainingSet, testSet = ratings.randomSplit([8., 2.], seed=SEED)

# Explicit-feedback ALS; coldStartStrategy="drop" removes test rows whose
# prediction would be NaN.
alsExplicit = ALS(maxIter=5, regParam=0.01, userCol="userId",
                  itemCol="movieId", ratingCol="rating", coldStartStrategy="drop",
                  seed=SEED)
modelExplicit = alsExplicit.fit(trainingSet)
predictionsExplicit = modelExplicit.transform(testSet)
predictionsExplicit.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   148|   4308|   4.0|1482548613|  1.685884|
|   148|   5952|   3.0|1482548769| 2.5508559|
|   148|   8368|   4.0|1482548676| 3.9191563|
|   148|  40629|   5.0|1482548578| 3.3270745|
|   148|  50872|   3.0|1482548504|  3.700086|
|   148|  54001|   4.0|1482548674| 4.3208256|
|   148|  69757|   3.5|1482548519| 4.6425567|
|   148| 110102|   4.0|1482548669|  2.855833|
|   148| 116797|   4.5|1482550073| 2.9059036|
|   148| 152081|   4.0|1482548452|  4.472422|
|   148| 157296|   3.0|1482548671| 5.1432962|
|   463|    527|   4.0|1145460304|  4.421393|
|   463|   1088|   3.5|1145460096| 4.7167683|
|   463|   3977|   2.0|1145459381| 3.8309066|
|   463|  36529|   4.5|1145460284| 4.1542473|
|   471|    296|   4.0|1496671827| 4.2878375|
|   471|   8874|   3.5|1496668982|  5.242627|
|   496|    904|   5.0|1415166605| 3.7519712|
|   496|   2394|   3.5|1415165480|

Evaluate the model based on the root mean squared error of rating predictions

In [9]:
# Root-mean-square error between the "rating" and "prediction" columns.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmseExplicit = evaluator.evaluate(predictionsExplicit)
print(f"Explicit:Root-mean-square error = {rmseExplicit}")

Explicit:Root-mean-square error = 1.074742200139716


For a given user id, **getRecommend()** shows the top 5 movies with the highest predicted ratings for that user, thus completing the recommendation.

In [10]:
def getRecommend(userId, numItems=5):
    """Print the titles of the movies with the highest predicted rating for a user.

    Candidates are the rows of ``predictionsExplicit``, i.e. the model's
    predictions on the test split, so only movies that appear for this user
    in the test split can be shown.

    Args:
        userId: Id of the user to recommend for.
        numItems: Maximum number of titles to show (default 5).

    Returns:
        None. The titles are printed with ``DataFrame.show``, best prediction first.
    """
    userPredictions = predictionsExplicit.filter(predictionsExplicit["userId"] == userId)
    recommendIds = userPredictions.orderBy(functions.desc("prediction")).limit(numItems)

    # A join gives no guarantee about the order of its output rows, so the
    # ranking is re-applied after the titles have been attached.
    recommendMovies = (
        recommendIds.join(movies, on="movieId", how="left")
        .orderBy(functions.desc("prediction"))
        .select("title")
    )

    recommendMovies.show(truncate=False)

In [11]:
# Show the recommendations for the user with id 5.
getRecommend(userId=5)

+--------------------------------------+
|title                                 |
+--------------------------------------+
|Shadowlands (1993)                    |
|Dances with Wolves (1990)             |
|Usual Suspects, The (1995)            |
|Snow White and the Seven Dwarfs (1937)|
|Once Were Warriors (1994)             |
+--------------------------------------+



In [38]:
# Global mean rating over the training split.
mu = trainingSet.agg(functions.avg("rating")).first()[0]

# Distinct ids in ascending order, so the last row carries the largest id.
userIds = trainingSet.select("userId").distinct().sort("userId")
movieIds = trainingSet.select("movieId").distinct().sort("movieId")
maxUserId = userIds.tail(1)[0][0]
maxMovieId = movieIds.tail(1)[0][0]

# NOTE(review): this rebinds `ratings`, which the earlier cells use as the Spark
# ratings DataFrame, to a NumPy array; re-running those cells afterwards will not work.
# NOTE(review): the array is dense with shape (largest userId, largest movieId).
# If raw ids are meant to be used directly as indices, the largest id itself
# would be out of range - confirm the intended indexing.
ratings = np.zeros((maxUserId, maxMovieId))

# numUsers = trainingSet.select("userId").distinct().count()
# numMovies= trainingSet.select("movieId").distinct().count()
# biases = np.ones([numUsers, numMovies])

In [41]:
# Largest movieId in the training split: movieIds is sorted ascending, so it is the last row.
largestMovieRow = movieIds.tail(1)[0]
largestMovieRow[0]

193587