### Step 1: Data Loading and Preparation

In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

sc = SparkContext.getOrCreate()

if (sc is None):
    sc = SparkContext(master="local[*]", appName="Collaborative Filtering")
spark = SparkSession(sparkContext=sc)

lines = spark.read.text("sample_movielens_ratings.txt").rdd

parts = lines.map(lambda row: row.value.split("::"))

ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

ratings.show()

+------+-------+------+-------------+
|userId|movieId|rating|    timestamp|
+------+-------+------+-------------+
|     0|      2|   3.0|1.424380312E9|
|     0|      3|   1.0|1.424380312E9|
|     0|      5|   2.0|1.424380312E9|
|     0|      9|   4.0|1.424380312E9|
|     0|     11|   1.0|1.424380312E9|
|     0|     12|   2.0|1.424380312E9|
|     0|     15|   1.0|1.424380312E9|
|     0|     17|   1.0|1.424380312E9|
|     0|     19|   1.0|1.424380312E9|
|     0|     21|   1.0|1.424380312E9|
|     0|     23|   1.0|1.424380312E9|
|     0|     26|   3.0|1.424380312E9|
|     0|     27|   1.0|1.424380312E9|
|     0|     28|   1.0|1.424380312E9|
|     0|     29|   1.0|1.424380312E9|
|     0|     30|   1.0|1.424380312E9|
|     0|     31|   1.0|1.424380312E9|
|     0|     34|   1.0|1.424380312E9|
|     0|     37|   1.0|1.424380312E9|
|     0|     41|   2.0|1.424380312E9|
+------+-------+------+-------------+
only showing top 20 rows



### Step 2: Machine Learning Pipeline

In [2]:
(training, test) = ratings.randomSplit([0.8, 0.2],seed =100)

# Build the recommendation model using ALS on the training data
# Cold start strategy is set to 'drop' to ensure we don't get NaN (Not a Number) evaluation metrics

als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data for predicting rating
predictions = model.transform(test)

### Step 3: Evaluation

In [3]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.766045724826976


### Step 4: Making Movie Recommendations

In [4]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10).show(truncate = False)

+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                          |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28    |[[40, 5.4707775], [92, 4.9738865], [51, 4.714526], [49, 3.9794376], [18, 3.8451655], [55, 3.7678719], [26, 3.6868691], [82, 3.624955], [30, 3.5945625], [2, 3.4748125]]  |
|26    |[[51, 7.551892], [90, 5.8758507], [80, 5.651181], [88, 5.2798653], [92, 5.16809], [23, 5.0939965], [22, 5.002135], [94, 4.960957], [7, 4.9602623], [38, 4.8586726]]      |
|27    |[[18, 3.9849849], [74, 3.2772834], [13, 3.2210474], [27, 3.2173297], [79, 3.1399324], [80, 3.0479

In [5]:
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10).show(truncate = False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|recommendations                                                                                                                                                         |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|31     |[[12, 3.9432912], [14, 3.017304], [8, 2.9304905], [6, 2.8512523], [7, 2.732638], [22, 2.2150097], [3, 2.1798413], [25, 2.1441605], [9, 2.0839198], [23, 2.0378706]]     |
|85     |[[16, 5.0969577], [8, 4.7884884], [10, 4.172562], [14, 4.026385], [9, 3.9943254], [26, 3.8810866], [7, 3.7560155], [24, 3.7509367], [6, 3.4813437], [22, 3.057455]]     |
|65     |[[23, 2.9609385], [5, 2.0597718], [15, 1.993078], [12, 1.8721346], [8, 1.8677015], [3, 1.8656199

In [29]:
from pyspark.sql.functions import udf
from pyspark.sql.functions import col
def recommendedArtists(als_model,user_id,limit):
    #get the recommendations
    test = als_model.recommendForAllUsers(limit).filter(col('userid')==user_id).select("recommendations").collect() 
    #create a dataframe for the top artist list
    top_artist = spark.createDataFrame(test[0][0])
    #join the top_artist dataframe with the artist master dataframe to include the artist_name
    final = top_artist.join(ratings,  top_artist.movieId == ratings.movieId, how ='inner')\
                .select(top_artist.movieId)
    #        .select(top_artist.movieId, top_artist.rating.alias("TR"), ratings.movieId,ratings.rating.alias("RR") )
    return final

In [31]:
recommendedArtists(model,17,2).show(truncate=False)

+-------+
|movieId|
+-------+
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|34     |
|46     |
|46     |
|46     |
|46     |
|46     |
|46     |
|46     |
|46     |
|46     |
+-------+
only showing top 20 rows



In [16]:
test = model.recommendForAllUsers(20).filter(col('userid')==18).select("recommendations").collect()

In [17]:
test

[Row(recommendations=[Row(movieId=90, rating=6.760019779205322), Row(movieId=28, rating=4.902186393737793), Row(movieId=38, rating=4.818648338317871), Row(movieId=39, rating=4.686800479888916), Row(movieId=75, rating=4.310618877410889), Row(movieId=71, rating=4.0414934158325195), Row(movieId=54, rating=4.0025634765625), Row(movieId=11, rating=3.983522891998291), Row(movieId=44, rating=3.9122049808502197), Row(movieId=33, rating=3.641186475753784), Row(movieId=19, rating=3.2284750938415527), Row(movieId=81, rating=3.1808290481567383), Row(movieId=69, rating=3.1508402824401855), Row(movieId=66, rating=3.083848476409912), Row(movieId=76, rating=3.0317230224609375), Row(movieId=53, rating=2.9207379817962646), Row(movieId=83, rating=2.898651361465454), Row(movieId=94, rating=2.867443084716797), Row(movieId=8, rating=2.850247383117676), Row(movieId=80, rating=2.8122994899749756)])]