In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, mean, udf, lit, current_timestamp, unix_timestamp, array_contains, split

import pandas as pd
import math
import mlflow
import os

In [2]:
# Instantiate the Spark session.
#
# The PostgreSQL JDBC driver jar is put on the classpath via "spark.jars".
# Its location can be overridden with the POSTGRES_JDBC_JAR environment variable;
# when the variable is unset the original hard-coded path is used.

from pyspark.conf import SparkConf

POSTGRES_JDBC_JAR = os.environ.get(
    "POSTGRES_JDBC_JAR",
    "/home/avani/UMass/Fall 2022/CS 532/Project/postgresql-42.5.0.jar",
)

sparkConf = SparkConf()
sparkConf.setAppName("My app").set("spark.jars", POSTGRES_JDBC_JAR)

# NOTE(review): the session below uses master 'local[*]'; executor and
# dynamic-allocation settings presumably have little or no effect in local
# mode — confirm before relying on them.
sparkConf.set("spark.dynamicAllocation.enabled", "true")
sparkConf.set("spark.executor.cores", 8)
sparkConf.set("spark.dynamicAllocation.minExecutors","1")
sparkConf.set("spark.dynamicAllocation.maxExecutors","5000")
sparkConf.set("spark.executor.memory", "32g")
sparkConf.set("spark.ui.port","4050")
sparkConf.set("spark.memory.fraction", 0.7)

# getOrCreate() returns an already-running session if there is one.
spark = SparkSession.builder.master('local[*]').config(conf=sparkConf).getOrCreate()

22/12/04 03:34:06 WARN Utils: Your hostname, avani-HP resolves to a loopback address: 127.0.1.1; using 192.168.0.9 instead (on interface wlo1)
22/12/04 03:34:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/12/04 03:34:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
'''
In case you want to try running, you can read from csv files directly and skip the next 3 cells.
'''

# Alternative data source: load the four MovieLens tables from local CSV files
# (header row enabled) instead of PostgreSQL. Uncomment the lines below and
# adjust the paths to use it.
# NOTE(review): no schema / inferSchema option is passed, so the columns would
# presumably be read as strings — confirm before using them numerically.
# movies_df = spark.read.load("/home/avani/UMass/Fall 2022/CS 532/Project/ml-latest/movies.csv", format='csv', header = True)
# ratings_df = spark.read.load("/home/avani/UMass/Fall 2022/CS 532/Project/ml-latest/ratings.csv", format='csv', header = True)
# links_df = spark.read.load("/home/avani/UMass/Fall 2022/CS 532/Project/ml-latest/links.csv", format='csv', header = True)
# tags_df = spark.read.load("/home/avani/UMass/Fall 2022/CS 532/Project/ml-latest/tags.csv", format='csv', header = True)

In [3]:
tables_list = ['movies', 'ratings', 'tags', 'links']

# Maps table name -> DataFrame read over JDBC from "<table>_new".
dataframeList = {}

# Connection settings shared by every table read.
# Credentials can be supplied through POSTGRES_USER / POSTGRES_PASSWORD; the
# original values are kept as defaults so existing setups keep working.
jdbc_common_options = {
    "url": 'jdbc:postgresql://localhost:5432/movielens_dataset',  # jdbc:postgresql://<host>:<port>/<database>
    "user": os.environ.get("POSTGRES_USER", 'postgres'),
    "password": os.environ.get("POSTGRES_PASSWORD", 'postgres'),
    "driver": 'org.postgresql.Driver',
}

# Extra options applied only to the ratings table so it is read in parallel,
# partitioned on userId.
# NOTE(review): lowerBound/upperBound only determine the partition stride; if
# the actual userId range is much smaller than 1..283228, most rows will land
# in a single partition — confirm the bounds against the loaded dataset.
ratings_partition_options = {
    "fetchSize": 1000,
    "partitionColumn": "userId",
    "lowerBound": 1,
    "upperBound": 283228,
    "numPartitions": 32,
}

# read each table
for table in tables_list:
    read_options = dict(jdbc_common_options, dbtable=table + "_new")

    # parameters specified for ratings table
    if table == 'ratings':
        read_options.update(ratings_partition_options)

    dataframeList[table] = spark.read.format("jdbc").options(**read_options).load()

In [4]:
# Print the schema of every loaded table.
for key, value in dataframeList.items():
    print(key + " table")
    # printSchema() writes the schema itself and returns None, so wrapping it
    # in print() would emit a stray "None" after each schema.
    value.printSchema()

movies table
root
 |-- movieid: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

None
ratings table
root
 |-- userid: integer (nullable = true)
 |-- movieid: integer (nullable = true)
 |-- rating: decimal(38,18) (nullable = true)
 |-- timestamp: integer (nullable = true)

None
tags table
root
 |-- userid: integer (nullable = true)
 |-- movieid: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)

None
links table
root
 |-- movieid: integer (nullable = true)
 |-- imdbid: integer (nullable = true)
 |-- tmdbid: integer (nullable = true)

None


In [5]:
# Pull the four per-table DataFrames out of the dict into named variables.
movies_df, ratings_df, links_df, tags_df = (
    dataframeList[table_name]
    for table_name in ('movies', 'ratings', 'links', 'tags')
)

### Show the DataFrames and register them as temp views (which live as long as the Spark session)

In [6]:
# Make movies_df queryable from Spark SQL, then preview its first 20 rows.
movies_df.createOrReplaceTempView("movies_df")
spark.sql(
    "SELECT * FROM movies_df limit 20"
).show(n=20)

[Stage 0:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------------+
|movieid|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

                                                                                

In [7]:
# Make ratings_df queryable from Spark SQL, then preview its first 20 rows.
ratings_df.createOrReplaceTempView("ratings_df")
spark.sql(
    "SELECT * FROM ratings_df limit 20"
).show(n=20)



+------+-------+--------------------+---------+
|userid|movieid|              rating|timestamp|
+------+-------+--------------------+---------+
|     1|      1|4.000000000000000000|964982703|
|     1|      3|4.000000000000000000|964981247|
|     1|      6|4.000000000000000000|964982224|
|     1|     47|5.000000000000000000|964983815|
|     1|     50|5.000000000000000000|964982931|
|     1|     70|3.000000000000000000|964982400|
|     1|    101|5.000000000000000000|964980868|
|     1|    110|4.000000000000000000|964982176|
|     1|    151|5.000000000000000000|964984041|
|     1|    157|5.000000000000000000|964984100|
|     1|    163|5.000000000000000000|964983650|
|     1|    216|5.000000000000000000|964981208|
|     1|    223|3.000000000000000000|964980985|
|     1|    231|5.000000000000000000|964981179|
|     1|    235|4.000000000000000000|964980908|
|     1|    260|5.000000000000000000|964981680|
|     1|    296|3.000000000000000000|964982967|
|     1|    316|3.000000000000000000|964

                                                                                

In [8]:
# Make links_df queryable from Spark SQL, then preview its first 20 rows.
links_df.createOrReplaceTempView("links_df")
spark.sql(
    "SELECT * FROM links_df limit 20"
).show(n=20)

+-------+------+------+
|movieid|imdbid|tmdbid|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
|      6|113277|   949|
|      7|114319| 11860|
|      8|112302| 45325|
|      9|114576|  9091|
|     10|113189|   710|
|     11|112346|  9087|
|     12|112896| 12110|
|     13|112453| 21032|
|     14|113987| 10858|
|     15|112760|  1408|
|     16|112641|   524|
|     17|114388|  4584|
|     18|113101|     5|
|     19|112281|  9273|
|     20|113845| 11517|
+-------+------+------+



In [9]:
# Make tags_df queryable from Spark SQL, then preview its first 20 rows.
tags_df.createOrReplaceTempView("tags_df")
spark.sql(
    "SELECT * FROM tags_df limit 20"
).show(n=20)

+------+-------+-----------------+----------+
|userid|movieid|              tag| timestamp|
+------+-------+-----------------+----------+
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|1456948283|
|    18|  52604|  Anthony Hopkins|

### Register the DataFrames as Spark SQL tables

In [10]:
# Register each DataFrame under the short table name used by the SQL queries
# in the analysis cells. createOrReplaceTempView replaces the deprecated
# registerTempTable and is safe to re-run.
movies_df.createOrReplaceTempView("movies")
ratings_df.createOrReplaceTempView("ratings")
links_df.createOrReplaceTempView("links")
tags_df.createOrReplaceTempView("tags")



### Analyse the data

In [11]:
# Smallest number of ratings given by any user / received by any movie.
# The minimum is aggregated inside Spark so only one value per query is
# returned to the driver, instead of converting the whole grouped table to pandas.
minRating_1 = ratings_df.groupBy("userID").count().agg({"count": "min"}).first()[0]
minRating_2 = ratings_df.groupBy("movieId").count().agg({"count": "min"}).first()[0]

print('Minimum number of ratings per user: {}'.format(minRating_1))
print('Minimum number of ratings per movie: {}'.format(minRating_2))

                                                                                

Minimum number of ratings per user: 20
Minimum number of ratings per movie: 1




In [12]:
# Count movies that have exactly one rating, and the total number of rated movies.
# The filter and count run in Spark rather than on a pandas copy of the grouped table.
_rating1 = ratings_df.groupBy("movieId").count().filter(col("count") == 1).count()
_total = ratings_df.select('movieId').distinct().count()

print('movies are rated by only one user: {} out of {} '.format(_rating1, _total))

movies are rated by only one user: 3446 out of 9724 


In [13]:
# Number of distinct users.
# `num_users` holds the SQL form of the query as a DataFrame (not shown here);
# the DataFrame-API count on the last line is what the cell displays.
num_users = spark.sql(
    "SELECT count (distinct userID) as num_users FROM ratings"
)
ratings_df.select("userId").distinct().count()



610

In [14]:
# Number of distinct movies.
# `num_movies` holds the SQL form of the query as a DataFrame (not shown here);
# the printed value comes from the DataFrame-API count.
num_movies = spark.sql(
    "SELECT count (distinct movieID) as num_movies FROM movies"
)
print(
    movies_df.select('movieID').distinct().count()
)

9742


In [15]:
# How many distinct movies appear in the ratings table at all.
rated_by_users = (
    ratings_df
    .select('movieID')
    .distinct()
    .count()
)
print('Total Number of movies rated by users:', rated_by_users)

Total Number of movies rated by users: 9724


In [16]:
# Movies with no rating at all: left-join movies to ratings and keep the rows
# where no rating matched (up to 10 shown).
spark.sql(
    "SELECT movies.title, movies.genres ,ratings.rating FROM movies left JOIN ratings ON ratings.movieId = movies.movieID WHERE ratings.rating IS null LIMIT 10"
).show()



+--------------------+--------------------+------+
|               title|              genres|rating|
+--------------------+--------------------+------+
|Color of Paradise...|               Drama|  null|
|      Niagara (1953)|      Drama|Thriller|  null|
|        Proof (1991)|Comedy|Drama|Romance|  null|
|Road Home, The (W...|       Drama|Romance|  null|
|Parallax View, Th...|            Thriller|  null|
|Mutiny on the Bou...|Adventure|Drama|R...|  null|
|Browning Version,...|               Drama|  null|
|Twentieth Century...|              Comedy|  null|
|In the Realms of ...|Animation|Documen...|  null|
|      Scrooge (1970)|Drama|Fantasy|Mus...|  null|
+--------------------+--------------------+------+



In [17]:
# Sample of distinct raw genre strings (still pipe-delimited at this point).
spark.sql(
    "SELECT DISTINCT(genres) FROM movies LIMIT 10"
).show()

+--------------------+
|              genres|
+--------------------+
|Comedy|Horror|Thr...|
|Adventure|Sci-Fi|...|
|Action|Adventure|...|
| Action|Drama|Horror|
|Action|Animation|...|
|Animation|Childre...|
|Action|Adventure|...|
|    Adventure|Sci-Fi|
|Documentary|Music...|
|Adventure|Childre...|
+--------------------+



In [18]:
def extract_genres(column):
    """Split a pipe-delimited genres column into an array<string> column.

    Uses Spark's built-in ``split`` instead of a Python UDF, so the work stays
    in the JVM and a NULL input yields NULL rather than raising inside the UDF.

    Args:
        column: Column or column name holding values such as "Comedy|Romance".

    Returns:
        A Column expression of type array<string>.
    """
    # The second argument is a regular expression, so the pipe must be escaped.
    return split(column, r"\|")


movies_df_clean = movies_df.select("movieId", "title", extract_genres("genres").alias("genres"))

movies_df_clean.createOrReplaceTempView("movies_df_clean")

display (spark.sql("SELECT * FROM movies_df_clean limit 5"))

DataFrame[movieId: int, title: string, genres: array<string>]

In [19]:
# Preview the first five cleaned movie rows (genres now an array).
movies_df_clean.show(n=5)

[Stage 57:>                                                         (0 + 1) / 1]

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|[Adventure, Anima...|
|      2|      Jumanji (1995)|[Adventure, Child...|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|
|      5|Father of the Bri...|            [Comedy]|
+-------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [20]:
# All movie categories.
# First flatMap unwraps each Row into its single `genres` array, the second
# flattens the arrays into individual genre strings. De-duplication is done in
# Spark (distinct) so only the unique genres are collected to the driver.
genres_result = (
    movies_df_clean.select('genres')
    .rdd
    .flatMap(tuple)
    .flatMap(tuple)
    .distinct()
    .collect()
)
genres_result

                                                                                

['Comedy',
 'Adventure',
 'Mystery',
 'Musical',
 'Sci-Fi',
 'War',
 'Crime',
 'Western',
 '(no genres listed)',
 'Children',
 'Documentary',
 'Horror',
 'Film-Noir',
 'Fantasy',
 'Drama',
 'Romance',
 'Action',
 'IMAX',
 'Animation',
 'Thriller']

In [21]:
# Bring the movies table to pandas and keep the titles as a plain Python list.
movie_pdf = movies_df.toPandas()
list_of_movie = movie_pdf['title'].tolist()

## Recommender

In [22]:
# Build the ALS input: drop the timestamp and cast ids to int and rating to float.
movie_ratings = (
    ratings_df
    .drop('timestamp')
    .withColumn("userId", col("userId").cast(IntegerType()))
    .withColumn("movieId", col("movieId").cast(IntegerType()))
    .withColumn("rating", col("rating").cast(FloatType()))
)

In [23]:
# Preview the typed ratings, expose them to Spark SQL, and display the query result object.
movie_ratings.show(n=20)
movie_ratings.createOrReplaceTempView("movie_ratings")
display(
    spark.sql("SELECT * FROM movie_ratings limit 10")
)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



DataFrame[userId: int, movieId: int, rating: float]

In [24]:
# Draw a ~0.2% sample (without replacement) of the ratings for a quick look.
# A fixed seed is passed so re-running the cell uses the same random sequence.
movie_rating_sample = movie_ratings.sample(withReplacement=False, fraction=1/500, seed=42)
movie_rating_sample.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     5|    253|   3.0|
|    18|   7153|   4.5|
|    19|    366|   2.0|
|    32|    529|   4.0|
|    32|   1393|   4.0|
|    41|   6287|   5.0|
|    42|    780|   2.0|
|    42|   2997|   4.0|
|    47|  53894|   3.5|
|    51|   2133|   3.5|
|    57|   2640|   4.0|
|    57|   2797|   3.0|
|    58|    440|   4.0|
|    61|    173|   3.0|
|    61|   1214|   4.5|
|    62|  51540|   4.5|
|    63|    231|   3.0|
|    64|    223|   3.0|
|    64|   1258|   3.5|
|    68|    531|   2.5|
+------+-------+------+
only showing top 20 rows



In [45]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [26]:
# ALS estimator for explicit ratings: non-negative factors, and rows whose
# prediction would be NaN are dropped (coldStartStrategy="drop").
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    implicitPrefs=False,
    nonnegative=True,
    coldStartStrategy="drop",
)

In [27]:
# Split the data 80/20 into train and test sets.
# A fixed seed is passed so the split uses the same random sequence on every run.
(trainData, testData) = movie_ratings.randomSplit([0.8, 0.2], seed=42)

In [28]:
# Hyper-parameter grid: 4 ranks x 4 regularisation strengths x 1 iteration count.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 50, 100, 150])
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.15])
    .addGrid(als.maxIter, [15])
    .build()
)

In [29]:
# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

print ("Num models to be tested: ", len(paramGrid))

Num models to be tested:  16


In [30]:
# 5-fold cross-validation over the parameter grid, scored by the evaluator.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
)

In [46]:
# Train only when no saved model is on disk; otherwise load the saved one.
if not os.path.exists("bestALSModel"):
    print("Existing model not found :(")
    print("Starting training...")
    cvModel = cv.fit(trainData)
    bestALSModel = cvModel.bestModel
else:
    print("Existing model found!")
    print("Loading...")
    bestALSModel = ALSModel.load('bestALSModel')

# Score the selected model on the held-out split and report its RMSE.
# NOTE(review): when a saved model is loaded, testData comes from this
# session's split and may overlap rows that model was trained on — confirm
# before treating this number as a true hold-out error.
testPredictions = bestALSModel.transform(testData)
rmse = evaluator.evaluate(testPredictions)
print("RMS Error =",rmse)

Existing model found!
Loading...
RMS Error = 0.8719822097589555




In [32]:
# Report the hyper-parameters of the selected model.
# `rank` is available on the model itself. maxIter / regParam are read from the
# parent ALS estimator on the JVM side; guard against a missing parent so the
# cell does not fail (presumably the case for a model loaded from disk — confirm).
_parent_estimator = bestALSModel._java_obj.parent()

print ("Best Model Parameters")
print ("Rank: ", bestALSModel.rank)
if _parent_estimator is not None:
    print ("MaxIter: ", str(_parent_estimator.getMaxIter()))
    print ("RegParam:",  _parent_estimator.getRegParam())
else:
    print ("MaxIter / RegParam: unavailable (model has no parent estimator)")

Best Model Parameters
Rank:  50
MaxIter:  15
RegParam: 0.15


In [33]:
# Persist the model to disk, but never overwrite an existing saved copy.
if not os.path.exists("bestALSModel"):
    bestALSModel.save("bestALSModel")

[Stage 12416:>                                                     (0 + 8) / 10]

22/12/04 04:00:17 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/12/04 04:00:17 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/12/04 04:00:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers


                                                                                

22/12/04 04:00:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/12/04 04:00:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/12/04 04:00:18 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers


### Model testing

In [34]:
# Expose the test-set predictions to Spark SQL and preview ten of them.
testPredictions.createOrReplaceTempView("predictions")
spark.sql(
    "SELECT * FROM predictions limit 10"
).show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   463|   1088|   3.5| 3.2851524|
|   580|  44022|   3.5| 3.2771707|
|   597|    471|   2.0| 3.8366563|
|   597|   1580|   3.0|  3.569401|
|   368|   1580|   3.0|  2.927928|
|   368|   2366|   4.0| 3.1160734|
|   368|   3918|   2.0| 2.6714442|
|    28|   3175|   1.5| 2.8194048|
|   587|   3175|   5.0| 3.8273165|
|   332|   2366|   3.5| 3.4877331|
+------+-------+------+----------+



In [47]:
# Score the model on the complete ratings set (training rows included) and
# report the RMSE; note this reassigns `rmse`.
data = bestALSModel.transform(movie_ratings)
rmse = evaluator.evaluate(data)
print ("Error is",rmse)
# createOrReplaceTempView replaces the deprecated registerTempTable.
data.createOrReplaceTempView("data")



Error is 0.6646496588537723


                                                                                

In [36]:
# Top 10 recommended movies for every user known to the model.
Top10Recs = bestALSModel.recommendForAllUsers(10)
Top10Recs.createOrReplaceTempView("Top10Recs")
spark.sql("SELECT * FROM Top10Recs limit 10").show()
# Same DataFrame under a second view name; createOrReplaceTempView replaces
# the deprecated registerTempTable.
Top10Recs.createOrReplaceTempView("Top_10_Recommendations")



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{170355, 5.78263...|
|     3|[{6835, 4.851414}...|
|     5|[{8477, 4.868637}...|
|     6|[{6732, 4.8381457...|
|     9|[{8477, 4.7684717...|
|    12|[{5867, 5.6364326...|
|    13|[{170355, 4.92563...|
|    15|[{7842, 4.493137}...|
|    16|[{96004, 4.505365...|
|    17|[{96004, 5.162398...|
+------+--------------------+



                                                                                

In [37]:
# separate the value of 'recommendations'
# Step 1: explode the per-user recommendations array so each row holds one
# (userId, MovieRec) pair, then preview it.
unwrapRecommendations = spark.sql('SELECT userId, explode(recommendations) AS MovieRec FROM Top_10_Recommendations')
unwrapRecommendations.createOrReplaceTempView("unwrapRecs")
spark.sql("SELECT * FROM unwrapRecs limit 10").show()
# Step 2: explode again via LATERAL VIEW and pull the struct fields out into
# flat columns: userId, movieId, prediction (the struct's `rating` field).
finalRecommendations = spark.sql("SELECT userId,movieIds_and_ratings.movieId AS movieId,\
                        movieIds_and_ratings.rating AS prediction\
                        FROM Top_10_Recommendations\
                        LATERAL VIEW explode(recommendations) exploded_table AS movieIds_and_ratings")
# Register the flat result as the "Recommendations" view and preview it.
finalRecommendations.createOrReplaceTempView("Recommendations")
spark.sql("SELECT * FROM Recommendations limit 10").show()

                                                                                

+------+-------------------+
|userId|           MovieRec|
+------+-------------------+
|     1|{170355, 5.7826343}|
|     1| {96004, 5.7826343}|
|     1|  {3379, 5.7826343}|
|     1|  {33649, 5.654786}|
|     1| {132333, 5.572264}|
|     1|   {5490, 5.572264}|
|     1|  {7842, 5.5654945}|
|     1|{171495, 5.5202937}|
|     1| {78836, 5.4555316}|
|     1| {117531, 5.451997}|
+------+-------------------+





+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|     1| 170355| 5.7826343|
|     1|  96004| 5.7826343|
|     1|   3379| 5.7826343|
|     1|  33649|  5.654786|
|     1| 132333|  5.572264|
|     1|   5490|  5.572264|
|     1|   7842| 5.5654945|
|     1| 171495| 5.5202937|
|     1|  78836| 5.4555316|
|     1| 117531|  5.451997|
+------+-------+----------+



                                                                                

### Predictions for movies the user has not rated yet

In [39]:
# Keep only recommendations the user has not already rated: left-join the
# recommendations to the known ratings and retain rows with no matching rating.
finalRecs = (
    finalRecommendations
    .join(movie_ratings, on=['userId', 'movieId'], how='left')
    .filter(movie_ratings.rating.isNull())
)
finalRecs.createOrReplaceTempView("final_Recommendations")
spark.sql(
    "SELECT * FROM final_Recommendations limit 20"
).show()



+------+-------+----------+------+
|userId|movieId|prediction|rating|
+------+-------+----------+------+
|     9|   4495| 4.6397157|  null|
|   120|   3379|  4.609119|  null|
|   133| 177593| 3.6991415|  null|
|   137| 132333| 4.5028896|  null|
|   218|  96004|  4.610119|  null|
|   309|   7842| 4.4007373|  null|
|   367|   6732| 5.0505524|  null|
|   372| 177593| 4.2397866|  null|
|   530|  25906| 4.6744313|  null|
|   596| 171495| 4.4235716|  null|
|   604| 134796|  4.806554|  null|
|   122| 170355|  5.929523|  null|
|   193|  89904| 4.4845576|  null|
|   321| 117531| 4.5056376|  null|
|   358| 170355| 4.5756516|  null|
|   451|   5915|  4.921042|  null|
|   481| 170355|  3.910033|  null|
|   545|   3379|  4.792139|  null|
|    20|   3379| 4.9206986|  null|
|    93|   7842|  5.554637|  null|
+------+-------+----------+------+



                                                                                

In [40]:
# Register the views used by the per-user recommendation queries.
# createOrReplaceTempView replaces the deprecated registerTempTable and is
# safe to call again for a name that is already registered.
finalRecs.createOrReplaceTempView("final_Recommendations")
movies_df.createOrReplaceTempView("movies_df")

### Recommending Movies to certain users

User IDs to recommend:

userID = 37

userID = 436

In [41]:
# List up to 200 distinct user ids that have at least one unseen recommendation.
spark.sql(
    "SELECT DISTINCT(userID) FROM final_Recommendations limit 200"
).show(200)

                                                                                

+------+
|userID|
+------+
|   471|
|   496|
|   148|
|   463|
|   243|
|   540|
|   392|
|    31|
|   516|
|   137|
|   451|
|   251|
|   580|
|    85|
|    65|
|   458|
|   481|
|   588|
|   255|
|    53|
|   133|
|   296|
|   472|
|   322|
|   513|
|    78|
|   321|
|   362|
|   597|
|   593|
|   375|
|   108|
|   155|
|   530|
|   193|
|   368|
|    34|
|   211|
|   101|
|   115|
|   126|
|   385|
|    81|
|   183|
|   210|
|   436|
|    28|
|   596|
|   497|
|   412|
|   300|
|   406|
|   587|
|    76|
|   577|
|    27|
|   332|
|    26|
|   501|
|   384|
|   606|
|   192|
|    44|
|   271|
|   159|
|   253|
|   460|
|   236|
|   329|
|   103|
|   350|
|   336|
|    12|
|   223|
|   602|
|   417|
|   548|
|   388|
|   578|
|    91|
|   409|
|   333|
|   285|
|   601|
|   222|
|   372|
|   604|
|   330|
|   128|
|   209|
|    22|
|   122|
|   493|
|   230|
|   319|
|    93|
|   157|
|   225|
|   232|
|   233|
|   190|
|   367|
|   539|
|   346|
|   246|
|   476|
|   360|
|   599|
|

In [42]:
# Titles recommended to user 37.
# ORDER BY makes the listing deterministic and puts the highest predicted
# rating first; without it LIMIT returns rows in arbitrary order.
spark.sql("""
    SELECT userId, title
    FROM final_Recommendations t1
    LEFT JOIN movies_df t2
    ON t1.movieId = t2.movieId
    WHERE t1.userId = 37
    ORDER BY t1.prediction DESC
    LIMIT 10
""").show()



+------+--------------------+
|userId|               title|
+------+--------------------+
|    37|  The Big Bus (1976)|
|    37|Mulholland Dr. (1...|
|    37| On the Beach (1959)|
|    37|         Seve (2014)|
|    37|Dragon Ball Z: Th...|
|    37|    Jetée, La (1962)|
|    37|              Cosmos|
|    37|         Dune (2000)|
|    37|  Saving Face (2004)|
|    37|Man Bites Dog (C'...|
+------+--------------------+



In [43]:
# Titles recommended to user 436.
# ORDER BY makes the listing deterministic and puts the highest predicted
# rating first; without it LIMIT returns rows in arbitrary order.
spark.sql("""
    SELECT userId, title
    FROM final_Recommendations t1
    LEFT JOIN movies_df t2
    ON t1.movieId = t2.movieId
    WHERE t1.userId = 436
    ORDER BY t1.prediction DESC
    LIMIT 10
""").show()

[Stage 13473:(98 + 2) / 100][Stage 13474:(0 + 6) / 32][Stage 13475:>(0 + 0) / 1][Stage 13474:====>        (11 + 8) / 32][Stage 13475:>              (0 + 0) / 1]

+------+--------------------+
|userId|               title|
+------+--------------------+
|   436|        Thief (1981)|
|   436|Hello, Dolly! (1969)|
|   436|  The Big Bus (1976)|
|   436|Mulholland Dr. (1...|
|   436| On the Beach (1959)|
|   436|         Seve (2014)|
|   436|Dragon Ball Z: Th...|
|   436|Babes in Toyland ...|
|   436|  Saving Face (2004)|
|   436|Shall We Dance (1...|
+------+--------------------+



[Stage 13475:>                                                      (0 + 1) / 1]                                                                                