# Movie Recommendation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
%matplotlib inline

import mlflow

In [2]:
import os

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import col, mean, udf, lit, current_timestamp, unix_timestamp, array_contains

## Converting Data to RDD

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("moive recommender") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

22/11/19 18:19:00 WARN Utils: Your hostname, sahil-dell resolves to a loopback address: 127.0.1.1; using 192.168.0.13 instead (on interface wlp2s0)
22/11/19 18:19:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/19 18:19:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Load the dataframes

In [5]:
movies_df = spark.read.load("/home/sahil/Desktop/Fall 22/CS 532/project/ml-latest/movies.csv", format='csv', header = True)
ratings_df = spark.read.load("/home/sahil/Desktop/Fall 22/CS 532/project/ml-latest/ratings.csv", format='csv', header = True)
links_df = spark.read.load("/home/sahil/Desktop/Fall 22/CS 532/project/ml-latest/links.csv", format='csv', header = True)
tags_df = spark.read.load("/home/sahil/Desktop/Fall 22/CS 532/project/ml-latest/tags.csv", format='csv', header = True)

In [25]:
type(movies_df)

pyspark.sql.dataframe.DataFrame

### Show the dataframes and make the lifetime of dataframes sames as spark session

In [6]:
movies_df.show(5)

movies_df.createOrReplaceTempView("movies_df")

display (spark.sql("SELECT * FROM movies_df limit 5"))

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



DataFrame[movieId: string, title: string, genres: string]

In [7]:
ratings_df.show(5)
ratings_df.createOrReplaceTempView("ratings_df")
display (spark.sql("SELECT * FROM ratings_df limit 5"))

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows



DataFrame[userId: string, movieId: string, rating: string, timestamp: string]

In [8]:
links_df.show(5)

links_df.createOrReplaceTempView("links_df")

display (spark.sql("SELECT * FROM links_df limit 5"))

+-------+-------+------+
|movieId| imdbId|tmdbId|
+-------+-------+------+
|      1|0114709|   862|
|      2|0113497|  8844|
|      3|0113228| 15602|
|      4|0114885| 31357|
|      5|0113041| 11862|
+-------+-------+------+
only showing top 5 rows



DataFrame[movieId: string, imdbId: string, tmdbId: string]

In [9]:
tags_df.show(5)

tags_df.createOrReplaceTempView("tags_df")

display (spark.sql("SELECT * FROM tags_df limit 5"))

+------+-------+------------+----------+
|userId|movieId|         tag| timestamp|
+------+-------+------------+----------+
|    14|    110|        epic|1443148538|
|    14|    110|    Medieval|1443148532|
|    14|    260|      sci-fi|1442169410|
|    14|    260|space action|1442169421|
|    14|    318|imdb top 250|1442615195|
+------+-------+------------+----------+
only showing top 5 rows



DataFrame[userId: string, movieId: string, tag: string, timestamp: string]

### Registering the dataframes to spark 

In [10]:
movies_df.registerTempTable("movies")
ratings_df.registerTempTable("ratings")
links_df.registerTempTable("links")
tags_df.registerTempTable("tags")



### Analyse the data

In [11]:
minRating_1 = ratings_df.groupBy("userID").count().toPandas()['count'].min()
minRating_2 = ratings_df.groupBy("movieId").count().toPandas()['count'].min()

print('Minimum number of ratings per user: {}'.format(minRating_1))
print('Minimum number of ratings per movie: {}'.format(minRating_2))

                                                                                

Minimum number of ratings per user: 1
Minimum number of ratings per movie: 1


In [12]:
_rating1 = sum(ratings_df.groupBy("movieId").count().toPandas()['count'] == 1)
_total = ratings_df.select('movieId').distinct().count()

print('movies are rated by only one user: {} out of {} '.format(_rating1, _total))



movies are rated by only one user: 10155 out of 53889 


                                                                                

In [13]:
# number of distinct users
num_users = spark.sql("SELECT count (distinct userID) as num_users FROM ratings")
display(num_users)
ratings_df.select("userId").distinct().count()

DataFrame[num_users: bigint]

                                                                                

283228

In [14]:
# number of movies
num_movies = spark.sql("SELECT count (distinct movieID) as num_movies FROM movies")
display(num_movies)
movies_df.select('movieID').distinct().count()

DataFrame[num_movies: bigint]

58098

In [15]:
rated_by_users = ratings_df.select('movieID').distinct().count()
print('How many movies are rated by users?', rated_by_users)



How many movies are rated by users? 53889


                                                                                

In [16]:
# null rated movies
spark.sql("SELECT movies.title, movies.genres ,ratings.rating FROM movies left JOIN ratings ON ratings.movieId = movies.movieID WHERE ratings.rating IS null LIMIT 10").show()

[Stage 45:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------+
|               title|              genres|rating|
+--------------------+--------------------+------+
|   Fambul Tok (2011)|         Documentary|  null|
|Shadow Boxers (1999)|         Documentary|  null|
| 9500 Liberty (2009)|         Documentary|  null|
|  Ascent, The (1994)|       Adventure|War|  null|
|Laffghanistan: Co...|Comedy|Documentar...|  null|
|      Annie O (1996)|       Drama|Romance|  null|
|Cry in the Night,...|      Drama|Thriller|  null|
|Across the Sierra...|Action|Romance|We...|  null|
|Bachelor Bait (1934)|      Comedy|Romance|  null|
|Back in the Saddl...|Action|Drama|Western|  null|
+--------------------+--------------------+------+



                                                                                

In [17]:
# all movie genres
spark.sql("SELECT DISTINCT(genres) FROM movies LIMIT 10").show()

[Stage 46:>                                                         (0 + 1) / 1]

+--------------------+
|              genres|
+--------------------+
|Comedy|Horror|Thr...|
|Adventure|Sci-Fi|...|
|Action|Adventure|...|
| Action|Drama|Horror|
|Comedy|Drama|Horr...|
|Action|Animation|...|
|Fantasy|Musical|M...|
|Adventure|Mystery...|
|Children|Comedy|D...|
|Action|Adventure|...|
+--------------------+



                                                                                

In [18]:
extract_genres = udf(lambda x: x.split("|"), ArrayType(StringType()))
movies_df_clean = movies_df.select("movieId", "title", extract_genres("genres").alias("genres"))

movies_df_clean.createOrReplaceTempView("movies_df_clean")

display (spark.sql("SELECT * FROM movies_df_clean limit 5"))

DataFrame[movieId: string, title: string, genres: array<string>]

In [19]:
movies_df_clean.show(5)

[Stage 49:>                                                         (0 + 1) / 1]

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|[Adventure, Anima...|
|      2|      Jumanji (1995)|[Adventure, Child...|
|      3|Grumpier Old Men ...|   [Comedy, Romance]|
|      4|Waiting to Exhale...|[Comedy, Drama, R...|
|      5|Father of the Bri...|            [Comedy]|
+-------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [20]:
# All movie categories
genres_result = list(set(movies_df_clean.select('genres').rdd.flatMap(tuple).flatMap(tuple).collect()))
genres_result

                                                                                

['Western',
 'IMAX',
 'Crime',
 'Film-Noir',
 'Comedy',
 'Action',
 'Fantasy',
 'Drama',
 'Sci-Fi',
 'Mystery',
 'Musical',
 'Children',
 '(no genres listed)',
 'War',
 'Documentary',
 'Horror',
 'Adventure',
 'Animation',
 'Romance',
 'Thriller']

In [21]:
movie_pdf = movies_df.toPandas()
list_of_movie = list(movie_pdf['title'])

In [22]:
movie_ratings=ratings_df.drop('timestamp')

In [23]:
# Data type convert
from pyspark.sql.types import IntegerType, FloatType
movie_ratings = movie_ratings.withColumn("userId", movie_ratings["userId"].cast(IntegerType()))
movie_ratings = movie_ratings.withColumn("movieId", movie_ratings["movieId"].cast(IntegerType()))
movie_ratings = movie_ratings.withColumn("rating", movie_ratings["rating"].cast(FloatType()))

In [24]:
movie_ratings.show(10)
movie_ratings.createOrReplaceTempView("movie_ratings")
display (spark.sql("SELECT * FROM movie_ratings limit 10"))

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    307|   3.5|
|     1|    481|   3.5|
|     1|   1091|   1.5|
|     1|   1257|   4.5|
|     1|   1449|   4.5|
|     1|   1590|   2.5|
|     1|   1591|   1.5|
|     1|   2134|   4.5|
|     1|   2478|   4.0|
|     1|   2840|   3.0|
+------+-------+------+
only showing top 10 rows



DataFrame[userId: int, movieId: int, rating: float]