## Movie recomender system using PySpark
under construction...

In [1]:
import numpy as np
import pandas as pd

# spark libs.
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import lower, col
# spark datatypes
from pyspark.sql.types import *

# check Spark UI from 
# http://localhost:4040

### Creating spark session and setting up number of partitions.

In [2]:
# creating spark local machine session.
sc = SparkContext('local')
spark = SparkSession(sc)

# setup 5 partitions since data is not "big-data", and is running locally.
spark.conf.set("spark.sql.shuffle.partitions", "5")

### Reading movie data from CSV files into Spark DataFrames.
We could let Spark <b>infer</b> the schema but in this case we do the <b>explicit</b> datyping as good practice.

In [3]:
# define schemas (DataFrame's Metadata) to load CSV into Spark DataFrame

# USER SCHEMA
# user id | age | gender | occupation | zip code
user_schema =  StructType([
                    StructField("user_id", ShortType(), False), # False = not null
                    StructField("age", ByteType(), True),
                    StructField("gender", StringType(), True),
                    StructField("occupation", StringType(), True),
                    StructField("zip", StringType(), True),
                ])

# GENRE SCHEMA
# genre | genre_id
genre_schema = StructType([
                    StructField("genre", StringType(), True),
                    StructField("genre_id", ShortType(), False), # False = not null
                ])

# MOVIE SCHEMA 
# movie id | movie title | release date | video release date | IMDb URL | unknown | Action | 
# Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy |
# Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |
movie_schema = StructType([
                    StructField("movie_id", ShortType(), False), # False = not null
                    StructField("title", StringType(), True),
                    StructField("release_date", StringType(), True), # format: 01-Jan-1995
                    StructField("video_date", StringType(), True), # format: 01-Jan-1995
                    StructField("url", StringType(), True),
                    StructField("unknown", ByteType(), True),
                    StructField("action", ByteType(), True),
                    StructField("adventure", ByteType(), True),
                    StructField("animation", ByteType(), True),
                    StructField("children", ByteType(), True),
                    StructField("comedy", ByteType(), True),
                    StructField("crime", ByteType(), True),
                    StructField("documentary", ByteType(), True),
                    StructField("drama", ByteType(), True),
                    StructField("fantasy", ByteType(), True),
                    StructField("film_noir", ByteType(), True),
                    StructField("horror", ByteType(), True),
                    StructField("musical", ByteType(), True),
                    StructField("mystery", ByteType(), True),
                    StructField("romance", ByteType(), True),
                    StructField("sci_fi", ByteType(), True),
                    StructField("thriller", ByteType(), True),
                    StructField("war", ByteType(), True),
                    StructField("western", ByteType(), True),
            ])

# USER-MOVIE RATINGS SCHEMA
# user id | item id | rating | timestamp
rating_schema = StructType([
                    StructField("user_id", ShortType(), False), # False = not null
                    StructField("movie_id", ShortType(), False),
                    StructField("rating", ShortType(), False),
                    StructField("timestamp", StringType(), True),
                    #StructField("timestamp", TimestampType(), True), 
                ])


In [4]:
def read_csv(path, name, schema, delimiter=','):
    """
    Takes in the path, name and schema of the CSV
    Returns a Spark DataFrame.
    """
    
    fullpath = path + name

    # read CSV using the provided schema.
    dataframe = spark.read.format("csv")\
        .schema(schema)\
        .option("header", "false")\
        .option("delimiter", delimiter)\
        .option("mode", "FAILFAST")\
        .load(fullpath)

    return dataframe

# load CSVs into Spark.
user_df = read_csv( path='ml-100k/', name='u.user', schema=user_schema, delimiter='|' )
genre_df = read_csv( path='ml-100k/', name='u.genre', schema=genre_schema, delimiter='|' )
movie_df = read_csv( path='ml-100k/', name='u.item', schema=movie_schema, delimiter='|' )
rating_df = read_csv( path='ml-100k/', name='u.data', schema=rating_schema, delimiter='\t' )

# create Spark tables to enable use of SQL.
user_df.createOrReplaceTempView('user_df')
genre_df.createOrReplaceTempView('genre_df')
movie_df.createOrReplaceTempView('movie_df')
rating_df.createOrReplaceTempView('rating_df')

In [5]:
# checking movie DataFrame datatypes.
movie_df.printSchema()
print(f'Total Movies:{movie_df.count()}')

root
 |-- movie_id: short (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- video_date: string (nullable = true)
 |-- url: string (nullable = true)
 |-- unknown: byte (nullable = true)
 |-- action: byte (nullable = true)
 |-- adventure: byte (nullable = true)
 |-- animation: byte (nullable = true)
 |-- children: byte (nullable = true)
 |-- comedy: byte (nullable = true)
 |-- crime: byte (nullable = true)
 |-- documentary: byte (nullable = true)
 |-- drama: byte (nullable = true)
 |-- fantasy: byte (nullable = true)
 |-- film_noir: byte (nullable = true)
 |-- horror: byte (nullable = true)
 |-- musical: byte (nullable = true)
 |-- mystery: byte (nullable = true)
 |-- romance: byte (nullable = true)
 |-- sci_fi: byte (nullable = true)
 |-- thriller: byte (nullable = true)
 |-- war: byte (nullable = true)
 |-- western: byte (nullable = true)

Total Movies:1682


In [6]:
# print top 5 rows nicely with show()
user_df.show(5)
print(f'Total Users:{user_df.count()}')

+-------+---+------+----------+-----+
|user_id|age|gender|occupation|  zip|
+-------+---+------+----------+-----+
|      1| 24|     M|technician|85711|
|      2| 53|     F|     other|94043|
|      3| 23|     M|    writer|32067|
|      4| 24|     M|technician|43537|
|      5| 33|     F|     other|15213|
+-------+---+------+----------+-----+
only showing top 5 rows

Total Users:943


In [7]:
# get top 5 genre rows with take()
print(genre_df.take(5))
print(f'\nTotal genres:{genre_df.count()}')

[Row(genre='unknown', genre_id=0), Row(genre='Action', genre_id=1), Row(genre='Adventure', genre_id=2), Row(genre='Animation', genre_id=3), Row(genre="Children's", genre_id=4)]

Total genres:19


In [8]:
# get top movies rows.
print(movie_df.take(2))
print(f'\nTotal movies:{movie_df.count()}')

[Row(movie_id=1, title='Toy Story (1995)', release_date='01-Jan-1995', video_date=None, url='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', unknown=0, action=0, adventure=0, animation=1, children=1, comedy=1, crime=0, documentary=0, drama=0, fantasy=0, film_noir=0, horror=0, musical=0, mystery=0, romance=0, sci_fi=0, thriller=0, war=0, western=0), Row(movie_id=2, title='GoldenEye (1995)', release_date='01-Jan-1995', video_date=None, url='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', unknown=0, action=1, adventure=1, animation=0, children=0, comedy=0, crime=0, documentary=0, drama=0, fantasy=0, film_noir=0, horror=0, musical=0, mystery=0, romance=0, sci_fi=0, thriller=1, war=0, western=0)]

Total movies:1682


In [9]:
# print top 5 ratings.
rating_df.show(5)
print(f'\nTotal ratings:{rating_df.count()}')

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
+-------+--------+------+---------+
only showing top 5 rows


Total ratings:100000


In [10]:
# testing some SQL on a Spark table.
spark.sql(
    """
    SELECT * 
    FROM user_df 
    WHERE occupation = 'lawyer' AND
        age BETWEEN 25 AND 50 AND
        gender = 'M'
    ORDER BY age DESC
    """
).show(10)

+-------+---+------+----------+-----+
|user_id|age|gender|occupation|  zip|
+-------+---+------+----------+-----+
|    161| 50|     M|    lawyer|55104|
|    205| 47|     M|    lawyer|06371|
|    419| 37|     M|    lawyer|43215|
|    339| 35|     M|    lawyer|37901|
|    680| 33|     M|    lawyer|90405|
|    125| 30|     M|    lawyer|22202|
|    365| 29|     M|    lawyer|20009|
|    846| 27|     M|    lawyer|47130|
+-------+---+------+----------+-----+



#### Making some adjustmens to genre names, lowecase and replacing dash for underscore

In [14]:
genre_df = genre_df.withColumn('genre', lower(col('genre')))
genre_df.show()

+-----------+--------+
|      genre|genre_id|
+-----------+--------+
|    unknown|       0|
|     action|       1|
|  adventure|       2|
|  animation|       3|
| children's|       4|
|     comedy|       5|
|      crime|       6|
|documentary|       7|
|      drama|       8|
|    fantasy|       9|
|  film-noir|      10|
|     horror|      11|
|    musical|      12|
|    mystery|      13|
|    romance|      14|
|     sci-fi|      15|
|   thriller|      16|
|        war|      17|
|    western|      18|
+-----------+--------+



In [13]:
# Since Movie table is not normalized to second normal form, we need can fix that to make it easier to query.
# First we create a movie-genre table, which is a many to many relationship.

for genre in genre_df.take(1000):
    print(genre.genre_id, genre.genre)
    
    spark.sql(
        f"""
        SELECT movie_id
        FROM movie_df
        WHERE {genre.genre} = 1
        """
    ).show(10)

0 unknown
+--------+
|movie_id|
+--------+
|     267|
|    1373|
+--------+

1 action
+--------+
|movie_id|
+--------+
|       2|
|       4|
|      17|
|      21|
|      22|
|      24|
|      27|
|      28|
|      29|
|      33|
+--------+
only showing top 10 rows

2 adventure
+--------+
|movie_id|
+--------+
|       2|
|      21|
|      24|
|      29|
|      35|
|      50|
|      62|
|      78|
|      82|
|      97|
+--------+
only showing top 10 rows

3 animation
+--------+
|movie_id|
+--------+
|       1|
|      71|
|      95|
|      99|
|     101|
|     102|
|     103|
|     114|
|     169|
|     189|
+--------+
only showing top 10 rows

4 children's


ParseException: "\nmismatched input ''' expecting <EOF>(line 4, pos 22)\n\n== SQL ==\n\n        SELECT movie_id\n        FROM movie_df\n        WHERE children's = 1\n----------------------^^^\n        \n"