In [1]:
APP_NAME = 'MovieLens'

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession\
    .builder\
    .appName(APP_NAME)\
    .getOrCreate()

## About the Data Set we are using

This dataset (__ml-latest-small__) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.  

It is called 'small', but that only is because it is a subset of a much larger (27 million) dataset. Small in this context means:  
_Small:_ 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users.

The version I am using to generate these examples was last updated 9/2018.  




In [2]:
def display_df(df, rows=5, transpose=False):
    """Helper function that 'pretty prints' the first 'x' rows of a Spark Dataframe.
    
    This function is helpful when exploring data. PySpark does not generally display
    the content of a DataFrame in a very readable fashion. Using pandas and IPython
    this can be worked around.
    
    This function works by first taking 'x' number of rows of the data from the PySpark
    DataFrame and casting it to a pandas DataFrame. It is then (optionally) transposed, 
    and fed into the display function of IPython. Jupyter then outputs the result as a 
    pretty formatted, human-readable table.
    
    Depends on IPython display package. Necessary imports:
    - from IPython.display import display
    
    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Spark DataFrame that you want to display.
    
    rows
        Number of rows you wish to display should either be an int value between 1 and 100
        Can also be set to 'all', upon which all the rows of the dataframe will be used 
        while still maintaining a maximum of 1000 rows.
        Default value is 20.
    
    transpose : bool
        Allows toggling if to transpose the data or not. Default is False.
        
    Returns
    -------
    None
    """
    from IPython.display import display
    
    # Parsing rows value to set value of 'x'
    x = 1
    if isinstance(rows, str):
        # 'all' is for dynamically determining the value of 'x'
        if rows.lower() == 'all':
            x = df.count()
    else:
        x = rows
    
    # Minimum value of 'x' is 1, Maximum is 100
    if x < 1:  # We need at least one row to display, minimum is 1
        x = 1
    
    elif x > 100:  # Max 100 rows, to avoid overloading the display
        x = 100
    
    print('- displaying %s rows' % x)
    
    # Cast 'x' number of rows from Spark DataFrame to Pandas
    p_df = df.limit(x).toPandas()
    
    # Optionally transposes the data (based on 'transpose' variable)
    if transpose:
        p_df = p_df.transpose()
    
    # Have Jupyter display the dataframe
    display(p_df)

Links Data File Structure (links.csv)
---------------------------------------

Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId,imdbId,tmdbId

movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.

imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

Use of the resources listed above is subject to the terms of each provider.

In [10]:
# Locations of the csv data file
links_csv = '../../data/ml-latest-small/links.csv'

# pyspark.sql.DataFrameReader.read.csv is used for reading csv files
# more information can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader
links_df = spark.read.csv(path=links_csv, sep=',', encoding='UTF-8', quote='"', header=True, inferSchema=True)

print('links.csv')
display_df(links_df)
display_df(links_df.summary(), 'all')

links.csv
- displaying 5 rows


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


- displaying 8 rows


Unnamed: 0,summary,movieId,imdbId,tmdbId
0,count,9742.0,9742.0,9734.0
1,mean,42200.353623485935,677183.8981728598,55162.123792890896
2,stddev,52160.49485443825,1107227.5767597635,93653.48148734072
3,min,1.0,417.0,2.0
4,25%,3248.0,95179.0,9665.0
5,50%,7299.0,167260.0,16523.0
6,75%,193609.0,8391976.0,525662.0
7,max,193609.0,8391976.0,525662.0


Movies Data File Structure (movies.csv)
---------------------------------------

Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId,title,genres

Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.

Genres are a pipe-separated list, and are selected from the following:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)

In [11]:
# Locations of the csv data file
movies_csv = '../data/MovieLens-latest-small/movies.csv'

# pyspark.sql.DataFrameReader.read.csv is used for reading csv files
# more information can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader
movies_df = spark.read.csv(path=movies_csv, sep=',', encoding='UTF-8', quote='"', header=True)

print('movies.csv')
movies_df.printSchema()
display_df(movies_df)
display_df(movies_df.summary())


AnalysisException: 'Path does not exist: file:/home/jovyan/mastering-pyspark-ml/data/MovieLens-latest-small/movies.csv;'

Ratings Data File Structure (ratings.csv)
-----------------------------------------

All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

    userId,movieId,rating,timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.

In [6]:
# Locations of the csv data file
ratings_csv = '../data/MovieLens-latest-small/ratings.csv'

# pyspark.sql.DataFrameReader.read.csv is used for reading csv files
# more information can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader

# Default parsing of CSVs results in all columns being cast to StringType
ratings_df = spark.read.csv(
    path=ratings_csv, 
    sep=',', 
    encoding='UTF-8', 
    quote='"', 
    header=True,
)
print('ratings.csv without inferSchema (everything is StringType)')
ratings_df.printSchema()

# Let's try this one with inferSchema, and see what happens
ratings_df = spark.read.csv(
    path=ratings_csv, 
    sep=',', 
    encoding='UTF-8', 
    quote='"', 
    header=True, 
    inferSchema=True,
)
print('ratings.csv using inferSchema')
ratings_df.printSchema()
# Notice how now the numeric value columns are cast to integers and double

# This however goes a bit too far. It's probably a good idea to keep the id columns as StringType
# as to avoid any problems with regard to integer lengths or overflow errors or even potential
# whitespaces or non-numeric characters that might occur.

# So, now let's try using a so-called Schema DDL
ratings_df = spark.read.csv(
    path=ratings_csv, 
    sep=',', 
    encoding='UTF-8', 
    quote='"', 
    header=True,
    schema='userId STRING, movieId STRING, rating DOUBLE, timestamp INT', 
)
print('ratings.csv with DDL Schema Sting')
ratings_df.printSchema()

# Let's take a look at the first 20 rows of the data now, we will use the ptake() helper function for this
display_df(ratings_df, 20, False)

# Notice that the timestamp does not seem to be parsed well yet. When we look at the format of the timestamp, 
# it looks something like this: '964982703'. That is not a very human readable timestamp at all.
# This is a Unix timestamp, or 'number of seconds sinds 01-01-1970 00:00 UTC'
# Let's convert this column and replace it!

# For this we will be using the `from_unixtime` function
# More information on `from_unixtime` can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.functions.from_unixtime
from pyspark.sql.functions import from_unixtime

# Additionally, we need the `col` function. This allows us to reference columns by name
# More information on `col` can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.functions.col
from pyspark.sql.functions import col

# Since we are going to want to format our Timestamp in a certain format, we have to tell the system what format we want to use
# A good reference for possible formats can be found here: https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html
DATEFORMAT = "yyyy-MM-dd'T'HH:mm:ssZ"  # <-- Default ISO8601 format

# First, we make a copy of the original data column to one named `timestamp_unix`
ratings_df = ratings_df.withColumn('timestamp_unix', col('timestamp'))

# Now, we use the `from_unixtime` to transform the unix timestamp to a String with the given format
ratings_df = ratings_df.withColumn('timestamp', from_unixtime(col('timestamp'), DATEFORMAT))

# Let's see our output
ratings_df.printSchema()
display_df(ratings_df)

# Our timestamp is now converted to a string with a nice format 
# `|-- timestamp: string (nullable = true)`

# The schema shows it to still be a StringType however, lets convert it to a TimestampType
# To do this we have to use the `to_timestamp` function
# More information on the `to_timestamp` function can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.functions.to_timestamp
from pyspark.sql.functions import to_timestamp

ratings_df = ratings_df.withColumn('timestamp', to_timestamp(col('timestamp'), DATEFORMAT))

# Let's see our output
ratings_df.printSchema()
display_df(ratings_df)

# Now, our data looks the way it should. The unixtime is correctly converted to a timestamp value that we can reference
# In the schema you can now see `|-- timestamp: timestamp (nullable = true)`

# For good measure, let's drop the original column since we don't need it anymore
# More information on `drop` can be found here: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.drop
ratings_df = ratings_df.drop(col('timestamp_unix'))
display_df(ratings_df)

# We have now learned:
# - how to load csv data using the read.csv method, and how to control the settings of the method
# - how Spark DataTypes are represented and how to convert between them
# More on Spark DataTypes: https://spark.apache.org/docs/latest/sql-reference.html#data-types

AnalysisException: 'Path does not exist: file:/home/jovyan/mastering-pyspark-ml/data/MovieLens-latest-small/ratings.csv;'

In [7]:
# Locations of the csv data file
tags_csv = '../data/MovieLens-latest-small/tags.csv'

# pyspark.sql.DataFrameReader.read.csv is used for reading csv files
# more information can be found here: https://spark.apache.org/docs/2.4.3/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader
tags_df = spark.read.csv(path=tags_csv, sep=',', encoding='UTF-8', quote='"', header=True, inferSchema=True)

print('tags.csv')
ptake(tags_df)
ptake(tags_df.summary())

AnalysisException: 'Path does not exist: file:/home/jovyan/mastering-pyspark-ml/data/MovieLens-latest-small/tags.csv;'

In [None]:
csv(
    path=links_csv, 
    schema='userId STRING, movieId STRING, rating DOUBLE, timestamp INT', 
    sep=',', 
    encoding='UTF-8', 
    quote='"', 
    escape=None, 
    comment=None, 
    header=True, 
    inferSchema=True, 
    ignoreLeadingWhiteSpace=None, 
    ignoreTrailingWhiteSpace=None, 
    nullValue=None, 
    nanValue=None, 
    positiveInf=None, 
    negativeInf=None, 
    dateFormat=None, 
    timestampFormat=None, 
    maxColumns=None, 
    maxCharsPerColumn=None, 
    maxMalformedLogPerPartition=None, 
    mode=None, 
    columnNameOfCorruptRecord=None, 
    multiLine=None, 
    charToEscapeQuoteEscaping=None, 
    samplingRatio=None, 
    enforceSchema=None, 
    emptyValue=None
)