In [3]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Connected to cloud                                                                               Hit:2 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnake

In [4]:
#import packages

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType


# Build the SparkSession used by the rest of the notebook (named "SparkSQL")
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [5]:
# Once the file has been uploaded to the Colab environment, load it into a new Spark dataframe.
# The first row supplies the column names and Spark infers each column's type.
df = spark.read.csv(
    path="/content/better_netflix_titles.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first 10 rows to see what we have to work with
df.show(n=10)

+---+-------+-------+-----+-------------+-----------------+------------+------+---------+
|_c0|show_id|   type|title|      country|       date_added|release_year|rating| duration|
+---+-------+-------+-----+-------------+-----------------+------------+------+---------+
|  0|     s1|TV Show|   3%|       Brazil|  August 14, 2020|        2020| TV-MA|4 Seasons|
|  1|     s2|  Movie| 7:19|       Mexico|December 23, 2016|        2016| TV-MA|   93 min|
|  2|     s3|  Movie|23:59|    Singapore|December 20, 2018|        2011|     R|   78 min|
|  3|     s4|  Movie|    9|United States|November 16, 2017|        2009| PG-13|   80 min|
|  4|     s5|  Movie|   21|United States|  January 1, 2020|        2008| PG-13|  123 min|
|  5|     s6|TV Show|   46|       Turkey|     July 1, 2017|        2016| TV-MA| 1 Season|
|  6|     s7|  Movie|  122|        Egypt|     June 1, 2020|        2019| TV-MA|   95 min|
|  7|     s8|  Movie|  187|United States| November 1, 2019|        1997|     R|  119 min|
|  8|     

In [7]:
# Register the dataframe as a temporary view so it can be queried with SQL as `movies`
df.createOrReplaceTempView(name="movies")

In [8]:
# Almost any SQL statement can be run against the view at this point.
# This query turns the date_added text (e.g. "August 14, 2020") into a proper date value.
# NOTE: the result is only displayed, not assigned to a dataframe, so the conversion is not kept.
spark.sql("""SELECT show_id, 
   type, 
   title, 
   country, 
   TO_DATE(date_added, 'MMMM d, yyyy') 
   AS date_added, 
   release_year, 
   rating, 
   duration 
   FROM movies 
   WHERE date_added IS NOT null AND type='Movie'""").show(n=10)

+-------+-----+-----+-------------+----------+------------+------+--------+
|show_id| type|title|      country|date_added|release_year|rating|duration|
+-------+-----+-----+-------------+----------+------------+------+--------+
|     s2|Movie| 7:19|       Mexico|2016-12-23|        2016| TV-MA|  93 min|
|     s3|Movie|23:59|    Singapore|2018-12-20|        2011|     R|  78 min|
|     s4|Movie|    9|United States|2017-11-16|        2009| PG-13|  80 min|
|     s5|Movie|   21|United States|2020-01-01|        2008| PG-13| 123 min|
|     s7|Movie|  122|        Egypt|2020-06-01|        2019| TV-MA|  95 min|
|     s8|Movie|  187|United States|2019-11-01|        1997|     R| 119 min|
|     s9|Movie|  706|        India|2019-04-01|        2019| TV-14| 118 min|
|    s10|Movie| 1920|        India|2017-12-15|        2008| TV-MA| 143 min|
|    s11|Movie| 1922|United States|2017-10-20|        2017| TV-MA| 103 min|
|    s14|Movie|2,215|     Thailand|2019-03-01|        2018| TV-MA|  89 min|
+-------+---

In [9]:
# Everything you learned about SQL in Unit 6 carries over to Spark SQL.
# This query counts the rows for each rating, largest count first.
# NOTE: using "order by" on large datasets in Spark is almost NEVER a good idea (more on this in 8.2)
spark.sql("""
  SELECT
    rating,
    count(*) AS number_of_ratings
  FROM movies
  GROUP BY rating
  ORDER BY 2 DESC
  """).show(n=20)

+--------+-----------------+
|  rating|number_of_ratings|
+--------+-----------------+
|   TV-MA|             2863|
|   TV-14|             1931|
|   TV-PG|              805|
|       R|              665|
|   PG-13|              386|
|    TV-Y|              280|
|   TV-Y7|              271|
|      PG|              247|
|    TV-G|              194|
|      NR|               84|
|       G|               39|
|    null|                9|
|TV-Y7-FV|                6|
|      UR|                5|
|   NC-17|                3|
+--------+-----------------+



In [10]:
# Let's output a file with just the listings for children
# First we use Spark SQL to build a dataframe of the titles rated G, PG or PG-13.
# The view is referenced as `movies`, exactly as it was registered, so the query does not
# rely on Spark matching identifiers case-insensitively.

out_df = spark.sql("""
  SELECT 
  title,
  rating,
  date_added,
  duration
  FROM movies
  WHERE rating IN ('G','PG', 'PG-13')""")

In [11]:
# Check that the query returned what we wanted (first 20 rows, long values truncated)
out_df.show(n=20, truncate=True)


+--------------------+------+------------------+--------+
|               title|rating|        date_added|duration|
+--------------------+------+------------------+--------+
|                   9| PG-13| November 16, 2017|  80 min|
|                  21| PG-13|   January 1, 2020| 123 min|
|            Æon Flux| PG-13|  February 1, 2018|  93 min|
|         10,000 B.C.| PG-13|      June 1, 2019| 109 min|
|           16 Blocks| PG-13|  November 1, 2019| 102 min|
|            17 Again| PG-13|   January 1, 2021| 102 min|
|20 Feet From Stardom| PG-13|September 22, 2018|  91 min|
|             28 Days| PG-13|September 30, 2020| 104 min|
|      3 Days to Kill| PG-13|  December 1, 2020| 117 min|
|       3 Generations| PG-13|   August 28, 2017|  92 min|
|            3 Idiots| PG-13|    August 1, 2019| 164 min|
|        5 Flights Up| PG-13|    March 17, 2019|  92 min|
|      50 First Dates| PG-13|  December 1, 2020|  99 min|
|        A 2nd Chance|    PG|      July 1, 2017|  95 min|
|     A Boy Ca

In [12]:
#  As Spark stores the data in partitions, it will also write data in partitions.
#  The output is a folder with the same name as the file you asked for, and that folder may contain many subfolders or files.
#  Within that folder there will be one or more files whose names start with `part-`; these are the CSV files.
#  They are often not optimal for friendly reading, but can be downloaded to your computer.
#  mode='overwrite' replaces output left behind by an earlier run; with the default mode the
#  write raises an error when the folder already exists, so the cell could only be run once.

out_df.write.csv('movies_out_spark.csv', mode='overwrite')

In [13]:
# The easiest workaround for the part-file output is to take the data to Pandas and write out a CSV.
# This pulls all of the data onto the driver and is not recommended unless you have filtered and/or aggregated your data to a reasonable size.
# index=False keeps the Pandas row index out of the file; left in, it is written as an extra
# unnamed first column, which Spark would read back as a `_c0` column.

out_df.toPandas().to_csv('movies_out_pandas.csv', index=False)