In this activity you will import data from GitHub and create Spark DataFrames. Using the two DataFrames, you will create views that you can join and query to answer the question:
Which airport, city, and state have the most departures?

In [None]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:14 http://security.ubuntu.com/ubuntu 

In [None]:
# Import packages
# We are using pandas to read the raw csv files from github, then converting them to spark Dataframes (this will save us some download time and HDD space on our laptops)
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType
import pandas as pd

# Build the SparkSession used by every later cell (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [None]:
# Raw GitHub location of the airport codes dataset (tab-separated text).
url = (
    'https://raw.githubusercontent.com/databricks/LearningSparkV2/master/'
    'databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt'
)

In [None]:
# Define the schema of the spark dataframe we intend to build from the csv.
# All four columns are nullable strings; these names become the Spark column names.
codesSchema = StructType(
    [
        StructField("City", StringType(), True),
        StructField("State", StringType(), True),
        StructField("Country", StringType(), True),
        StructField("Iata", StringType(), True),
    ]
)

# pandas downloads and parses the tab-separated file; malformed rows are dropped
# rather than aborting the load. `on_bad_lines='skip'` replaces the deprecated
# `error_bad_lines=False`, which warns on pandas >= 1.3 and was removed in pandas 2.0.
airportCodes = spark.createDataFrame(
    pd.read_csv(url, sep='\t', on_bad_lines='skip'),
    schema=codesSchema,
)



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first five airport code rows (take(5) returns the same list of Rows as head(5)).
airportCodes.take(5)

[Row(City='Abbotsford', State='BC', Country='Canada', Iata='YXX'),
 Row(City='Aberdeen', State='SD', Country='USA', Iata='ABR'),
 Row(City='Abilene', State='TX', Country='USA', Iata='ABI'),
 Row(City='Akron', State='OH', Country='USA', Iata='CAK'),
 Row(City='Alamosa', State='CO', Country='USA', Iata='ALS')]

In [None]:
# Raw GitHub location of the departure delays dataset (CSV).
urlDeparts = (
    'https://raw.githubusercontent.com/databricks/LearningSparkV2/master/'
    'databricks-datasets/learning-spark-v2/flights/departuredelays.csv'
)

In [None]:
# An explicit schema is optional here, but supplying one makes the load much faster.
# Note: the "date" field is actually MMDDHHmm; if pandas reads it without an explicit
# dtype, the leading 0 is lost.

depart_schema = (
    StructType()
    .add("date", StringType(), True)
    .add("delay", IntegerType(), True)
    .add("distance", IntegerType(), True)
    .add("origin", StringType(), True)
    .add("destination", StringType(), True)
)

# dtype=object keeps "date" as text so the leading '0' is retained;
# the resulting Spark dataframe is cached.
airport_departs = spark.createDataFrame(
    pd.read_csv(urlDeparts, dtype={'date': object}),
    schema=depart_schema,
).cache()

In [None]:
# Preview the first five departure rows (take(5) returns the same list of Rows as head(5)).
airport_departs.take(5)

[Row(date='01011245', delay=6, distance=602, origin='ABE', destination='ATL'),
 Row(date='01020600', delay=-8, distance=369, origin='ABE', destination='DTW'),
 Row(date='01021245', delay=-2, distance=602, origin='ABE', destination='ATL'),
 Row(date='01020605', delay=-4, distance=602, origin='ABE', destination='ATL'),
 Row(date='01031245', delay=-4, distance=602, origin='ABE', destination='ATL')]

In [None]:
# Register the airport codes dataframe as the temporary SQL view 'codes'
#*****Your Code Begins Here*********
airportCodes.createOrReplaceTempView(name="codes")

In [None]:
# Register the departures dataframe as the temporary SQL view 'departures'
#*****Your Code Begins Here*********
airport_departs.createOrReplaceTempView(name="departures")

In [None]:
# Write a sql statement that will return the columns origin, city, state and the number of departures in order from most to least
#*****Your Code Begins Here*********
sql_depByAir="""
SELECT d.origin AS origin_Airport, c.city, c.State, count(*) AS origin_departures
FROM departures d
  JOIN
    codes c
    ON d.origin= c.Iata
GROUP BY origin_Airport, c.city, c.State
ORDER BY origin_departures DESC"""

In [None]:
# Run the query against the registered views and keep the result as a spark dataframe
#*****Your Code Begins Here*********
df_departuresByAirport = spark.sql(sqlQuery=sql_depByAir)

In [None]:
# Print the first 20 rows (show's defaults, spelled out) of the departures-by-airport result
#*****Your Code Begins Here*********
df_departuresByAirport.show(n=20, truncate=True)

+--------------+--------------+-----+-----------------+
|origin_Airport|          city|State|origin_departures|
+--------------+--------------+-----+-----------------+
|           ATL|       Atlanta|   GA|            91484|
|           DFW|        Dallas|   TX|            68482|
|           ORD|       Chicago|   IL|            64228|
|           LAX|   Los Angeles|   CA|            54086|
|           DEN|        Denver|   CO|            53148|
|           IAH|       Houston|   TX|            43361|
|           PHX|       Phoenix|   AZ|            40155|
|           SFO| San Francisco|   CA|            39483|
|           LAS|     Las Vegas|   NV|            33107|
|           CLT|     Charlotte|   NC|            28402|
|           MCO|       Orlando|   FL|            28313|
|           EWR|        Newark|   NJ|            27656|
|           SLC|Salt Lake City|   UT|            25868|
|           LGA|      New York|   NY|            25458|
|           BOS|        Boston|   MA|           