In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, lower, upper, col, trim, year, month, dayofweek, translate, concat, regexp_replace, \
date_format, date_add
from pyspark.sql.types import StringType , IntegerType
import datetime, math 
import pyspark

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Main ETL for the Capstone Project")
    .getOrCreate()
)
    


In [73]:
#
#
#  ------  PORT LOCATION TABLE
#
#

# Load the raw port-code records (each line of the file is parsed as one JSON
# document) and the state lookup table used to resolve state codes to names.
textFile = spark.sparkContext.textFile("Datasets/Immigration Data/prtlCodes.json")
dfPortCodes = spark.read.json(textFile)
dfStateCode = spark.read.options(header=True, delimiter=',', inferSchema='True').csv("Datasets/States/State_Codes.csv")

# `name` is split on the comma: the first part becomes the lower-cased,
# trimmed municipality and the second part the upper-cased, trimmed statecode.
nameSplit = split(dfPortCodes.name, ',')
dfPortCodes = dfPortCodes.withColumn('municipality', trim(lower(nameSplit.getItem(0)))) \
                         .withColumn('statecode', trim(upper(nameSplit.getItem(1))))

dfPortCodes = dfPortCodes.drop("name")

# A name without a comma has no second element, so its statecode is NULL.
# Drop those rows explicitly. The previous `statecode != 'None'` test only
# removed them as a side effect of NULL comparison semantics (the upper-cased
# values can never equal the mixed-case string 'None').
dfPortCodes = dfPortCodes.filter(dfPortCodes.statecode.isNotNull())

# Left join keeps ports whose statecode is not in the lookup (their State is NULL).
dfPortCodes = dfPortCodes.join(dfStateCode, dfPortCodes.statecode == dfStateCode["Alpha code"], "left")
dfPortCodes = dfPortCodes.drop("Alpha code").withColumn("State", lower(dfPortCodes.State))

# Registered as a view because later cells query PORTCODES through spark.sql.
dfPortCodes.createOrReplaceTempView("PORTCODES")

# Spot-check a single municipality instead of displaying the whole table.
dfPortCodes.filter(dfPortCodes.municipality == 'toronto').toPandas()



Unnamed: 0,code,municipality,statecode,State
0,TOR,toronto,CANADA,


In [68]:
#
#
#  ------  AIRPORTS TABLE
#
#

dfAirPorts = spark.read.option("header", True).csv("Datasets/Airport Code/airport-codes_csv.csv")

# Our analysis will be set for US only; heliports and closed fields are excluded.
dfAirPorts = dfAirPorts.filter( (dfAirPorts.iso_country == 'US') & (dfAirPorts.type != 'heliport') & (dfAirPorts.type != 'closed') )
latLongSplit = split(dfAirPorts.coordinates, ',')
isoRegSplit = split(dfAirPorts.iso_region, '-')

# The first coordinate element is stored as longitude and the second as latitude.
# Both are trimmed so that any whitespace around the comma separator does not
# end up inside the values. statecode is the part of iso_region after the dash.
dfAirPorts = dfAirPorts.withColumn('longitude', trim(latLongSplit.getItem(0))) \
        .withColumn('latitude', trim(latLongSplit.getItem(1))) \
        .withColumn('statecode', upper(isoRegSplit.getItem(1))) \
        .withColumn('municipality', lower(dfAirPorts.municipality))

# Drop the columns that were split apart above or are not needed downstream.
# iso_country is kept even though every remaining row is 'US'.
dfAirPorts = dfAirPorts.drop("iso_region", "continent", "coordinates", "gps_code", "local_code")

dfAirPorts.createOrReplaceTempView("AIRPORTS_VIEW")

# Match airports to port codes on (statecode, municipality) and keep, per city,
# the rows whose `type` sorts first alphabetically (large < medium < small).
# The window is partitioned by statecode AND municipality, the same key as the
# join, so that same-named cities in different states are ranked independently.
# RANK keeps ties, so a city can still yield several airports of the same type.
dfAirPortJoinPortLocation = spark.sql("""select * from (
                                      SELECT T1.* , RANK () OVER (PARTITION BY statecode, municipality ORDER BY type ASC) as rnk1 FROM  (
                                      SELECT AIR.*, LOC.CODE FROM AIRPORTS_VIEW as AIR 
                                      JOIN PORTCODES as LOC 
                                      ON
                                      AIR.statecode = LOC.statecode
                                      AND
                                      AIR.municipality = LOC.municipality
                                      WHERE TYPE IN ('large_airport', 'medium_airport', 'small_airport')
                                      ) T1
                                      ) where rnk1 = 1
                                      """)

# Registered as a view because the immigration cell joins against AIRPORTS_VIEW2.
dfAirPortJoinPortLocation.createOrReplaceTempView("AIRPORTS_VIEW2")

dfAirPortJoinPortLocation.toPandas()


Unnamed: 0,ident,type,name,elevation_ft,iso_country,municipality,iata_code,longitude,latitude,statecode,CODE,rnk1
0,KCXL,small_airport,Calexico International Airport,4,US,calexico,CXL,-115.513000488,32.6694984436,CA,CAL,1
1,61CL,small_airport,Johnson Brothers Airport,-1,US,calexico,,-115.55899810791016,32.67340087890625,CA,CAL,1
2,1RL,small_airport,Point Roberts Airpark,10,US,point roberts,,-123.0790023803711,48.979698181152344,WA,PIR,1
3,0OI3,small_airport,Galloway Airport,643,US,sandusky,,-82.71610260009766,41.38209915161133,OH,SDY,1
4,KPNS,medium_airport,Pensacola Regional Airport,121,US,pensacola,PNS,-87.186599731445,30.473400115967,FL,PEN,1
...,...,...,...,...,...,...,...,...,...,...,...,...
292,KOGS,small_airport,Ogdensburg International Airport,297,US,ogdensburg,OGS,-75.46549987790002,44.6819000244,NY,OGD,1
293,KSGJ,medium_airport,Northeast Florida Regional Airport,10,US,st augustine,UST,-81.339798,29.9592,FL,SAU,1
294,NY69,small_airport,John Gonzales Field,260,US,cape vincent,,-76.30130004882812,44.12009811401367,NY,CAP,1
295,KSFO,large_airport,San Francisco International Airport,13,US,san francisco,SFO,-122.375,37.61899948120117,CA,SFR,1


In [10]:
#
#
#  ------  USATEMP  TABLE
#
#

dfTemps = spark.read.options(header=True, delimiter=',', inferSchema='True').csv("Datasets/Tempature/GlobalLandTemperaturesByState.csv")

# Keep US rows that actually carry a temperature reading. Missing readings are
# filtered with explicit NULL / NaN checks. The previous `!= 'Nan'` comparison
# relied on an implicit string-to-double cast of the literal; if that cast
# yields NULL the comparison is NULL for every row and the frame ends up empty.
dfTemps = dfTemps.filter(dfTemps.Country == 'United States') \
                 .filter("AverageTemperature IS NOT NULL AND NOT isnan(AverageTemperature)")

# Derive calendar parts from the reading date and normalise the state name to
# lower case, matching the lower-cased state names produced in other cells.
dfTemps = dfTemps.withColumn('year', year(dfTemps.dt)) \
          .withColumn('month', month(dfTemps.dt))\
          .withColumn('dayOfweek', dayofweek(dfTemps.dt))\
          .withColumn('State', lower (dfTemps.State)) 

# Row id = reading date with the dashes removed, followed by the lower-cased state.
dfTemps = dfTemps.withColumn("id", concat(translate(dfTemps["dt"], "-" , ""), dfTemps.State ))

dfTemps = dfTemps.drop("dt", "Country")

# Exposed as the USATEMP view for SQL access.
dfTemps.createOrReplaceTempView("USATEMP")

dfTemps.printSchema()
dfTemps.toPandas()





root
 |-- AverageTemperature: double (nullable = true)
 |-- AverageTemperatureUncertainty: double (nullable = true)
 |-- State: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayOfweek: integer (nullable = true)
 |-- id: string (nullable = true)



Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty,State,year,month,dayOfweek,id
0,10.722,2.898,alabama,1743,11,6,17431101alabama
1,19.075,2.902,alabama,1744,4,4,17440401alabama
2,21.197,2.844,alabama,1744,5,6,17440501alabama
3,25.290,2.879,alabama,1744,6,2,17440601alabama
4,26.420,2.841,alabama,1744,7,4,17440701alabama
...,...,...,...,...,...,...,...
141925,10.607,0.208,wyoming,2013,5,4,20130501wyoming
141926,16.267,0.276,wyoming,2013,6,7,20130601wyoming
141927,20.222,0.133,wyoming,2013,7,2,20130701wyoming
141928,19.621,0.217,wyoming,2013,8,5,20130801wyoming


In [11]:
#
#
#  ------  USADEMOGRAPHICS  TABLE
#
#

# Read the semicolon-delimited city demographics file (header row, no schema inference).
dfCityDemo = spark.read.options(header=True, delimiter=';').csv("Datasets/US Demographics/us-cities-demographics.csv")

# Lower-case the two columns that are later compared with PORTCODES values.
for lowered_column in ('City', 'State'):
    dfCityDemo = dfCityDemo.withColumn(lowered_column, lower(col(lowered_column)))

# Source header -> snake_case column name, applied in this order.
column_renames = (
    ('City', 'city'),
    ('State', 'state'),
    ('Median Age', 'median_age'),
    ('Male Population', 'male_population'),
    ('Female Population', 'female_population'),
    ('Total Population', 'total_population'),
    ('Number of Veterans', 'number_of_veterans'),
    ('Foreign-born', 'foreign_born'),
    ('Average Household Size', 'average_household_size'),
    ('State Code', 'state_code'),
    ('Race', 'race'),
    ('Count', 'count'),
)
for source_name, target_name in column_renames:
    dfCityDemo = dfCityDemo.withColumnRenamed(source_name, target_name)

# Keep only the cities whose (state code, name) pair also appears in PORTCODES.
dfCityDemo.createOrReplaceTempView("USADEMOGRAPHICS")
dfCityDemo = spark.sql("SELECT * FROM USADEMOGRAPHICS WHERE (state_code, city) in (select statecode, municipality from PORTCODES)")

dfCityDemo.toPandas()

Unnamed: 0,city,state,median_age,male_population,female_population,total_population,number_of_veterans,foreign_born,average_household_size,state_code,race,count
0,duluth,minnesota,34.7,41271,44855,86126,5015,2258,2.25,MN,Asian,1721
1,duluth,minnesota,34.7,41271,44855,86126,5015,2258,2.25,MN,American Indian and Alaska Native,1894
2,duluth,minnesota,34.7,41271,44855,86126,5015,2258,2.25,MN,Hispanic or Latino,1981
3,duluth,minnesota,34.7,41271,44855,86126,5015,2258,2.25,MN,Black or African-American,3891
4,duluth,minnesota,34.7,41271,44855,86126,5015,2258,2.25,MN,White,81720
...,...,...,...,...,...,...,...,...,...,...,...,...
546,richmond,virginia,33.6,104793,115496,220289,12538,15741,2.29,VA,Asian,6626
547,richmond,virginia,33.6,104793,115496,220289,12538,15741,2.29,VA,Black or African-American,109722
548,richmond,virginia,33.6,104793,115496,220289,12538,15741,2.29,VA,American Indian and Alaska Native,4902
549,richmond,virginia,33.6,104793,115496,220289,12538,15741,2.29,VA,White,104568


In [72]:
#
#
#  ------  IMMIGRATION  TABLE
#
#

# Parquet files carry their own schema, so no schema-inference option is needed.
dfImmigration = spark.read.parquet("Datasets/Immigration Data/*.snappy.parquet")

# Keep only rows with i94mode == 1.
# NOTE(review): presumably mode 1 means arrival by air - confirm against the I94 data dictionary.
dfImmigration = dfImmigration.filter(dfImmigration["i94mode"] == 1)

# Strip everything from the decimal point onwards and cast to integer.
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# without relying on Python tolerating an invalid escape sequence.
dfImmigration = dfImmigration \
                .withColumn("arrdate", regexp_replace(dfImmigration["arrdate"], r'\..*$', '').cast(IntegerType()) ) \
                .withColumn("depdate", regexp_replace(dfImmigration["depdate"], r'\..*$', '').cast(IntegerType()) )


dfImmigration.createOrReplaceTempView("dateTable")

# arrdate / depdate are treated as day offsets from 1960-01-01; only the
# day-of-month of the resulting date is kept.
dfImmigration = spark.sql("""
                    SELECT T1.*, 
                    day(date_add('1960-01-01', arrdate ) ) as arrday,
                    day(date_add('1960-01-01', depdate) ) as depday
                    FROM dateTable T1 
                    """)

dfImmigration = dfImmigration.drop("arrdate","depdate","i94mode","count", "admnum",  \
                                   "entdepa","entdepd","entdepu","matflag","insnum")

# Re-register the view so that it reflects the derived and dropped columns.
dfImmigration.createOrReplaceTempView("dateTable")

# Count rows per port of entry, keep ports with more than 20000 rows, and attach
# the matching airports from AIRPORTS_VIEW2 (registered by the airports cell).
# The LEFT JOIN keeps ports that have no matching airport. Note that this
# reassigns dfImmigration to the per-port summary, which does not carry the
# arrday / depday columns computed above.
dfImmigration = spark.sql("""

                    SELECT * FROM (
                    SELECT count(*) AS CNT, i94port
                    FROM dateTable T1 
                    GROUP BY 
                    i94port HAVING COUNT (*) > 20000
                    ) T1
                    LEFT JOIN AIRPORTS_VIEW2 PC
                    ON 
                    T1.i94port = PC.code
                    ORDER BY CNT DESC
                    """)


dfImmigration.toPandas()

Unnamed: 0,CNT,i94port,ident,type,name,elevation_ft,iso_country,municipality,iata_code,longitude,latitude,statecode,CODE,rnk1
0,484299,NYC,US-0883,large_airport,JFK,,US,new york,,0.0,0.0,NY,NYC,1.0
1,484299,NYC,KLGA,large_airport,La Guardia Airport,21.0,US,new york,LGA,-73.87259674,40.77719879,NY,NYC,1.0
2,484299,NYC,KJFK,large_airport,John F Kennedy International Airport,13.0,US,new york,JFK,-73.77890015,40.63980103,NY,NYC,1.0
3,340361,MIA,KMIA,large_airport,Miami International Airport,8.0,US,miami,MIA,-80.29060363769531,25.79319953918457,FL,MIA,1.0
4,308939,LOS,KLAX,large_airport,Los Angeles International Airport,125.0,US,los angeles,LAX,-118.4079971,33.94250107,CA,LOS,1.0
5,151904,SFR,KSFO,large_airport,San Francisco International Airport,13.0,US,san francisco,SFO,-122.375,37.61899948120117,CA,SFR,1.0
6,147752,ORL,KSFB,large_airport,Orlando Sanford International Airport,55.0,US,orlando,SFB,-81.23750305175781,28.7775993347168,FL,ORL,1.0
7,147752,ORL,KMCO,large_airport,Orlando International Airport,96.0,US,orlando,MCO,-81.30899810791016,28.429399490356445,FL,ORL,1.0
8,135636,NEW,,,,,,,,,,,,
9,135565,HHW,PHNL,large_airport,Daniel K Inouye International Airport,13.0,US,honolulu,HNL,-157.924228,21.32062,HI,HHW,1.0
