### Exploring Airport data

#### Loading the Airport codes data

In [20]:
import pandas as pd
import numpy as np

# Let pandas render up to 500 columns so wide previews are not truncated.
pd.options.display.max_columns = 500

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from unidecode import unidecode
import re
import datetime as dt

In [2]:
# Create the Spark session (or reuse an already-running one). The hadoop-aws
# artifact is requested through spark.jars.packages.
# NOTE(review): presumably for S3 access; no S3 path is read in the cells shown here.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [62]:
# Load the cleaned I94 port-code table: comma-separated, first row is the header.
df_apc = (
    spark.read
    .options(delimiter=",", header=True)
    .csv("Cleaned Data/I94_ports.csv")
)

In [63]:
# Preview the first five port-code rows, converted to pandas so the notebook renders a table.
df_apc.limit(5).toPandas()

Unnamed: 0,code,port,locality,province,territory
0,CLG,,Calgary,Alberta,Canada
1,EDA,,Edmonton,Alberta,Canada
2,YHC,,Hakai pass,British Columbia,Canada
3,HAL,,Halifax,Nova Scotia,Canada
4,MON,,Montreal,Quebec,Canada


#### Loading the Airport data

In [14]:
# Load every airport CSV in the folder into one dataframe:
# pipe-delimited, first row is the header, decoded as ISO-8859-1.
df_apd = (
    spark.read
    .options(delimiter="|", header=True, encoding="ISO-8859-1")
    .csv("Cleaned Data/Airports_Data/*.csv")
)

In [35]:
# Preview the first five airport rows, converted to pandas so the notebook renders a table.
df_apd.limit(5).toPandas()

Unnamed: 0,ident,type,elevation_ft,continent,iso_country,iso_region,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state
0,00A,heliport,11,,US,US-PA,00A,,00A,"-74.93360137939453, 40.07080078125",Total Rf Heliport,Bensalem,Total Rf Heliport,Bensalem,United States,Pennsylvania
1,00AA,small_airport,3435,,US,US-KS,00AA,,00AA,"-101.473911, 38.704022",Aero B Ranch Airport,Leoti,Aero B Ranch Airport,Leoti,United States,Kansas
2,00AK,small_airport,450,,US,US-AK,00AK,,00AK,"-151.695999146, 59.94919968",Lowell Field,Anchor Point,Lowell Field,Anchor Point,United States,Alaska
3,00AL,small_airport,820,,US,US-AL,00AL,,00AL,"-86.77030181884766, 34.86479949951172",Epps Airpark,Harvest,Epps Airpark,Harvest,United States,Alabama
4,00AR,closed,237,,US,US-AR,,,,"-91.254898, 35.6087",Newport Hospital & Clinic Heliport,Newport,Newport Hospital & Clinic Heliport,Newport,United States,Arkansas


In [16]:
# Sample a few Mexican airports. The *L columns appear to keep the accented
# local spelling while the *E columns hold an unaccented form.
(
    df_apd
    .filter(df_apd["iso_country"] == "MX")
    .limit(5)
    .toPandas()
)

Unnamed: 0,ident,type,elevation_ft,continent,iso_country,iso_region,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state
0,AMC,small_airport,71,,MX,MX-SON,MMPE,PPE,83550,"-113.305177, 31.351987",Mar de Cortés International Airport,Puerto Peñasco,Mar de Cortes International Airport,Puerto Penasco,Mexico,Sonora
1,BHL,small_airport,34,,MX,MX-BCN,,BHL,BAX,"-113.560997, 28.9786",Bahía de los Ángeles Airport,Bahía de los Ángeles,Bahia de los Angeles Airport,Bahia de los Angeles,Mexico,Baja California
2,BLM,small_airport,33,,MX,MX-BCN,,,BLM,"-113.528723717, 28.891952244799995",Bahia De Los Angelos South,,Bahia De Los Angelos South,,Mexico,Baja California
3,CYD,small_airport,575,,MX,MX-BCS,,,,"-112.8851, 27.2906",San Ignacio Downtown Airstrip,Mulegé,San Ignacio Downtown Airstrip,Mulege,Mexico,Baja California Sur
4,LOM,small_airport,6227,,MX,MX-JAL,,LOM,LMO,"-101.9441, 21.2581",Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno,Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno,Mexico,Jalisco


    Merging both dataframes on city, state, and country

In [56]:
# How many airport rows are in the United States?
(
    df_apd
    .filter(df_apd["country"] == "United States")
    .count()
)

22757

In [57]:
# How many port-code rows are in the United States?
(
    df_apc
    .filter(df_apc["territory"] == "United States")
    .count()
)

538

In [64]:
# An airport matches a port only when city, state/province and country all agree.
cond = [
    df_apc.locality == df_apd.municipalityE,
    df_apc.province == df_apd.state,
    df_apc.territory == df_apd.country,
]

# Inner join keeps only matched rows. The port table carries a broadcast hint
# (presumably because it is the much smaller side of the join).
# The match is one-to-many: an airport is repeated once per port code that
# shares its city, so the result can have more rows than distinct airports.
df_merged = df_apd.join(F.broadcast(df_apc), on=cond, how="inner")

In [65]:
# Preview the joined rows; the same airport can appear once per matching port code.
df_merged.limit(5).toPandas()

Unnamed: 0,ident,type,elevation_ft,continent,iso_country,iso_region,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state,code,port,locality,province,territory
0,00MT,closed,2600,,US,US-MT,,,,"-109.705002, 48.537498",Sands Ranch Airport,Havre,Sands Ranch Airport,Havre,United States,Montana,HVR,,Havre,Montana,United States
1,00MT,closed,2600,,US,US-MT,,,,"-109.705002, 48.537498",Sands Ranch Airport,Havre,Sands Ranch Airport,Havre,United States,Montana,WHM,Wild Horse Border Crossing,Havre,Montana,United States
2,00WY,heliport,5210,,US,US-WY,00WY,,00WY,"-106.224443, 42.840361",Mountain View Regional Hospital Heliport,Casper,Mountain View Regional Hospital Heliport,Casper,United States,Wyoming,CSP,,Casper,Wyoming,United States
3,01CN,heliport,300,,US,US-CA,01CN,,01CN,"-118.15399932861328, 34.03779983520508",Los Angeles County Sheriff's Department Heliport,Los Angeles,Los Angeles County Sheriff's Department Heliport,Los Angeles,United States,California,LOS,,Los Angeles,California,United States
4,01FA,small_airport,55,,US,US-FL,01FA,,01FA,"-81.14420318603516, 28.589399337768555",Rybolt Ranch Airport,Orlando,Rybolt Ranch Airport,Orlando,United States,Florida,ORL,,Orlando,Florida,United States


In [66]:
# Total number of airport/port pairs produced by the join.
df_merged.count()

2693

In [40]:
# List the joined schema: airport columns first, followed by the port-code columns.
df_merged.columns

['ident',
 'type',
 'elevation_ft',
 'continent',
 'iso_country',
 'iso_region',
 'gps_code',
 'iata_code',
 'local_code',
 'coordinates',
 'nameL',
 'municipalityL',
 'nameE',
 'municipalityE',
 'country',
 'state',
 'code',
 'port',
 'locality',
 'province',
 'territory']

In [67]:
# Count the distinct (city, state, country) combinations present after the join.
(
    df_merged
    .select("locality", "province", "country")
    .distinct()
    .count()
)

362

In [70]:
# Same distinct count, restricted to United States rows (city + state only).
(
    df_merged
    .filter(df_merged["country"] == "United States")
    .select("locality", "province")
    .distinct()
    .count()
)

341

In [74]:
# Preview the joined data with a reduced column set: the airport attributes
# plus the port-side locality/province and the port code.
(
    df_merged
    .select(
        "ident", "type", "elevation_ft", "continent",
        "gps_code", "iata_code", "local_code", "coordinates",
        "nameL", "municipalityL", "nameE",
        "locality", "province", "country", "code",
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,ident,type,elevation_ft,continent,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,locality,province,country,code
0,00MT,closed,2600,,,,,"-109.705002, 48.537498",Sands Ranch Airport,Havre,Sands Ranch Airport,Havre,Montana,United States,HVR
1,00MT,closed,2600,,,,,"-109.705002, 48.537498",Sands Ranch Airport,Havre,Sands Ranch Airport,Havre,Montana,United States,WHM
2,00WY,heliport,5210,,00WY,,00WY,"-106.224443, 42.840361",Mountain View Regional Hospital Heliport,Casper,Mountain View Regional Hospital Heliport,Casper,Wyoming,United States,CSP
3,01CN,heliport,300,,01CN,,01CN,"-118.15399932861328, 34.03779983520508",Los Angeles County Sheriff's Department Heliport,Los Angeles,Los Angeles County Sheriff's Department Heliport,Los Angeles,California,United States,LOS
4,01FA,small_airport,55,,01FA,,01FA,"-81.14420318603516, 28.589399337768555",Rybolt Ranch Airport,Orlando,Rybolt Ranch Airport,Orlando,Florida,United States,ORL
