### Exploring and Cleaning the Airports Code data

Source:  [Airports Code](https://datahub.io/core/airport-codes#data)


In [1]:
%%sh

# Pin the versions this notebook was developed against so a re-run installs the same packages.
pip install unidecode==1.1.1
pip install pycountry==20.7.3

Collecting unidecode
  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.1.1
Collecting pycountry
  Downloading https://files.pythonhosted.org/packages/76/73/6f1a412f14f68c273feea29a6ea9b9f1e268177d32e0e69ad6790d306312/pycountry-20.7.3.tar.gz (10.1MB)
Building wheels for collected packages: pycountry
  Running setup.py bdist_wheel for pycountry: started
  Running setup.py bdist_wheel for pycountry: finished with status 'done'
  Stored in directory: /root/.cache/pip/wheels/33/4e/a6/be297e6b83567e537bed9df4a93f8590ec01c1acfbcd405348
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3


### Handling the Latin letters

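- Round-trip the names through latin-1 (encode as latin-1, then decode as UTF-8) to restore the accented characters of the places in the airport codes data
    
    `Reason: The names are stored mis-encoded (e.g. "CorazÃ³n" instead of "Corazón"), which makes them hard to read`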

- Use unidecode to convert the Latin letters to English letters.
    
    `Reason: This is done in order to match municipalities with the city/locality from the airport codes data or demographics data, which do not contain any Latin letters.` 

This is shown with an example below:

In [2]:
# Sample airport name whose accented letters show up as two-character mojibake.
s = "CorazÃ³n de JesÃºs Airport"
# Undo the mis-decoding: back to raw bytes via Latin-1, then decode those bytes as UTF-8.
s.encode('latin-1').decode('utf-8')

'Corazón de Jesús Airport'

In [3]:
from unidecode import unidecode

# Repair the mojibake first (Latin-1 bytes re-read as UTF-8), then transliterate to English letters.
unidecode(s.encode('latin-1').decode('utf-8'))

'Corazon de Jesus Airport'

### Decoding the ISO_codes for the country and province

- Using the pycountry package to decode the iso_codes into the country and province/state/territory names

In [4]:
import pycountry as pyc

In [5]:
# Look up a subdivision by its full code (country prefix + region part).
print(pyc.subdivisions.get(code='IT-62'))

Subdivision(code='IT-62', country_code='IT', name='Lazio', parent_code=None, type='Region')


In [6]:
# Look up a country by its two-letter code.
pyc.countries.get(alpha_2='GB')

Country(alpha_2='GB', alpha_3='GBR', name='United Kingdom', numeric='826', official_name='United Kingdom of Great Britain and Northern Ireland')

In [70]:
# Example of a miss: the recorded output for 'GB-ENG' is None, so not every region code resolves.
print(pyc.subdivisions.get(code='GB-ENG'))

None


In [69]:
# Look up a country by three-letter code; no output was recorded for 'MOS'
# (consistent with the lookup returning None — TODO confirm).
pyc.countries.get(alpha_3='MOS')

In [9]:
# Subdivision lookup for a Mexican state code.
print(pyc.subdivisions.get(code='MX-BCN'))

Subdivision(code='MX-BCN', country_code='MX', name='Baja California', parent_code=None, type='State')


In [10]:
# Subdivision lookup for a South African province code.
print(pyc.subdivisions.get(code='ZA-NL'))

Subdivision(code='ZA-NL', country_code='ZA', name='Kwazulu-Natal', parent_code=None, type='Province')


In [11]:
# Subdivision lookup where the region part of the code is numeric.
print(pyc.subdivisions.get(code='PA-1'))

Subdivision(code='PA-1', country_code='PA', name='Bocas del Toro', parent_code=None, type='Province')


In [12]:
# Country lookup by two-letter code for the United States.
pyc.countries.get(alpha_2='US')

Country(alpha_2='US', alpha_3='USA', name='United States', numeric='840', official_name='United States of America')

In [71]:
# Subdivision lookup for a US code whose recorded type is 'District' rather than 'State'.
print(pyc.subdivisions.get(code='US-DC'))

Subdivision(code='US-DC', country_code='US', name='District of Columbia', parent_code=None, type='District')


In [14]:
# Country lookup by two-letter code for 'MP' (Northern Mariana Islands in the recorded output).
pyc.countries.get(alpha_2='MP')

Country(alpha_2='MP', alpha_3='MNP', name='Northern Mariana Islands', numeric='580', official_name='Commonwealth of the Northern Mariana Islands')

### Exploring and Cleaning the port_codes data

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
from unidecode import unidecode
import re

In [16]:
# Build (or reuse) the Spark session, requesting the hadoop-aws package via spark.jars.packages.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .getOrCreate()
)

In [17]:
#udf function to decode the Latin letters

def _repair_latin1_mojibake(text):
    """Repair UTF-8 text that was mis-decoded as Latin-1 (e.g. 'Ã³' -> 'ó').

    Args:
        text: the raw column value; may be None.

    Returns:
        "" for None/empty input; the repaired string when the value
        round-trips cleanly (encode as Latin-1, decode as UTF-8); otherwise
        the input unchanged.
    """
    if not text:
        return ""
    try:
        return text.encode("latin-1").decode("utf-8")
    except UnicodeError:
        # The value is not Latin-1 mojibake: it either contains characters
        # above U+00FF (encode fails) or its bytes are not valid UTF-8
        # (decode fails). Keep it as-is rather than failing the job or
        # inserting U+FFFD replacement characters.
        return text


decodeL_udf = F.udf(_repair_latin1_mojibake, T.StringType())

In [18]:
#udf function to transliterate the decoded Latin letters to English letters

def _to_english_letters(text):
    """Return unidecode(text), or "" when text is None or empty."""
    if text:
        return unidecode(text)
    return ""


decodeE_udf = F.udf(_to_english_letters, T.StringType())

In [19]:
# Load the raw airport-codes CSV: comma-delimited with a header row.
# No encoding option is set on the reader.
df = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .csv("../airport-codes_csv.csv")
)

In [20]:
# Add the repaired Latin columns first, then derive the English-letter columns from them.
df = (
    df
    .withColumn('nameL', decodeL_udf('name'))
    .withColumn('municipalityL', decodeL_udf('municipality'))
    .withColumn('nameE', decodeE_udf('nameL'))
    .withColumn('municipalityE', decodeE_udf('municipalityL'))
)

In [21]:
# Preview the first rows, including the four derived name/municipality columns.
df.limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",Total Rf Heliport,Bensalem,Total Rf Heliport,Bensalem
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",Aero B Ranch Airport,Leoti,Aero B Ranch Airport,Leoti
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",Lowell Field,Anchor Point,Lowell Field,Anchor Point
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",Epps Airpark,Harvest,Epps Airpark,Harvest
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087",Newport Hospital & Clinic Heliport,Newport,Newport Hospital & Clinic Heliport,Newport


In [22]:
# List the distinct values of the airport "type" column.
df.select("type").distinct().toPandas()

Unnamed: 0,type
0,large_airport
1,balloonport
2,seaplane_base
3,heliport
4,closed
5,medium_airport
6,small_airport


In [23]:
# Count US rows whose IATA code is "LOS".
df.where(F.col("iata_code") == "LOS").where(F.col("iso_country") == "US").count()

0

In [24]:
# Total number of rows in the airports DataFrame.
df.count()

55075

In [25]:
# Number of rows whose country code is "US".
df.where(F.col("iso_country") == "US").count()

22757

In [26]:
# Show every row recorded under the region code "BM-U-A".
df.where(F.col("iso_region") == "BM-U-A").toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE
0,BM-0001,closed,RNAS Boaz Island,,,BM,BM-U-A,Boaz Island,,,,"-64.84110260009766, 32.32080078125",RNAS Boaz Island,Boaz Island,RNAS Boaz Island,Boaz Island
1,BM-0002,closed,Naval Air Station Bermuda Annex,,,BM,BM-U-A,Morgan's Point,,,,"-64.85161590576172, 32.26987075805664",Naval Air Station Bermuda Annex,Morgan's Point,Naval Air Station Bermuda Annex,Morgan's Point
2,TXKF,medium_airport,L.F. Wade International International Airport,12.0,,BM,BM-U-A,Hamilton,TXKF,BDA,,"-64.67870330810547, 32.36399841308594",L.F. Wade International International Airport,Hamilton,L.F. Wade International International Airport,Hamilton


In [27]:
# Show every row whose raw municipality is exactly "Rome" (the same name occurs in several countries/states).
df.where(F.col("municipality") == "Rome").toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE
0,0OR6,small_airport,Rome Service Airport,3387,,US,US-OR,Rome,0OR6,,0OR6,"-117.62899780273438, 42.83399963378906",Rome Service Airport,Rome,Rome Service Airport,Rome
1,5NY4,small_airport,Stanwix Heights Airport,610,,US,US-NY,Rome,5NY4,,5NY4,"-75.43070220947266, 43.15760040283203",Stanwix Heights Airport,Rome,Stanwix Heights Airport,Rome
2,GA77,closed,Wallace Field,810,,US,US-GA,Rome,,,,"-85.112197, 34.135798",Wallace Field,Rome,Wallace Field,Rome
3,IT-0061,heliport,Capitalia Heliport,116,EU,IT,IT-62,Rome,LIAD,,,"12.381729, 41.824691",Capitalia Heliport,Rome,Capitalia Heliport,Rome
4,KK16,small_airport,Becks Grove Airport,450,,US,US-NY,Rome,KK16,,K16,"-75.60379791259766, 43.258399963378906",Becks Grove Airport,Rome,Becks Grove Airport,Rome
5,KREO,small_airport,Rome State Airport,4053,,US,US-OR,Rome,KREO,REO,REO,"-117.885002136, 42.5777015686",Rome State Airport,Rome,Rome State Airport,Rome
6,KRME,medium_airport,Griffiss International Airport,504,,US,US-NY,Rome,KRME,RME,RME,"-75.40699768, 43.23379898",Griffiss International Airport,Rome,Griffiss International Airport,Rome
7,KRMG,medium_airport,Richard B Russell Airport,644,,US,US-GA,Rome,KRMG,RMG,RMG,"-85.15799713130001, 34.3506011963",Richard B Russell Airport,Rome,Richard B Russell Airport,Rome
8,LIRA,large_airport,CiampinoâG. B. Pastine International Airport,427,EU,IT,IT-62,Rome,LIRA,CIA,RM12,"12.5949, 41.7994",Ciampino–G. B. Pastine International Airport,Rome,Ciampino-G. B. Pastine International Airport,Rome
9,LIRC,heliport,Centocelle Heliport,158,EU,IT,IT-62,Rome,LIRC,,,"12.5637, 41.8729",Centocelle Heliport,Rome,Centocelle Heliport,Rome


### Exploring the Latin letters and corresponding English letters

Latin letters in: `nameL, municipalityL`

Corresponding English letters in: `nameE, municipalityE`

In [28]:
# Preview Mexican rows to compare the raw, Latin (L) and English (E) name columns side by side.
df.where(F.col("iso_country") == "MX").limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE
0,AMC,small_airport,Mar de CortÃ©s International Airport,71,,MX,MX-SON,Puerto PeÃ±asco,MMPE,PPE,83550,"-113.305177, 31.351987",Mar de Cortés International Airport,Puerto Peñasco,Mar de Cortes International Airport,Puerto Penasco
1,BHL,small_airport,BahÃ­a de los Ãngeles Airport,34,,MX,MX-BCN,BahÃ­a de los Ãngeles,,BHL,BAX,"-113.560997, 28.9786",Bahía de los Ángeles Airport,Bahía de los Ángeles,Bahia de los Angeles Airport,Bahia de los Angeles
2,BLM,small_airport,Bahia De Los Angelos South,33,,MX,MX-BCN,,,,BLM,"-113.528723717, 28.891952244799995",Bahia De Los Angelos South,,Bahia De Los Angelos South,
3,CYD,small_airport,San Ignacio Downtown Airstrip,575,,MX,MX-BCS,MulegÃ©,,,,"-112.8851, 27.2906",San Ignacio Downtown Airstrip,Mulegé,San Ignacio Downtown Airstrip,Mulege
4,LOM,small_airport,Francisco Primo de Verdad y Ramos Airport,6227,,MX,MX-JAL,Lagos de Moreno,,LOM,LMO,"-101.9441, 21.2581",Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno,Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno


### Exploring "St" occurrences in municipality


In [31]:
def _mentions_st(text):
    """True when text is non-empty and contains the substring "St " (case-sensitive)."""
    return bool(text and re.search("St ", text))


filterSt = F.udf(_mentions_st, T.BooleanType())

In [32]:
# Preview rows whose raw municipality contains "St ".
df.filter(filterSt("municipality")).limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE
0,02KS,small_airport,Jmj Landing Airport,1170,,US,US-KS,St Marys,02KS,,02KS,"-96.0552978515625, 39.222198486328125",Jmj Landing Airport,St Marys,Jmj Landing Airport,St Marys
1,06LS,heliport,Tembec Heliport,85,,US,US-LA,St Francisville,06LS,,06LS,"-91.32019805908203, 30.70789909362793",Tembec Heliport,St Francisville,Tembec Heliport,St Francisville
2,09AZ,small_airport,Stronghold Airport,4970,,US,US-AZ,St David,09AZ,,09AZ,"-110.03800201416016, 31.92530059814453",Stronghold Airport,St David,Stronghold Airport,St David
3,09LS,heliport,West Feliciana Sheriff's Office Heliport,181,,US,US-LA,St Francisville,09LS,,09LS,"-91.38459777832031, 30.824399948120117",West Feliciana Sheriff's Office Heliport,St Francisville,West Feliciana Sheriff's Office Heliport,St Francisville
4,0LL5,small_airport,Busboom RLA Restricted Landing Area,675,,US,US-IL,St Joseph,0LL5,,0LL5,"-88.07839965820001, 40.1100006104",Busboom RLA Restricted Landing Area,St Joseph,Busboom RLA Restricted Landing Area,St Joseph


### Verifying "Ident" column is unique

In [33]:
# Total row count, to compare against the number of distinct ident values computed next.
df.count()

55075

In [34]:
# Number of distinct ident values; equal to the total row count when ident is unique.
df.select("ident").distinct().count()

55075

### Obtaining the country and state expansions from the iso_country and iso_region columns using the pycountry package

In [35]:
def _country_name(alpha_2):
    """Expand a two-letter country code to the pycountry country name.

    Args:
        alpha_2: value of the country-code column; may be None.

    Returns:
        The country name when pycountry finds the code; otherwise the input
        itself (None stays None).
    """
    if alpha_2 is None:
        # A null code is passed through without calling pycountry.
        return None
    # Query pycountry once per row instead of once for the test and again for the value.
    country = pyc.countries.get(alpha_2=alpha_2)
    return country.name if country is not None else alpha_2


country_udf = F.udf(_country_name, T.StringType())

In [36]:
def _subdivision_name(code):
    """Expand a region code (e.g. 'US-IL') to the pycountry subdivision name.

    Args:
        code: value of the region-code column; may be None.

    Returns:
        The subdivision name when pycountry finds the code; "" when the code
        is None or is not found.
    """
    if code is None:
        # A null code is treated like an unknown code, without calling pycountry.
        return ""
    # Query pycountry once per row instead of once for the test and again for the value.
    subdivision = pyc.subdivisions.get(code=code)
    return subdivision.name if subdivision is not None else ""


state_udf = F.udf(_subdivision_name, T.StringType())

#### Testing on small data

In [37]:
# Small hand-made sample for trying out state_udf / country_udf before applying them to the full data.
from pyspark.sql import Row
# Each tuple is (name, iso_country, iso_code); 'BM-U-A' is a region code that also occurs in the airports data.
l = [('Ankit','US', 'US-IL'),('Jalfaizy','MX', 'MX-JAL'),('saurabh','IT', 'IT-62'),('Bala','BM','BM-U-A')]
rdd = spark.sparkContext.parallelize(l)
# Turn each tuple into a Row with named fields; no explicit schema is passed to createDataFrame.
people = rdd.map(lambda x: Row(name=x[0], iso_country=x[1], iso_code=x[2]))
schemaPeople = spark.createDataFrame(people)

In [38]:
# Show the sample rows before the lookup columns are added.
schemaPeople.show()

+--------+-----------+--------+
|iso_code|iso_country|    name|
+--------+-----------+--------+
|   US-IL|         US|   Ankit|
|  MX-JAL|         MX|Jalfaizy|
|   IT-62|         IT| saurabh|
|  BM-U-A|         BM|    Bala|
+--------+-----------+--------+



In [39]:
# Apply both lookups to the sample: state from iso_code first, then country from iso_country.
schemaPeople = (
    schemaPeople
    .withColumn('state', state_udf('iso_code'))
    .withColumn('country', country_udf('iso_country'))
)

In [40]:
# Show the sample with the new state/country columns ('BM-U-A' yields an empty state in the recorded output).
schemaPeople.show()

+--------+-----------+--------+--------+-------------+
|iso_code|iso_country|    name|   state|      country|
+--------+-----------+--------+--------+-------------+
|   US-IL|         US|   Ankit|Illinois|United States|
|  MX-JAL|         MX|Jalfaizy| Jalisco|       Mexico|
|   IT-62|         IT| saurabh|   Lazio|        Italy|
|  BM-U-A|         BM|    Bala|        |      Bermuda|
+--------+-----------+--------+--------+-------------+



#### Implementing it on whole data

In [41]:
# Add a 'country' column by passing iso_country through country_udf.
df = df.withColumn('country', country_udf('iso_country'))

In [42]:
# Preview the first rows with the new 'country' column.
df.limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",Total Rf Heliport,Bensalem,Total Rf Heliport,Bensalem,United States
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",Aero B Ranch Airport,Leoti,Aero B Ranch Airport,Leoti,United States
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",Lowell Field,Anchor Point,Lowell Field,Anchor Point,United States
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",Epps Airpark,Harvest,Epps Airpark,Harvest,United States
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087",Newport Hospital & Clinic Heliport,Newport,Newport Hospital & Clinic Heliport,Newport,United States


In [43]:
# Add a 'state' column by passing iso_region through state_udf.
df = df.withColumn('state', state_udf('iso_region'))

In [44]:
# Preview the first rows with the new 'state' column.
df.limit(5).toPandas()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",Total Rf Heliport,Bensalem,Total Rf Heliport,Bensalem,United States,Pennsylvania
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",Aero B Ranch Airport,Leoti,Aero B Ranch Airport,Leoti,United States,Kansas
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",Lowell Field,Anchor Point,Lowell Field,Anchor Point,United States,Alaska
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",Epps Airpark,Harvest,Epps Airpark,Harvest,United States,Alabama
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087",Newport Hospital & Clinic Heliport,Newport,Newport Hospital & Clinic Heliport,Newport,United States,Arkansas


In [45]:
# Number of distinct values in the derived 'state' column.
df.select("state").distinct().count()

2442

In [146]:
#### Saving the dataframe as a pipe-delimited CSV file
#### The decoded Latin columns (nameL, municipalityL) are kept, so these drops stay disabled:
# df = df.drop('nameL')
# df = df.drop('municipalityL')

In [54]:
# Drop the raw name/municipality columns; the decoded L and E variants are kept.
df = df.drop('name', 'municipality')

In [55]:
# Print the final schema that will be written out.
df.printSchema()

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- elevation_ft: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)
 |-- nameL: string (nullable = true)
 |-- municipalityL: string (nullable = true)
 |-- nameE: string (nullable = true)
 |-- municipalityE: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)



In [56]:
df.write\
    .option("encoding", "ISO-8859-1")\
    .option("sep","|")\
    .option("header", True)\
    .csv('../Cleaned Data/Airports_Data')

### Reading the saved file to Check how it is stored

In [57]:
df_r = spark.read.options(delimiter="|", header="true", encoding="ISO-8859-1")\
                .csv(path="../Cleaned Data/Airports_Data/")

In [58]:
# Preview the first rows of the data read back from disk.
df_r.limit(5).toPandas()

Unnamed: 0,ident,type,elevation_ft,continent,iso_country,iso_region,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state
0,00A,heliport,11,,US,US-PA,00A,,00A,"-74.93360137939453, 40.07080078125",Total Rf Heliport,Bensalem,Total Rf Heliport,Bensalem,United States,Pennsylvania
1,00AA,small_airport,3435,,US,US-KS,00AA,,00AA,"-101.473911, 38.704022",Aero B Ranch Airport,Leoti,Aero B Ranch Airport,Leoti,United States,Kansas
2,00AK,small_airport,450,,US,US-AK,00AK,,00AK,"-151.695999146, 59.94919968",Lowell Field,Anchor Point,Lowell Field,Anchor Point,United States,Alaska
3,00AL,small_airport,820,,US,US-AL,00AL,,00AL,"-86.77030181884766, 34.86479949951172",Epps Airpark,Harvest,Epps Airpark,Harvest,United States,Alabama
4,00AR,closed,237,,US,US-AR,,,,"-91.254898, 35.6087",Newport Hospital & Clinic Heliport,Newport,Newport Hospital & Clinic Heliport,Newport,United States,Arkansas


In [59]:
# Check that accented names survived the save/load round trip by previewing Mexican rows.
df_r.where(F.col("iso_country") == "MX").limit(5).toPandas()

Unnamed: 0,ident,type,elevation_ft,continent,iso_country,iso_region,gps_code,iata_code,local_code,coordinates,nameL,municipalityL,nameE,municipalityE,country,state
0,AMC,small_airport,71,,MX,MX-SON,MMPE,PPE,83550,"-113.305177, 31.351987",Mar de Cortés International Airport,Puerto Peñasco,Mar de Cortes International Airport,Puerto Penasco,Mexico,Sonora
1,BHL,small_airport,34,,MX,MX-BCN,,BHL,BAX,"-113.560997, 28.9786",Bahía de los Ángeles Airport,Bahía de los Ángeles,Bahia de los Angeles Airport,Bahia de los Angeles,Mexico,Baja California
2,BLM,small_airport,33,,MX,MX-BCN,,,BLM,"-113.528723717, 28.891952244799995",Bahia De Los Angelos South,,Bahia De Los Angelos South,,Mexico,Baja California
3,CYD,small_airport,575,,MX,MX-BCS,,,,"-112.8851, 27.2906",San Ignacio Downtown Airstrip,Mulegé,San Ignacio Downtown Airstrip,Mulege,Mexico,Baja California Sur
4,LOM,small_airport,6227,,MX,MX-JAL,,LOM,LMO,"-101.9441, 21.2581",Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno,Francisco Primo de Verdad y Ramos Airport,Lagos de Moreno,Mexico,Jalisco
