## 1. Upload raw data files into S3

In [None]:
!pip install pyspark

In [None]:
!pip install boto3

1.1 Imports and Configs 

In [1]:
import pandas as pd

import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, to_timestamp, to_date

In [2]:
# Load AWS credentials and data locations from the project configuration file.
CONFIG_FILE = 'configs/global.cfg'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail fast here instead of hitting a confusing
# NoSectionError on the first config.get() below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError("Could not read configuration file: " + CONFIG_FILE)

# AWS credentials.
KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

# Root folders for input and output data.
input_path = config.get('PATH', 'INPUT_DATA_FOLDER')
output_path = config.get('PATH', 'OUTPUT_DATA_FOLDER')

# Raw data folders are built by plain string concatenation, so any separator
# between the two parts must already be present in the configured values.
raw_flight_data_path = input_path + config.get('PATH', 'FLIGHTS_RAW_FOLDER')
raw_tweets_data_path = input_path + config.get('PATH', 'TWEETS_RAW_FOLDER')

1.2 Create Spark Session

In [3]:
def create_spark_session():
    """
    Return a SparkSession, reusing the existing one when there is one.

    The builder is configured with ``spark.jars.packages`` set to
    ``org.apache.hadoop:hadoop-aws:2.7.0`` before the session is obtained.

    Returns:
        spark -- SparkSession object
    """
    session_builder = SparkSession.builder
    session_builder = session_builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return session_builder.getOrCreate()

In [4]:
# Obtain the session once and keep a handle on its SparkContext.
spark = create_spark_session()
sc = spark.sparkContext

1.3 Load flights df

In [5]:
# Read the CSV files under the raw flights folder (recursive lookup enabled),
# treating the first line as a header and letting Spark infer column types.
flights_reader = spark.read.options(
    recursiveFileLookup=True,
    inferSchema=True,
    header=True,
)
flights_df = flights_reader.csv(raw_flight_data_path)

In [None]:
# countDistinct is no longer used in this cell; the import is kept unchanged
# in case other cells rely on the name being in scope.
from pyspark.sql.functions import countDistinct, desc

# Ten most frequent callsigns.
# Rows are counted per callsign with groupBy().count(): a distinct count of the
# grouping key itself is always 1 per group, so it cannot rank anything.
(
    flights_df
    .groupBy("callsign")
    .count()
    .sort(desc("count"))
    .limit(10)
    .toPandas()
)

In [7]:
# Show up to 20 flight rows as a pandas frame for rich display.
flights_preview = flights_df.limit(20)
flights_preview.toPandas()

Unnamed: 0,callsign,number,icao24,registration,typecode,origin,destination,firstseen,lastseen,day,latitude_1,longitude_1,altitude_1,latitude_2,longitude_2,altitude_2
0,ADB3280,,50801d,UR-82009,A124,OMDW,NZAR,2020-09-30 00:08:47+00:00,2020-10-01 07:53:50+00:00,2020-10-01 00:00:00+00:00,24.888084,55.170504,0.0,-36.976273,174.913389,441.96
1,ETH3714,,040159,,,,EBBR,2020-09-30 02:43:27+00:00,2020-10-01 09:03:50+00:00,2020-10-01 00:00:00+00:00,8.278519,76.970727,10363.2,50.904236,4.474034,144.78
2,LAN9570,,e80459,CC-BGM,B789,KMIA,KLAX,2020-09-30 07:04:12+00:00,2020-10-01 16:05:05+00:00,2020-10-01 00:00:00+00:00,25.785965,-80.321147,0.0,33.939011,-118.366587,114.3
3,ABW9514,,424970,VQ-BGZ,B748,,EDDP,2020-09-30 07:06:25+00:00,2020-10-01 01:34:29+00:00,2020-10-01 00:00:00+00:00,35.850204,140.468153,3657.6,51.412628,12.22615,160.02
4,CCA849,,781346,,,WSSS,EGLL,2020-09-30 08:06:51+00:00,2020-10-01 08:29:13+00:00,2020-10-01 00:00:00+00:00,1.306686,103.976751,609.6,51.477722,-0.42817,190.5
5,CSN311,CZ311,78168f,,,,CYYZ,2020-09-30 08:24:25+00:00,2020-10-01 02:40:27+00:00,2020-10-01 00:00:00+00:00,29.797622,104.763794,11277.6,43.68576,-79.647249,
6,SIA38,SQ38,76cdb7,,,WSSS,KLAX,2020-09-30 09:22:16+00:00,2020-10-01 00:33:57+00:00,2020-10-01 00:00:00+00:00,1.305232,103.974373,609.6,33.953751,-118.386037,144.78
7,CPA826,CX826,789245,,,VHHH,CYYZ,2020-09-30 09:35:33+00:00,2020-10-01 00:05:13+00:00,2020-10-01 00:00:00+00:00,22.300369,113.910894,0.0,43.678719,-79.620405,266.7
8,HBAL127,,a210ea,,,,,2020-09-30 09:42:33+00:00,2020-10-01 00:26:10+00:00,2020-10-01 00:00:00+00:00,-8.34947,-74.897493,19202.4,-6.017249,-74.706652,18288.0
9,FDX5030,,ac3fc5,,,YSSY,EDDK,2020-09-30 09:54:58+00:00,2020-10-01 08:51:03+00:00,2020-10-01 00:00:00+00:00,-33.928528,151.171314,0.0,50.880844,7.128642,167.64


In [64]:
# Columns kept for the staging table, renamed to staging names.
# NOTE(review): "trasponder_id" looks like a typo of "transponder_id"; it is
# left as-is because it is the column name written to the output.
flight_column_exprs = [
    "callsign",
    "icao24 as trasponder_id",
    "registration as aircraft_id",
    "typecode as aircraft_type",
    "origin as depart_airport_id",
    "destination as arrival_airport_id",
    "firstseen as depart_at",
    "lastseen as arrival_at",
]

# Keep only flights whose arrival airport is known.
flights_staging = (
    flights_df
    .selectExpr(*flight_column_exprs)
    .filter("arrival_airport_id is not null")
)

Cardinality of sample test 2021-03-12: 6.109.738 rows

1.3.1 Enrich airport info

In [8]:
from pyspark import SparkFiles

# Register the public airports CSV with Spark, then read the copy that
# SparkFiles resolves by file name.
airports_url = "https://ourairports.com/data/airports.csv"
spark.sparkContext.addFile(airports_url)

airports_local_uri = "file://" + SparkFiles.get("airports.csv")
airports_df = spark.read.csv(airports_local_uri, header=True, inferSchema=True)


In [11]:
# Keep only the airport attributes needed downstream; "ident" is exposed as "code".
airport_column_exprs = [
    "id",
    "ident as code",
    "type",
    "name",
    "iso_country",
    "municipality",
]
airports_staging = airports_df.selectExpr(*airport_column_exprs)

Cardinality full dataset test 2021-03-10: 63.078 rows

In [13]:
# Show up to 10 staged airport rows as a pandas frame for rich display.
airports_preview = airports_staging.limit(10)
airports_preview.toPandas()

Unnamed: 0,id,code,type,name,iso_country,municipality
0,6523,00A,heliport,Total Rf Heliport,US,Bensalem
1,323361,00AA,small_airport,Aero B Ranch Airport,US,Leoti
2,6524,00AK,small_airport,Lowell Field,US,Anchor Point
3,6525,00AL,small_airport,Epps Airpark,US,Harvest
4,6526,00AR,closed,Newport Hospital & Clinic Heliport,US,Newport
5,322127,00AS,small_airport,Fulton Airport,US,Alex
6,6527,00AZ,small_airport,Cordes Airport,US,Cordes
7,6528,00CA,small_airport,Goldstone /Gts/ Airport,US,Barstow
8,324424,00CL,small_airport,Williams Ag Airport,US,Biggs
9,322658,00CN,heliport,Kitchen Creek Helibase Heliport,US,Pine Valley


1.4 Load Tweets df

In [67]:
# Read the JSON files under the raw tweets folder (recursive lookup enabled).
# The CSV-only options `header` and `inferSchema` are dropped: they are not
# JSON data source options and only suggested a behaviour that does not exist.
tweets_df = (
    spark.read
    .options(recursiveFileLookup=True)
    .json(raw_tweets_data_path)
)

In [None]:
# Inspect the column names and types of the tweets frame.
tweets_df.dtypes

In [106]:
# Keep only tweets that carry a location, then project the staging columns.
# The filter runs before the select because `location` itself is not part of
# the projection (only its nested `country` field is); filtering afterwards
# would reference a column that is no longer in the frame.
tweets_staging = (
    tweets_df
    .filter(col("location").isNotNull())
    .select(['date', 'keywords', 'location.country', 'tweet_id'])
)

2302853

Cardinality of sample test 2021-03-12 : 2.302.853 rows

2 Load to output (S3)

In [110]:
# Persist the staged tweets as parquet, replacing any previous output.
tweets_output_path = output_path + "tweets.parquet"
tweets_staging.write.parquet(tweets_output_path, mode="overwrite")

In [114]:
# Persist the staged flights as parquet, replacing any previous output.
flights_output_path = output_path + "flights.parquet"
flights_staging.write.parquet(flights_output_path, mode="overwrite")

In [113]:
# Persist the staged airports as parquet, replacing any previous output.
airports_output_path = output_path + "airports.parquet"
airports_staging.write.parquet(airports_output_path, mode="overwrite")

In [116]:
# Read the flights output back and count its rows as a sanity check on the write.
flights_written = spark.read.parquet(output_path + "flights.parquet")
flights_written.count()

6109738

3 Stop spark

In [None]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()