## 1. Upload raw data files into S3

In [None]:
!pip install pyspark

In [None]:
!pip install boto3

1.1 Imports and Configs 

In [21]:
import pandas as pd

import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import udf, col, concat_ws
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, to_timestamp, to_date

In [2]:
# Load the project settings (AWS credentials and data locations).
config = configparser.ConfigParser()
# Open the file explicitly: read_file() fails right away with FileNotFoundError when
# the config is missing, whereas config.read() silently skips unreadable files and the
# first config.get() below would then raise a less helpful NoSectionError.
with open('configs/global.cfg') as config_file:
    config.read_file(config_file)

# AWS credentials from the [AWS] section.
# NOTE(review): KEY and SECRET are not referenced elsewhere in the visible notebook.
KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS','AWS_SECRET_ACCESS_KEY')

# Base input/output locations from the [PATH] section.
input_path = config.get('PATH', 'INPUT_DATA_FOLDER')
output_path = config.get('PATH', 'OUTPUT_DATA_FOLDER')

# Raw data folders are plain string concatenations of the base input path and the
# configured sub-folder, so the config values must supply any path separator.
raw_flight_data_path = input_path + config.get('PATH', 'FLIGHTS_RAW_FOLDER')
raw_tweets_data_path = input_path + config.get('PATH', 'TWEETS_RAW_FOLDER')

1.2 Create Spark Session

In [3]:
def create_spark_session():
    """
    Build a SparkSession, or return the existing one if a session is already active.

    The builder requests the hadoop-aws 2.7.0 package through spark.jars.packages
    and sets dfs.client.read.shortcircuit.skip.checksum to "true" before calling
    getOrCreate().

    Returns:
        SparkSession -- the created or retrieved session
    """
    session_builder = SparkSession.builder
    session_builder = session_builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    session_builder = session_builder.config(
        "dfs.client.read.shortcircuit.skip.checksum", "true"
    )
    return session_builder.getOrCreate()

In [4]:
# Create (or reuse) the Spark session used by the cells below.
spark = create_spark_session()

# Handle on the session's underlying SparkContext.
sc = spark.sparkContext


1.3 Load flights df

In [None]:
# Read the CSV files under the raw flights folder with recursive file lookup enabled,
# using the first line as the header and letting Spark infer the column types.
flights_reader = spark.read.options(recursiveFileLookup=True, inferSchema=True, header=True)
flights_df = flights_reader.csv(raw_flight_data_path)

In [None]:
from pyspark.sql.functions import countDistinct, desc

# Ten most frequent callsigns. A per-group row count is required here: a distinct
# count of the grouping column itself is 1 for every non-null callsign, which would
# make the ranking meaningless.
flights_df.groupBy("callsign")\
    .count()\
    .sort( desc("count") )\
    .limit(10).toPandas()

In [None]:
# Preview the first 20 raw flight rows as a pandas DataFrame.
flights_df.limit(20).toPandas()

In [None]:
# Raw-column -> staging-column mapping, written as Spark SQL select expressions.
# NOTE(review): "trasponder_id" looks like a misspelling of "transponder_id"; it is the
# output column name, so confirm downstream usage before renaming.
flight_select_exprs = [
    "callsign",
    "icao24 as trasponder_id",
    "registration as aircraft_id",
    "typecode as aircraft_type",
    "origin as depart_airport_id",
    "destination as arrival_airport_id",
    "firstseen as depart_at",
    "lastseen as arrival_at",
]

# Rows without an arrival airport are dropped from the staging frame.
flights_staging = flights_df.selectExpr(*flight_select_exprs)\
    .filter("arrival_airport_id is not null")

In [None]:
# Preview the first 10 staged rows as a pandas DataFrame (head() with no argument
# would return only the first Row).
flights_staging.limit(10).toPandas()

Cardinality of sample test 2021-03-12: 6.109.738 rows

1.3.1 Enrich airport info

In [None]:
from pyspark import SparkFiles
# Register the OurAirports CSV with the SparkContext, then read the copy that
# SparkFiles resolves for it through a file:// URI.
spark.sparkContext.addFile("https://ourairports.com/data/airports.csv")

airports_local_uri = "file://" + SparkFiles.get("airports.csv")
airports_df = spark.read.csv(airports_local_uri, header=True, inferSchema=True)


In [None]:
# Project the airport columns kept for staging; "ident" is exposed as "code".
airports_staging = airports_df.selectExpr(
    "id",
    "ident as code",
    "type",
    "name",
    "iso_country",
    "municipality",
)

Cardinality of full dataset test 2021-03-10: 63.078 rows

In [None]:
# Preview the first 10 staged airport rows as a pandas DataFrame.
airports_staging.limit(10).toPandas()

1.4 Load Tweets df

In [5]:
# Read the JSON files under the raw tweets folder with recursive file lookup enabled.
# NOTE(review): inferSchema/header look like CSV-reader options — confirm whether they
# have any effect on .json().
tweets_reader = spark.read.options(recursiveFileLookup=True, inferSchema=True, header=True)
tweets_df = tweets_reader.json(raw_tweets_data_path)

In [26]:
# Build the tweets staging frame.
# The null check on "location" runs BEFORE the projection: once the select has run,
# "location" is no longer an output column, so filtering afterwards only works through
# Spark's implicit resolution of missing columns.
tweets_staging = (
    tweets_df
    .filter(col("location").isNotNull())
    # "location.country" is a nested field; it comes out as a column named "country".
    .select("date", "keywords", "location.country", "tweet_id")
    # Collapse the keywords into one comma-separated string.
    # NOTE(review): presumably "keywords" is an array of strings in the raw JSON — confirm.
    .withColumn("keywords", concat_ws(",", col("keywords")))
)

In [29]:
# Show the (column name, type) pairs of the staged tweets frame.
tweets_staging.dtypes

[('date', 'string'),
 ('keywords', 'string'),
 ('country', 'string'),
 ('tweet_id', 'bigint')]

Cardinality of sample test 2021-03-12 : 2.302.853 rows

2 Load to output (S3)

In [30]:
# Write the staged tweets as Parquet, overwriting any existing output at this path.
tweets_staging.write.parquet(output_path + "/tweets.parquet", mode="overwrite")

In [None]:
# Write the staged flights as Parquet, overwriting any existing output at this path.
flights_staging.write.parquet(output_path + "/flights.parquet", mode="overwrite")

In [None]:
# Write the staged airports as Parquet, overwriting any existing output at this path.
airports_staging.write.parquet(output_path + "/airports.parquet", mode="overwrite")

In [None]:
# Sanity check: read the flights Parquet output back and count its rows. The path
# includes the "/" separator so it matches the location the flights frame is written to.
spark.read.parquet( output_path + "/flights.parquet" ).count()

2.1 Remove files with the .crc extension

In [31]:
def remove_crc_files(parquet_directory):
    """
    Delete every entry directly inside `parquet_directory` whose name ends with ".crc".

    Only the top level of the directory is inspected (no recursion), and the
    directory is accessed through the local filesystem via `os`.

    Arguments:
        parquet_directory -- path of the directory to clean

    Returns:
        None
    """
    crc_paths = [
        os.path.join(parquet_directory, entry)
        for entry in os.listdir(parquet_directory)
        if entry.endswith(".crc")
    ]
    for crc_path in crc_paths:
        os.remove(crc_path)

# Strip the .crc files from the flights and tweets outputs, in that order.
# NOTE(review): airports.parquet is also written by this notebook but is not cleaned
# here — confirm whether that is intentional.
for parquet_name in ("/flights.parquet", "/tweets.parquet"):
    remove_crc_files(output_path + parquet_name)

3 Stop spark

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()