In [None]:
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import expr
from pyspark.ml.stat import Summarizer
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from helpers.helper_functions import translate_to_file_string

In [None]:
# Location of the flight CSV, relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it turns the
# relative location into a path/URI Spark can read — confirm in helpers.helper_functions.
flight_csv_location = "../data/Flight_Delay_Jan_2020_ontime.csv"
inputFile = translate_to_file_string(flight_csv_location)

In [None]:
# Obtain the Spark session for this notebook: getOrCreate() hands back an already
# running session if there is one, otherwise it starts a new one under the given app name.
session_builder = SparkSession.builder.appName("FlightDataStatistics")
spark = session_builder.getOrCreate()

In [None]:
# Load the flight CSV (header row, inferred column types, comma-separated) and add a
# boolean-typed copy of each indicator column next to the original column.
# The chain is wrapped in parentheses so no backslash continuations are needed; a
# trailing backslash would silently pull the following line into this statement.
pysparkDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv(inputFile)
    .withColumn("DIVERTED_BOOL", expr("DIVERTED").cast(BooleanType()))
    .withColumn("CANCELLED_BOOL", expr("CANCELLED").cast(BooleanType()))
    .withColumn("DEP_DEL15_BOOL", expr("DEP_DEL15").cast(BooleanType()))
    .withColumn("ARR_DEL15_BOOL", expr("ARR_DEL15").cast(BooleanType()))
)

pysparkDF.printSchema()

In [None]:
# DATA UNDERSTANDING!
# Is OP_CARRIER_FL_NUM merely a running id for flights, or does it encode a specific trip (e.g. Istanbul -> New York)?
# Counting the rows per flight number shows how often each number recurs.
flights_per_flight_num = pysparkDF.groupBy('OP_CARRIER_FL_NUM').count()
flights_per_flight_num.show()

In [None]:
# DATA UNDERSTANDING!
# A tail number is the identification number painted on an aircraft, frequently on the tail.
# Count the flights recorded for each plane; the grouping is defined once and reused below.
flights_per_plane = pysparkDF.groupBy('TAIL_NUM').count()
flights_per_plane.show()

# Average number of flights per plane over the period covered by the data set.
# NOTE(review): the input file name mentions Jan 2020, so this is presumably an average
# per month rather than per year — confirm.
flights_per_plane.agg(F.mean('count')).show()

In [None]:
# DATA UNDERSTANDING!
# Check whether ORIGIN_AIRPORT_ID and ORIGIN are a 1:1 mapping of each other.
# The mapping is 1:1 only when the number of distinct (ORIGIN_AIRPORT_ID, ORIGIN) pairs
# equals the number of distinct values of BOTH columns. Comparing against one column
# alone would also accept a many-to-one mapping.
origin_id_name_pairs = pysparkDF.select('ORIGIN_AIRPORT_ID', 'ORIGIN').distinct()
origin_id_name_pair_count = origin_id_name_pairs.count()
origin_id_count = origin_id_name_pairs.select('ORIGIN_AIRPORT_ID').distinct().count()
origin_name_count = origin_id_name_pairs.select('ORIGIN').distinct().count()
origin_id_name_pair_count == origin_id_count == origin_name_count

# -> True means ORIGIN_AIRPORT_ID is a plain index of ORIGIN

In [None]:
# DATA UNDERSTANDING!
# Check whether ORIGIN_AIRPORT_ID and ORIGIN_AIRPORT_SEQ_ID are a 1:1 mapping of each other.
# The mapping is 1:1 only when the number of distinct (ORIGIN_AIRPORT_ID, ORIGIN_AIRPORT_SEQ_ID)
# pairs equals the number of distinct values of BOTH columns. Comparing against one column
# alone would also accept a many-to-one mapping.
origin_id_seq_pairs = pysparkDF.select('ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID').distinct()
origin_id_seq_pair_count = origin_id_seq_pairs.count()
origin_airport_id_count = origin_id_seq_pairs.select('ORIGIN_AIRPORT_ID').distinct().count()
origin_airport_seq_id_count = origin_id_seq_pairs.select('ORIGIN_AIRPORT_SEQ_ID').distinct().count()
origin_id_seq_pair_count == origin_airport_id_count == origin_airport_seq_id_count

# -> True means ORIGIN_AIRPORT_ID is a 1:1 mapping to ORIGIN_AIRPORT_SEQ_ID

In [None]:
# DATA UNDERSTANDING!
# Check whether DEST_AIRPORT_ID and DEST are a 1:1 mapping of each other.
# The mapping is 1:1 only when the number of distinct (DEST_AIRPORT_ID, DEST) pairs
# equals the number of distinct values of BOTH columns. Comparing against one column
# alone would also accept a many-to-one mapping.
dest_id_name_pairs = pysparkDF.select('DEST_AIRPORT_ID', 'DEST').distinct()
dest_id_name_pair_count = dest_id_name_pairs.count()
dest_id_count = dest_id_name_pairs.select('DEST_AIRPORT_ID').distinct().count()
dest_name_count = dest_id_name_pairs.select('DEST').distinct().count()
dest_id_name_pair_count == dest_id_count == dest_name_count

# -> True means DEST_AIRPORT_ID is a plain index of DEST

In [None]:
# DATA UNDERSTANDING!
# Check whether DEST_AIRPORT_ID and DEST_AIRPORT_SEQ_ID are a 1:1 mapping of each other.
# The mapping is 1:1 only when the number of distinct (DEST_AIRPORT_ID, DEST_AIRPORT_SEQ_ID)
# pairs equals the number of distinct values of BOTH columns. Comparing against one column
# alone would also accept a many-to-one mapping.
dest_id_seq_pairs = pysparkDF.select('DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID').distinct()
dest_id_seq_pair_count = dest_id_seq_pairs.count()
dest_airport_id_count = dest_id_seq_pairs.select('DEST_AIRPORT_ID').distinct().count()
dest_airport_seq_id_count = dest_id_seq_pairs.select('DEST_AIRPORT_SEQ_ID').distinct().count()
dest_id_seq_pair_count == dest_airport_id_count == dest_airport_seq_id_count

# -> True means DEST_AIRPORT_ID is a 1:1 mapping to DEST_AIRPORT_SEQ_ID

In [None]:
# DATA UNDERSTANDING!
# Check whether OP_UNIQUE_CARRIER and OP_CARRIER are a 1:1 mapping of each other.
# The mapping is 1:1 only when the number of distinct (OP_UNIQUE_CARRIER, OP_CARRIER) pairs
# equals the number of distinct values of BOTH columns. Comparing against one column
# alone would also accept a many-to-one mapping.
carrier_pairs = pysparkDF.select('OP_UNIQUE_CARRIER', 'OP_CARRIER').distinct()
carrier_pair_count = carrier_pairs.count()
unique_carrier_count = carrier_pairs.select('OP_UNIQUE_CARRIER').distinct().count()
carrier_count = carrier_pairs.select('OP_CARRIER').distinct().count()
carrier_pair_count == unique_carrier_count == carrier_count

# -> True means OP_UNIQUE_CARRIER is a 1:1 mapping to OP_CARRIER

In [None]:
# DATA UNDERSTANDING!
# Check whether TAIL_NUM and OP_CARRIER_FL_NUM are a 1:1 mapping of each other.
# Both directions are counted so the result also tells WHICH side breaks the mapping:
#   plane_flight_pair_count > plane_count       -> at least one plane flies several flight numbers
#   plane_flight_pair_count > flight_num_count  -> at least one flight number is flown by several planes
plane_flight_pairs = pysparkDF.select('TAIL_NUM', 'OP_CARRIER_FL_NUM').distinct()
plane_flight_pair_count = plane_flight_pairs.count()
plane_count = plane_flight_pairs.select('TAIL_NUM').distinct().count()
flight_num_count = plane_flight_pairs.select('OP_CARRIER_FL_NUM').distinct().count()
plane_flight_pair_count == plane_count == flight_num_count

# -> False means TAIL_NUM is not a 1:1 mapping to OP_CARRIER_FL_NUM; compare the three counts
#    above to see whether planes fly multiple routes, routes use multiple planes, or both

### Remove faulty features
Bei der Spalte "_c21" handelt es sich um eine leere Spalte. Die Spalte enthält keine Daten und kann somit entfernt werden.

In [None]:
# Remove the "_c21" column and print the schema that remains.
# DataFrame.drop ignores column names that are not present, so re-running this cell is harmless.
empty_column = '_c21'
pysparkDF = pysparkDF.drop(empty_column)
pysparkDF.printSchema()

### Remove records containing NULL values
Der Datensatz enthält Felder mit NULL-Werten. Zeilen mit solchen Feldern werden für die Auswertung entfernt.

In [None]:
# Keep only the rows that have no NULL in any column, then report how many rows were dropped.
pysparkDF_nonull = pysparkDF.dropna()
rows_before_dropna = pysparkDF.count()
rows_after_dropna = pysparkDF_nonull.count()
f"Removed {rows_before_dropna-rows_after_dropna} records containing NULL values"

In [None]:
# NOTE(review): the session shutdown below is commented out, so the Spark session stays
# alive after this cell runs — confirm this is intentional (e.g. later cells still need it).
#spark.stop()