# Data Loading


In [7]:
# Load the combined flight records from S3.
# Parquet files carry their own schema, so the CSV-oriented "header" and
# "inferschema" reader options had no effect here and are omitted.
df = spark.read.parquet("s3://combinedfiles/merge/combined.parquet/*")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Peek at the first five rows without truncating wide cell values.
df.show(n=5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|FL_DATE   |OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|2009-01-01|XE        |1204          

In [2]:
# Print the column names, data types and nullability of the loaded frame.
df.printSchema()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: long (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: double (nullable = true)
 |-- WHEELS_ON: double (nullable = true)
 |-- TAXI_IN: double (nullable = true)
 |-- CRS_ARR_TIME: long (nullable = true)
 |-- ARR_TIME: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- CARRIER_DELAY: double (nullable = true)
 |-- WEATHER_DELAY: double (nul

# Data Cleaning

Dropping the `Unnamed: 27` column because it is empty.

In [8]:
# Remove the 'Unnamed: 27' column from the frame.
# NOTE(review): the header displayed after loading lists 27 columns and none
# of them is named 'Unnamed: 27' — confirm this drop still has any effect.
df = df.drop('Unnamed: 27')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# CANCELLED

In [9]:
df.select("CANCELLED").distinct().show(50,truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+
|CANCELLED|
+---------+
|1.0      |
|0.0      |
+---------+

Extracting flights which are not cancelled

In [10]:
from pyspark.sql.functions import col, column

# Keep only the rows whose CANCELLED flag equals 0 (flights that were not cancelled).
df = df.filter(col("CANCELLED") == 0)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Dropping the CANCELLED column because a cancelled flight is not a delayed flight.

In [11]:
# The CANCELLED flag is now constant (0) for every remaining row, so drop it.
df = df.drop("CANCELLED")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Report how many columns remain.
n_columns = len(df.columns)
print("Number of Columns", n_columns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of Columns 26

# CANCELLATION_CODE

In [13]:
# All CANCELLED == 1 rows were removed earlier, so the only value expected
# in CANCELLATION_CODE is NULL (i.e. a single distinct value).
df.select(col("CANCELLATION_CODE")).distinct().count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1

Dropping the CANCELLATION_CODE column: since we removed every row with CANCELLED == 1, it contains no values other than NULL.

In [14]:
# CANCELLATION_CODE holds a single distinct value after filtering, so drop it.
df = df.drop("CANCELLATION_CODE")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Report how many columns remain.
n_columns = len(df.columns)
print("Number of Columns", n_columns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of Columns 25

# DIVERTED

In [16]:
# Keep only the rows whose DIVERTED flag equals 0.
df = df.filter(col("DIVERTED") == 0)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

A DIVERTED flight is one where the plane landed at a different airport than the one scheduled, so we keep only the flights that were not diverted and then drop the DIVERTED column.

In [17]:
# The DIVERTED flag is now constant (0) for every remaining row, so drop it.
df = df.drop("DIVERTED")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Report the frame's dimensions; df.count() triggers a full Spark job.
n_rows = df.count()
n_columns = len(df.columns)
print("Number of Rows: ", n_rows)
print("Number of Columns", n_columns)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of Rows:  60692142
Number of Columns 24

In [19]:
from pyspark.sql.functions import count,when,isnan

# For every column, count the rows whose value is NULL or NaN.
# `when` yields a non-null marker only for such rows, and `count` tallies
# the non-null markers.
df.select(
    *[
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+-------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+
|      0|         0|                0|     0|   0|           0|       0|     4725|       0|         0|        2|      1|           0|       2|     2600|     

# Dropping Some Columns 

Dropping the OP_CARRIER_FL_NUM column because we won't be using the flight number for predictions.

Dropping the CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY and LATE_AIRCRAFT_DELAY columns because they have a lot of missing values, so these columns are not useful for prediction.

The DEP_TIME and ARR_TIME columns cannot support flight-delay prediction because the customer does not know the actual departure and arrival times before take-off.

In [20]:
# Drop the flight number, the five delay-cause columns, and the actual
# departure/arrival times.
df = df.drop(
    "OP_CARRIER_FL_NUM",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "DEP_TIME",
    "ARR_TIME",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Time-Related Columns

Time of day is effectively a categorical feature, and keeping it in its current format would produce too many columns once one-hot encoding is applied. Therefore I will split the day into 4 quarters of 6 hours each.

In [21]:
from pyspark.sql.functions import col, column

# Scale each clock-time column so that its integer part is the quarter of the
# day (0-3), each quarter covering six hours.
# Assumes the times are hhmm numbers (e.g. 1430 = 14:30), which is what
# dividing by 600 implies — TODO confirm against the data dictionary.
# A value of 2400 (midnight written as end-of-day) would otherwise produce a
# fifth bucket (4); `% 2400` folds it into bucket 0 together with 0000.
for time_col in ("CRS_DEP_TIME", "WHEELS_OFF", "WHEELS_ON", "CRS_ARR_TIME"):
    df = df.withColumn(time_col, (col(time_col) % 2400) / 600)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
import pyspark.sql.functions as F

# Truncate the scaled time values to integers so each column holds a bucket index.
for bucket_col in ("CRS_DEP_TIME", "WHEELS_OFF", "WHEELS_ON", "CRS_ARR_TIME"):
    df = df.withColumn(bucket_col, F.col(bucket_col).cast("int"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Feature Engineering from Date Column

In [23]:
from pyspark.sql.functions import month

# Derive the calendar month from the flight date.
df = df.withColumn("MONTH", month(df["FL_DATE"]))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
from pyspark.sql.functions import dayofweek

# Derive the day of the week from the flight date.
df = df.withColumn("WEEKDAY", dayofweek(df["FL_DATE"]))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Ensure FL_DATE is stored as a string column.
df = df.withColumn(
    "FL_DATE",
    F.col("FL_DATE").cast("string"),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
from pyspark.sql.functions import count,when,isnan

# Re-check, per column, how many rows hold NULL or NaN after the
# feature-engineering steps.
df.select(
    *[
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+
|FL_DATE|OP_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_DELAY|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|MONTH|WEEKDAY|
+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+
|      0|         0|     0|   0|           0|     4725|       0|         0|        2|      1|           0|     2600|               1|                  3|       2|       0|    0|      0|
+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+

# Null Value Handling

In [27]:
# Replace missing departure delays with 0; only DEP_DELAY is touched.
df = df.fillna(0, subset=["DEP_DELAY"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Discard every row that still contains a null in any column.
df = df.dropna()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# Verify that no column contains NULL or NaN values any more.
df.select(
    *[
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+
|FL_DATE|OP_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_DELAY|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|MONTH|WEEKDAY|
+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+
|      0|         0|     0|   0|           0|        0|       0|         0|        0|      0|           0|        0|               0|                  0|       0|       0|    0|      0|
+-------+----------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+

# OP_CARRIER 

Changing the airline names to their extended names

In [31]:
# List every distinct carrier code present in the data (up to 50, untruncated).
df.select(col("OP_CARRIER")).distinct().show(n=50, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+
|OP_CARRIER|
+----------+
|AS        |
|AA        |
|B6        |
|YV        |
|US        |
|MQ        |
|9E        |
|EV        |
|OO        |
|FL        |
|WN        |
|NW        |
|F9        |
|HA        |
|XE        |
|YX        |
|CO        |
|VX        |
|UA        |
|NK        |
|DL        |
|OH        |
|G4        |
+----------+

In [32]:
# Lookup table from the two-character carrier code found in OP_CARRIER to the
# airline's full name.
carrier_name = {
    "UA": "United Airlines",
    "AS": "Alaska Airlines",
    "9E": "Endeavor Air",
    "B6": "JetBlue Airways",
    "EV": "ExpressJet",
    "F9": "Frontier Airlines",
    "HA": "Hawaiian Airlines",
    "MQ": "Envoy Air",
    "NK": "Spirit Airlines",
    "OO": "SkyWest Airlines",
    "VX": "Virgin America",
    "WN": "Southwest Airlines",
    "YV": "Mesa Airline",
    "YX": "Republic Airways",
    "AA": "American Airlines",
    "US": "US Airways",
    "FL": "AirTran Airways Corporation",
    "NW": "Northwest Airlines",
    "CO": "Continental Air Lines",
    "XE": "Expressjet Airlines",
    "DL": "Delta Airlines",
    "OH": "Comair Airlines",
    "G4": "Allegiant Airlines",
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
# Swap each carrier code for the airline's full name.
# When `to_replace` is a dict, the separate `value` argument is ignored, so
# the stray positional `1` is removed. `subset` limits the replacement to
# OP_CARRIER; without it the mapping is applied to every string column.
df = df.na.replace(carrier_name, subset=["OP_CARRIER"])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [34]:
# Number of flights per airline after the code-to-name replacement.
df.groupBy(col("OP_CARRIER")).count().show(n=50, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------------+--------+
|OP_CARRIER                 |count   |
+---------------------------+--------+
|United Airlines            |4733770 |
|Continental Air Lines      |732372  |
|Republic Airways           |305251  |
|Comair Airlines            |552941  |
|Mesa Airline               |1043935 |
|Virgin America             |386960  |
|Expressjet Airlines        |1074922 |
|ExpressJet                 |4577296 |
|SkyWest Airlines           |6136768 |
|Frontier Airlines          |890983  |
|JetBlue Airways            |2446659 |
|Allegiant Airlines         |95192   |
|Northwest Airlines         |290097  |
|Endeavor Air               |1324076 |
|American Airlines          |6566001 |
|Delta Airlines             |7792523 |
|Alaska Airlines            |1653104 |
|Spirit Airlines            |576161  |
|Hawaiian Airlines          |741090  |
|AirTran Airways Corporation|1159982 |
|Envoy Air                  |3055874 |
|US Airways                 |2624103 |
|Southwest Airlines      

# Target Variable

Creating a target variable based on ARR_DELAY.
If ARR_DELAY is less than or equal to 0, then FLIGHT_STATUS is 0 (Not Delayed); if ARR_DELAY is greater than 0, then FLIGHT_STATUS is 1 (Delayed).

In [40]:
# Binary target: 0 when ARR_DELAY <= 0 (on time or early), 1 in every other case.
df = df.withColumn(
    "FLIGHT_STATUS",
    when(df["ARR_DELAY"] <= 0, 0).otherwise(1),
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [41]:
# Class balance of the target variable.
df.groupBy(col("FLIGHT_STATUS")).count().show(n=50, truncate=False)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+--------+
|FLIGHT_STATUS|count   |
+-------------+--------+
|1            |22944477|
|0            |37745062|
+-------------+--------+

In [42]:
# Peek at the first five cleaned rows without truncating cell values.
df.show(n=5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-------------------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+-------------+
|FL_DATE   |OP_CARRIER         |ORIGIN|DEST|CRS_DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_DELAY|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|MONTH|WEEKDAY|FLIGHT_STATUS|
+----------+-------------------+------+----+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-----+-------+-------------+
|2009-01-01|Expressjet Airlines|DCA   |EWR |1           |-2.0     |18.0    |1         |1        |8.0    |2           |4.0      |62.0            |68.0               |42.0    |199.0   |1    |5      |1            |
|2009-01-01|Expressjet Airlines|EWR   |IAD |2           |-1.0     |28.0    |2         |2        |4.0    |2           |-8.0     |82.0            |75.0   

In [43]:
# Report the final dimensions; df.count() triggers a full Spark job.
n_rows = df.count()
n_columns = len(df.columns)
print("Number of Rows: ", n_rows)
print("Number of Columns", n_columns)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of Rows:  60689539
Number of Columns 19

In [46]:
# Persist the cleaned dataset to S3 as Parquet.
# Parquet stores the schema inside the files, so the CSV-oriented "header"
# option had no effect and is omitted.
# NOTE(review): no save mode is set, so the writer's default applies; if that
# default rejects an existing target path, re-running this cell will need a
# fresh path or an explicit .mode(...) — confirm the intended behaviour.
df.write.parquet("s3://combinedfiles/Data_Cleaning/")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…