# Data Pre Processing and Cleaning

In [1]:
from pyspark.sql.types import *

In [2]:
#create spark session
spark = SparkSession.builder.appName('Data Preprocessing and Cleaning').config('spark.some.config.option','some-value').getOrCreate()

In [3]:
#Create Schema
flightSchema = StructType([
StructField("DayofMonth",IntegerType(),False),
StructField("DayofWeek",IntegerType(),False),
StructField("Carrier",StringType(),False),
StructField("OrginAirportID",IntegerType(),False),
StructField("DestinationAirportID",IntegerType(),False),
StructField("DepDelay",IntegerType(),False),
StructField("ArrivalDelay",IntegerType(),False),
])


In [4]:
#Load the data and check the resut of DF

flights = spark.read.csv("dataset/raw-flight-data.csv",schema=flightSchema,header=True)
flights.show(3)

+----------+---------+-------+--------------+--------------------+--------+------------+
|DayofMonth|DayofWeek|Carrier|OrginAirportID|DestinationAirportID|DepDelay|ArrivalDelay|
+----------+---------+-------+--------------+--------------------+--------+------------+
|        19|        5|     DL|         11433|               13303|      -3|           1|
|        19|        5|     DL|         14869|               12478|       0|          -8|
|        19|        5|     DL|         14057|               14869|      -4|         -15|
+----------+---------+-------+--------------+--------------------+--------+------------+
only showing top 3 rows



In [5]:
#Create the airport schema

airportSchema = StructType([
StructField("airport_id",IntegerType(),False),
StructField("city",StringType(),False),
StructField("state",StringType(),False),
StructField("name",StringType(),False)
])


In [6]:
#load the data and the display the result

airports = spark.read.csv("dataset/airports.csv",header=True,inferSchema=True)
airports.show(5)


+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
+----------+-----------+-----+--------------------+
only showing top 5 rows



In [7]:
#Merge two dataframe (flight & Airport) and show how many flights from each city

flightByOrgin = flights.join(airports,flights.OrginAirportID == airports.airport_id).groupBy("City").count()

flightByOrgin.show(5)


+--------------+-----+
|          City|count|
+--------------+-----+
|       Phoenix|13590|
|         Omaha| 2689|
|Raleigh/Durham| 5404|
|     Anchorage|  821|
|        Dallas| 3051|
+--------------+-----+
only showing top 5 rows



# Handle Duplicated Data
### Drop duplicated data and calculate duplicate data count

In [8]:
#Count the number of original data rows

n1 = flights.count()

print("Number of original data rows :",n1)

#count the number of data rows after deleting duplicate data

n2 = flights.dropDuplicates().count()

print("Number of data after deleting duplicate data :",n2)

n3 = n1-n2

print("Number of Duplicate data: ", n3)

Number of original data rows : 487214
Number of data after deleting duplicate data : 483190
Number of Duplicate data:  4024


# Handle missing Data                                                                                      


In [9]:
#Delete row if there at least one column missing data
#use how = "all" for all columns missing

flightNoMissingValue = flights.dropDuplicates().dropna(how='any',subset=["ArrivalDelay","DepDelay"]) 

numberOfMissingValueAny = n1 -flightNoMissingValue.count()

print("Number of missing value rows :",numberOfMissingValueAny)



Number of missing value rows : 9104


# Fill the missing data using mean value of each corresponding column data

In [10]:
#Take Mean Value

meanArrDelay = flights.groupBy().avg("ArrivalDelay").take(1)[0][0]

print ("Mean Arrival Delay",meanArrDelay)

meanDepDelay = flights.groupBy().avg("DepDelay").take(1)[0][0]

print("Mean of Depart Delay",meanDepDelay)


Mean Arrival Delay 6.958720463349381
Mean of Depart Delay 10.67275056035863


In [11]:
#Drop duplicated data and fill missing data with mean value

flightsCleanData = flights.fillna({'ArrivalDelay':meanArrDelay,'DepDelay':meanDepDelay})

#test result

flights.groupBy().avg("ArrivalDelay").show()

+-----------------+
|avg(ArrivalDelay)|
+-----------------+
|6.958720463349381|
+-----------------+



In [12]:
#Explore the statistics of the data

flightsCleanData.describe('ArrivalDelay','DepDelay').show()


+-------+------------------+------------------+
|summary|      ArrivalDelay|          DepDelay|
+-------+------------------+------------------+
|  count|            487214|            487214|
|   mean| 6.945851309691429|10.664090112353094|
| stddev|40.460616776738185| 37.70614537992643|
|    min|               -75|               -60|
|    max|              1440|              1425|
+-------+------------------+------------------+



In [13]:
#Finding the correlation between two variables to know wheather the variable is related each other or not

correlationData = flightsCleanData.corr('ArrivalDelay','DepDelay' )

print('Correlation between Depart Delay and arrival delay: ',correlationData)

Correlation between Depart Delay and arrival delay:  0.9417700257369398
