# Data Loading

In [16]:
# Read the cleaned flight data from S3.
# Parquet files store their own schema, so the CSV-only "header" and
# "inferSchema" reader options have no effect here and are omitted.
df = spark.read.parquet("s3://group23project/data_cleaning/")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Drop Columns

- Our aim is to predict delay at the time of ticket booking, so the following columns won't help with prediction because the customer will not know these values at that time:
- 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_DELAY', 'DEP_DELAY', 'ACTUAL_ELAPSED_TIME', 'DEP_TIME', 'ARR_TIME', 'AIR_TIME', 'DISTANCE' (the code below also drops 'FL_DATE').

In [17]:
# Columns to remove before modelling, kept in one list and unpacked into drop().
columns_to_drop = [
    'FL_DATE',
    'TAXI_IN', 'TAXI_OUT',
    'WHEELS_ON', 'WHEELS_OFF',
    'ARR_DELAY', 'DEP_DELAY',
    'ACTUAL_ELAPSED_TIME',
    'DEP_TIME', 'ARR_TIME',
    'AIR_TIME', 'DISTANCE',
]
df = df.drop(*columns_to_drop)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Report which columns remain after the drop, how many there are, and the row count.
remaining_columns = df.columns
print(remaining_columns)
print("Number of Columns: ", len(remaining_columns))
print("Number of Rows:", df.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'MONTH', 'WEEKDAY', 'YEAR', 'FLIGHT_STATUS']
Number of Columns:  10
Number of Rows: 60431020

# Data Preprocessing

In [19]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,MinMaxScaler

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# StringIndexer

- Categorical columns such as 'OP_CARRIER', 'ORIGIN' and 'DEST' are converted into the indexed columns 'OP_CARRIER_I', 'ORIGIN_I' and 'DEST_I' using StringIndexer.
- StringIndexer is used to convert categorical columns to numeric.

In [20]:
# Fit a StringIndexer on OP_CARRIER using df, then add the result as OP_CARRIER_I.
indexer1 = StringIndexer(inputCol='OP_CARRIER', outputCol='OP_CARRIER_I')
carrier_index_model = indexer1.fit(df)
strindexedDF1 = carrier_index_model.transform(df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Fit a StringIndexer on ORIGIN using the carrier-indexed frame, then add ORIGIN_I.
indexer2 = StringIndexer(inputCol='ORIGIN', outputCol='ORIGIN_I')
origin_index_model = indexer2.fit(strindexedDF1)
strindexedDF2 = origin_index_model.transform(strindexedDF1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Fit a StringIndexer on DEST using the frame that already has ORIGIN_I, then add DEST_I.
indexer3 = StringIndexer(inputCol='DEST', outputCol='DEST_I')
dest_index_model = indexer3.fit(strindexedDF2)
strindexedDF3 = dest_index_model.transform(strindexedDF2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
# Show each categorical column next to its index column (first 5 rows, untruncated).
indexed_preview_cols = ['OP_CARRIER', 'OP_CARRIER_I', 'ORIGIN', 'ORIGIN_I', 'DEST', 'DEST_I']
strindexedDF3.select(*indexed_preview_cols).show(n=5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------+------------+------+--------+----+------+
|OP_CARRIER    |OP_CARRIER_I|ORIGIN|ORIGIN_I|DEST|DEST_I|
+--------------+------------+------+--------+----+------+
|Delta Airlines|1.0         |SLC   |15.0    |PDX |30.0  |
|Delta Airlines|1.0         |JFK   |18.0    |MIA |24.0  |
|Delta Airlines|1.0         |ATL   |0.0     |MIA |24.0  |
|Delta Airlines|1.0         |TPA   |26.0    |DTW |10.0  |
|Delta Airlines|1.0         |ATL   |0.0     |GPT |131.0 |
+--------------+------------+------+--------+----+------+
only showing top 5 rows

In [24]:
# List the columns of df, the frame from before indexing.
# The *_I index columns are not on df; they were added to strindexedDF1..strindexedDF3.
df.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'MONTH', 'WEEKDAY', 'YEAR', 'FLIGHT_STATUS']

# VectorAssembler

- Machine learning models in Spark expect all features in a single column. VectorAssembler therefore combines all features into one vector, which can be stored in a single column.
- VectorAssembler expects only numerical features, hence we do not use the raw string columns 'OP_CARRIER', 'ORIGIN' and 'DEST'; their indexed versions 'OP_CARRIER_I', 'ORIGIN_I' and 'DEST_I' are used instead.

In [25]:
# Input columns for the feature vector: the scheduled-time columns, the
# calendar columns and the three StringIndexer outputs.
features_col = [
    'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME',
    'MONTH', 'WEEKDAY',
    'OP_CARRIER_I', 'ORIGIN_I', 'DEST_I',
]

# Combine those columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=features_col, outputCol="features")
assembledDF = assembler.transform(strindexedDF3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
assembledDF.select("features").show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------------------------+
|features                             |
+-------------------------------------+
|[1.0,1.0,133.0,2.0,4.0,1.0,15.0,30.0]|
|[1.0,1.0,214.0,2.0,4.0,1.0,18.0,24.0]|
|[3.0,3.0,113.0,2.0,4.0,1.0,0.0,24.0] |
|[1.0,1.0,160.0,2.0,4.0,1.0,26.0,10.0]|
|[3.0,3.0,90.0,2.0,4.0,1.0,0.0,131.0] |
+-------------------------------------+
only showing top 5 rows

# MinMaxScaler

- Rescale each feature individually to a common range [min, max] linearly using column summary statistics, which is also known as min-max normalization or Rescaling.


In [27]:
# Configure the scaler to read the "features" vector and write "scaledfeatures".
mmxscaler = MinMaxScaler(inputCol="features", outputCol="scaledfeatures")

# Fit the scaler on the assembled frame, apply it to the same frame, then
# preview the first 5 scaled vectors without truncation.
mmxscaler_model = mmxscaler.fit(assembledDF)
mmxscaledDF = mmxscaler_model.transform(assembledDF)
mmxscaledDF.select("scaledfeatures").show(n=5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------------------------------------------------------------------------------------------------------------------+
|scaledfeatures                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------+
|[0.3333333333333333,0.25,0.2839657282741738,0.09090909090909091,0.5,0.045454545454545456,0.03978779840848806,0.0797872340425532]   |
|[0.3333333333333333,0.25,0.38310893512851896,0.09090909090909091,0.5,0.045454545454545456,0.04774535809018567,0.06382978723404255] |
|[1.0,0.75,0.2594859241126071,0.09090909090909091,0.5,0.045454545454545456,0.0,0.06382978723404255]                                 |
|[0.3333333333333333,0.25,0.31701346389228885,0.09090909090909091,0.5,0.045454545454545456,0.06896551724137931,0.026595744680851064]|
|[1.0,0.75,0.23133414932680538,0.09090909090909091,0.5,0.04545

In [28]:
# Preview the first 5 scaled feature vectors without truncation.
scaled_preview = mmxscaledDF.select("scaledfeatures")
scaled_preview.show(n=5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------------------------------------------------------------------------------------------------------------------+
|scaledfeatures                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------+
|[0.3333333333333333,0.25,0.2839657282741738,0.09090909090909091,0.5,0.045454545454545456,0.03978779840848806,0.0797872340425532]   |
|[0.3333333333333333,0.25,0.38310893512851896,0.09090909090909091,0.5,0.045454545454545456,0.04774535809018567,0.06382978723404255] |
|[1.0,0.75,0.2594859241126071,0.09090909090909091,0.5,0.045454545454545456,0.0,0.06382978723404255]                                 |
|[0.3333333333333333,0.25,0.31701346389228885,0.09090909090909091,0.5,0.045454545454545456,0.06896551724137931,0.026595744680851064]|
|[1.0,0.75,0.23133414932680538,0.09090909090909091,0.5,0.04545

In [29]:
# Print the schema of the scaled frame: column names, types and nullability.
mmxscaledDF.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- OP_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- WEEKDAY: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- FLIGHT_STATUS: integer (nullable = true)
 |-- OP_CARRIER_I: double (nullable = false)
 |-- ORIGIN_I: double (nullable = false)
 |-- DEST_I: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaledfeatures: vector (nullable = true)

In [30]:
# Persist the processed DataFrame to S3 as Parquet.
# - "header" is a CSV-only writer option with no effect on Parquet, so it is omitted.
# - mode("overwrite") replaces any earlier output at this path. Without it the
#   default save mode raises an error when the path already exists, which makes
#   the notebook fail on every run after the first.
mmxscaledDF.write.mode("overwrite").parquet("s3://group23project/data_processing/")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…