# Data Loading

In [2]:
df = spark.read.format("parquet")\
    .option("header","true")\
    .option("inferschema","true")\
    .load("s3://vitaproject23/cleandata/")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Drop Columns

Our aim is to predict delay at the time of Ticket booking and hence the following columns wont help us in prediction because customer will be unaware of the following data :- 'TAXI_OUT', 'TAXI_IN', 'WHEELS_OFF', 'WHEELS_ON', 'ARR_DELAY', 'DEP_DELAY', 'ACTUAL_ELAPSED_TIME', 'DEP_TIME', 'ARR_TIME'

In [3]:
df = df.drop('FL_DATE','TAXI_OUT','WHEELS_OFF','WHEELS_ON','TAXI_IN','ARR_DELAY','DEP_DELAY','ACTUAL_ELAPSED_TIME','DEP_TIME','ARR_TIME')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
df.columns
print("Number of Columns: ", len(df.columns))
print("Number of Rows:", df.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'MONTH', 'WEEKDAY', 'YEAR', 'FLIGHT_STATUS']

# Data Preprocessing

In [7]:
from pyspark.ml.feature import StringIndexer,VectorAssembler,VectorIndexer,StandardScaler

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# StringIndexer

- Converting Categorical Columns such as 'OP_CARRIER', 'ORIGIN', 'DEST' are converted into Indexed Columns 'OP_CARRIER_I', 'ORIGIN_I', 'DEST_I' using StringIndexer.
- StringIndexer is used to convert categorical columns to numeric.

In [8]:
indexer1 = StringIndexer(inputCol='OP_CARRIER',outputCol='OP_CARRIER_I')
strindexedDF1 = indexer1.fit(df).transform(df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
indexer2 = StringIndexer(inputCol='ORIGIN',outputCol='ORIGIN_I')
strindexedDF2 = indexer2.fit(strindexedDF1).transform(strindexedDF1)

In [None]:
indexer3 = StringIndexer(inputCol='DEST',outputCol='DEST_I')
strindexedDF3 = indexer3.fit(strindexedDF2).transform(strindexedDF2)

In [9]:
strindexedDF3.select('OP_CARRIER','OP_CARRIER_I','ORIGIN','ORIGIN_I','DEST','DEST_I').show(5,False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+------+----+------------+------------+----------------+--------+--------+-----+-------+----+-------------+------------+--------+------+
|OP_CARRIER     |ORIGIN|DEST|CRS_DEP_TIME|CRS_ARR_TIME|CRS_ELAPSED_TIME|AIR_TIME|DISTANCE|MONTH|WEEKDAY|YEAR|FLIGHT_STATUS|OP_CARRIER_I|ORIGIN_I|DEST_I|
+---------------+------+----+------------+------------+----------------+--------+--------+-----+-------+----+-------------+------------+--------+------+
|United Airlines|ORD   |OMA |3           |0           |87.0            |66.0    |416.0   |6    |2      |2011|1            |4.0         |1.0     |60.0  |
|United Airlines|DEN   |IAD |2           |3           |198.0           |171.0   |1452.0  |6    |2      |2011|0            |4.0         |3.0     |27.0  |
|United Airlines|RNO   |DEN |2           |2           |129.0           |99.0    |804.0   |6    |2      |2011|0            |4.0         |66.0    |3.0   |
|United Airlines|LAX   |SFO |1           |1           |86.0            |54.0    |3

In [10]:
df.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['OP_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'MONTH', 'WEEKDAY', 'YEAR', 'FLIGHT_STATUS', 'OP_CARRIER_I', 'ORIGIN_I', 'DEST_I']

# VectorAssembler

- Machine Learning models in Spark expects all features in single column. Therefore VectorAssembler combines all features and gives us vector which can be stored in single column. 
- VectorAssembler expects only Numerical Features, hence we do not take into account 'OP_CARRRIER', 'ORIGIN' and 'DEST'.

In [11]:
# Create a list of all the variables that you want to create feature vectors
# These features are then further used for training model
features_col = ['CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 
                'MONTH', 'WEEKDAY', 'OP_CARRIER_I', 'ORIGIN_I', 'DEST_I']

assembler = VectorAssembler(inputCols= features_col, outputCol= "features")
assembledDF = assembler.transform(strindexedDF3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
assembledDF.select("features").show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------------------------------------+
|features                                         |
+-------------------------------------------------+
|[3.0,0.0,87.0,66.0,416.0,6.0,2.0,4.0,1.0,60.0]   |
|[2.0,3.0,198.0,171.0,1452.0,6.0,2.0,4.0,3.0,27.0]|
|[2.0,2.0,129.0,99.0,804.0,6.0,2.0,4.0,66.0,3.0]  |
|[1.0,1.0,86.0,54.0,337.0,6.0,2.0,4.0,4.0,7.0]    |
|[2.0,3.0,311.0,264.0,2419.0,6.0,2.0,4.0,7.0,27.0]|
+-------------------------------------------------+
only showing top 5 rows

# VectorIndexer

- After VectorAssembler next step is VectorIndexer. 
- Vector Indexer Automatically identifies categorical features from the feature vector (Output Column of VectorAssembler) and then indexes those categorical features inside vector. 
- VectorIndexer let usskip OneHotEncoding stage for encoding categorical features.

In [13]:
vecindexer = VectorIndexer(inputCol= "features", outputCol= "indexed_features")
vecindexedDF = vecindexer.fit(assembledDF).transform(assembledDF)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
vecindexedDF.select("features", "indexed_features").show(20, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------------------------------------------+-------------------------------------------------+
|features                                         |indexed_features                                 |
+-------------------------------------------------+-------------------------------------------------+
|[3.0,0.0,87.0,66.0,416.0,6.0,2.0,4.0,1.0,60.0]   |[3.0,0.0,87.0,66.0,416.0,5.0,1.0,4.0,1.0,60.0]   |
|[2.0,3.0,198.0,171.0,1452.0,6.0,2.0,4.0,3.0,27.0]|[2.0,3.0,198.0,171.0,1452.0,5.0,1.0,4.0,3.0,27.0]|
|[2.0,2.0,129.0,99.0,804.0,6.0,2.0,4.0,66.0,3.0]  |[2.0,2.0,129.0,99.0,804.0,5.0,1.0,4.0,66.0,3.0]  |
|[1.0,1.0,86.0,54.0,337.0,6.0,2.0,4.0,4.0,7.0]    |[1.0,1.0,86.0,54.0,337.0,5.0,1.0,4.0,4.0,7.0]    |
|[2.0,3.0,311.0,264.0,2419.0,6.0,2.0,4.0,7.0,27.0]|[2.0,3.0,311.0,264.0,2419.0,5.0,1.0,4.0,7.0,27.0]|
|[1.0,1.0,324.0,291.0,2288.0,6.0,2.0,4.0,27.0,4.0]|[1.0,1.0,324.0,291.0,2288.0,5.0,1.0,4.0,27.0,4.0]|
|[1.0,1.0,127.0,104.0,651.0,6.0,2.0,4.0,5.0,7.0]  |[1.0,1.0,127.0,104.0,651.0,5.0,

In [15]:
vecindexedDF.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- OP_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- WEEKDAY: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- FLIGHT_STATUS: integer (nullable = true)
 |-- OP_CARRIER_I: double (nullable = false)
 |-- ORIGIN_I: double (nullable = false)
 |-- DEST_I: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- indexed_features: vector (nullable = true)

# StandardScaler

- StandardScaler scales each value in the feature vector such that the mean is 0 and the standard deviation is 1
- It takes parameters:
    - withStd: True by default. Scales the data to unit standard deviation
    - withMean: False by default. Centers the data with mean before scaling

In [16]:
stdscaler = StandardScaler(inputCol= "indexed_features", outputCol= "scaledfeatures")
stdscaledDF = stdscaler.fit(vecindexedDF).transform(vecindexedDF)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
stdscaledDF.select("scaledfeatures" ).show(5, False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaledfeatures                                                                                                                                                                                |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[3.7759086415361205,0.0,1.1849301019673635,0.9341508357077831,0.7000001868483029,1.4752922180737393,0.5104440799032741,0.827719427843689,0.02166320217560367,1.2991170971063506]              |
|[2.517272427690747,3.6297005740292247,2.6967374734429654,2.42029989251562,2.4432698829416726,1.4752922180737393,0.5104440799032741,0.827719427843689,0.06498960652681102,0.5846026936978578]  |
|[2.517272427690747,2.4198003826861

In [18]:
stdscaledDF.write.option("header","true").parquet("s3://project.group23.fan5.flight/Scaled_Data_M/")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Train Test Split

We are going to train the model with all data except YEAR 2018 data because that is used to test the model.

In [27]:
#test data on 2018 
from pyspark.sql.functions import col
trainDF = stdscaledDF.where(col("YEAR")!=2018)
testDF = stdscaledDF.where(col("YEAR")==2018)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
trainDF.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- OP_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- WEEKDAY: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- FLIGHT_STATUS: integer (nullable = true)
 |-- OP_CARRIER_I: double (nullable = false)
 |-- ORIGIN_I: double (nullable = false)
 |-- DEST_I: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- indexed_features: vector (nullable = true)
 |-- scaledfeatures: vector (nullable = true)

In [29]:
# print the count of observations in each set
print("NUmber of Observations in training set = ", trainDF.count())
print("Number of Observations in testing set = ", testDF.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Observations in training set =  53613134
Observations in testing set =  7076405

# Model Building

In [31]:
# import the RandomForestClassifier function from the pyspark.ml.classification package
from pyspark.ml.classification import RandomForestClassifier

# Build the RandomForestClassifier object 'dt' by setting the required parameters
# We will pass the VectorIndexed columns as featureCol for Random Forest. Since they can handle categorical indexes
rf = RandomForestClassifier(featuresCol="scaledfeatures", labelCol="FLIGHT_STATUS" , numTrees=100, maxDepth= 5)


An error was encountered:
Session 2 unexpectedly reached final status 'dead'. See logs:
stdout: 

stderr: 
.
22/03/26 12:59:18 WARN TaskSetManager: Lost task 3.0 in stage 51.0 (TID 287, ip-172-31-77-8.ec2.internal, executor 12): ExecutorLostFailure (executor 12 exited caused by one of the running tasks) Reason: Container from a bad node: container_1648292310456_0003_01_000024 on host: ip-172-31-77-8.ec2.internal. Exit status: 137. Diagnostics: [2022-03-26 12:59:18.552]Container killed on request. Exit code is 137
[2022-03-26 12:59:18.552]Container exited with a non-zero exit code 137. 
[2022-03-26 12:59:18.553]Killed by external signal
.
22/03/26 12:59:18 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 12 for reason Container from a bad node: container_1648292310456_0003_01_000024 on host: ip-172-31-77-8.ec2.internal. Exit status: 137. Diagnostics: [2022-03-26 12:59:18.552]Container killed on request. Exit code is 137
[2022-03-26 12:59:18.552]Conta

In [None]:
# fit the RandomForestClassifier object on the training data
rfmodel = rf.fit(trainDF)

In [None]:
#This RandomForestClassifierModel can be used as a transformer to perform prediction on the testing data
rfpredictonDF = rfmodel.transform(testDF)

In [None]:
rfpredictonDF.select("label","rawPrediction", "probability", "prediction").show(10,False)

In [None]:
from pyspark.ml.evaluation import Multi

In [None]:
# 1. Accuracy
print("Accuracy: ", multievaluator.evaluate(rfpredictonDF, {evaluator.metricName: "accuracy"})) 
# 2. Area under the ROC curve
print('Area under the ROC curve = ', evaluator.evaluate(rfpredictonDF))
# 3. Precision (Positive Predictive Value)
print("Precision = ", multievaluator.evaluate(rfpredictonDF, {evaluator.metricName: "weightedPrecision"}))
# 4. Recall (True Positive Rate)
print("Recall = ", multievaluator.evaluate(rfpredictonDF, {evaluator.metricName: "weightedRecall"}))
# 5. F1 Score (F-measure)
print("F1 Score = ", multievaluator.evaluate(rfpredictonDF, {evaluator.metricName: "f1"}))