# Prediction

In [8]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
import datetime
from pyspark.sql import SparkSession
from pyspark import SparkFiles
import pyspark.sql.functions as F
import pyspark.sql.types as T 
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [9]:
# Create a Spark session now to access functionalities of Spark
spark = (SparkSession.builder.appName('Preprocessing')
.master("spark://10.131.7.106:7077")
.config("spark.driver.memory", "20G")
.config("spark.executor.cores", "2")
.config("spark.driver.cores", "2")
.config("spark.executor.pyspark.memory", "30G")
.config("spark.dynamicAllocation.enabled", "true")
.getOrCreate() 
)

spark

In [10]:
df = spark.read.parquet("data/Clean_Data/clean_all")
# df = spark.read.parquet("data/Clean_Data/clean_0.1")
# df = spark.read.parquet("data/Clean_Data/clean_0.01")

## Preprocesing for ML

In [11]:
# Drop the cols which indirectly indicate if a flight is cancelled or not (apart from the column CANCELLED)
# Most of those cols contain null values, if the flight is cancelled

df = df.drop("CARRIER_DELAY", 
                        "WEATHER_DELAY",
                        "NAS_DELAY",
                        "SECURITY_DELAY",
                        "LATE_AIRCRAFT_DELAY",
                        "CANCELLATION_CODE",
                        "DEP_TIME",
                        "DEP_DELAY",
                        "TAXI_OUT",
                        "WHEELS_OFF",
                        "WHEELS_ON",
                        "TAXI_IN",
                        "ARR_TIME",
                        "ARR_DELAY",
                        "ACTUAL_ELAPSED_TIME", 
                        "AIR_TIME")

# classify_df = classify_df.withColumn("FL_DATE", F.unix_timestamp("FL_DATE"))

In [12]:
# # Take a subset: either balanced (with subsampling) or unbalanced
# # we take a subset, because of memory limitations

# # select subsample of positive samples - 10%
# pos_df = classify_df.filter(F.col('CANCELLED').isin(1)).sample(fraction=0.1)
# # select an equal amount of negative samples (number of neg samples == number of pos samples)
# neg_df = classify_df.filter(F.col('CANCELLED').isin(0)).orderBy(F.rand()).limit(pos_df.count())


# # Combine pos_df and neg_df - 171146 rows
# classify_df = pos_df.union(neg_df).sample(fraction=1.0).cache()
# classify_df.show(5)

In [13]:
# Define StringIndexer: categorical (string) cols -> to column indices, 
# Each category gets a integer based on their frequency (start from 0)

carrier_indexer = StringIndexer(inputCol="OP_CARRIER", outputCol="OP_CARRIER_Index")
origin_indexer = StringIndexer(inputCol="ORIGIN", outputCol="ORIGIN_Index")
dest_indexer = StringIndexer(inputCol="DEST", outputCol="DEST_Index")

In [14]:
# Define onehotencoder for a index columns 
onehotencoder_carrier_vector = OneHotEncoder(inputCol="OP_CARRIER_Index", outputCol="OP_CARRIER_vec")
onehotencoder_origin_vector = OneHotEncoder(inputCol="ORIGIN_Index", outputCol="ORIGIN_vec")
onehotencoder_dest_vector = OneHotEncoder(inputCol="DEST_Index", outputCol="DEST_vec")

In [15]:
# Pipelining the preprocessing stages defined above 
pipeline = Pipeline(stages=[carrier_indexer, origin_indexer, dest_indexer,
                            onehotencoder_carrier_vector, onehotencoder_origin_vector,
                            onehotencoder_dest_vector])

transformed_df = pipeline.fit(df).transform(df)

In [17]:
transformed_df.show(n=1, vertical = True)

-RECORD 0------------------------------------------
 FL_DATE                     | 2016-01-01 00:00:00 
 OP_CARRIER                  | DL                  
 OP_CARRIER_FL_NUM           | 1248                
 ORIGIN                      | DTW                 
 DEST                        | LAX                 
 CRS_DEP_TIME                | 2016-01-01 19:35:00 
 CRS_ARR_TIME                | 2016-01-01 21:44:00 
 CANCELLED                   | 0.0                 
 DIVERTED                    | 0.0                 
 CRS_ELAPSED_TIME            | 309.0               
 DISTANCE                    | 1979.0              
 CANCELLATION_CODE_EXPLAINED | null                
 OP_CARRIER_Index            | 1.0                 
 ORIGIN_Index                | 9.0                 
 DEST_Index                  | 4.0                 
 OP_CARRIER_vec              | (20,[1],[1.0])      
 ORIGIN_vec                  | (359,[9],[1.0])     
 DEST_vec                    | (357,[4],[1.0])     
only showing

In [18]:
# Select columns that are combined to one feature column
feature_columns = transformed_df.columns

# Remove cols that whould not be in our feature cols (label col, intermediate preprocessing cols)
for item in ["CANCELLED", "ORIGIN", "DEST", "OP_CARRIER", "OP_CARRIER_Index", "ORIGIN_Index", "DEST_Index"]:
    feature_columns.remove(item)


assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Build feature col
assembled_df = assembler.transform(transformed_df)

IllegalArgumentException: 'Data type timestamp of column FL_DATE is not supported.\nData type timestamp of column CRS_DEP_TIME is not supported.\nData type timestamp of column CRS_ARR_TIME is not supported.\nData type string of column CANCELLATION_CODE_EXPLAINED is not supported.'

In [None]:
# Select only feature and label column
final_classify_df = assembled_df.select("features", F.col("CANCELLED").alias("label"))

In [None]:
final_classify_df.printSchema()

In [None]:
train, test = final_classify_df.randomSplit([.7, .3], seed=9) # 70, 30 split on balanced set or on subset of samples

In [None]:
spark.catalog.clearCache()
# caching data into memory - models run quicker
train = train.repartition(32).cache()
test = test.repartition(32).cache()### Train models

## Prediction Models

In [None]:
# Define the models
decision_tree = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features')
rand_forest = RandomForestClassifier(labelCol = 'label', featuresCol = 'features')
gbt = GBTClassifier(labelCol = 'label', featuresCol = 'features')

decision_tree_model = decision_tree.fit(train)

rand_forest_model = rand_forest.fit(train)

gbt_model = gbt.fit(train)