In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession

import plotly.express as px

In [None]:
# Work on at most ten million rows of the source DataFrame.
# NOTE(review): limit() without an explicit ordering does not guarantee which
# rows are kept from one evaluation to the next - confirm this is acceptable.
ROW_LIMIT = 10_000_000
dados_limitados = dados.limit(ROW_LIMIT)

In [None]:
# Binary target column "Delay": 1 when ArrDelay is greater than 15, otherwise 0.
# Rows with a null ArrDelay do not satisfy the condition and are therefore labelled 0.
# NOTE(review): presumably ArrDelay is in minutes and null means cancelled/diverted -
# confirm that labelling those rows 0 is intended.
delay_flag = F.when(F.col("ArrDelay") > 15, 1).otherwise(0)
dados_limitados = dados_limitados.withColumn("Delay", delay_flag)


In [None]:
# Remove the columns listed below, including ArrDelay (the column the Delay
# label was computed from).
# NOTE(review): verify against the full schema that no other arrival-derived
# column remains, since it would leak the label into the features.
cols_to_drop = [
    'Operated_or_Branded_Code_Share_Partners',
    'DepDelay',
    'ArrDelay',
    'ArrTime',
    'TaxiIn',
    'WheelsOn',
]
dados_limitados = dados_limitados.drop(*cols_to_drop)

# Preview the first 10 rows of the reduced DataFrame
dados_limitados.show(10)

In [None]:
import pyspark.sql.types as T

# Partition the column names by Spark data type: timestamp and string columns
# are collected as non-numeric, every other column is collected as numeric.
# NOTE(review): the non-numeric columns are later passed to StringIndexer -
# confirm that it accepts any timestamp columns present in this schema.
non_numeric_types = (T.TimestampType, T.StringType)
cols_non_numeric = [
    field.name
    for field in dados_limitados.schema.fields
    if isinstance(field.dataType, non_numeric_types)
]
cols_numeric = [name for name in dados_limitados.columns if name not in cols_non_numeric]

# Report both groups
print(f'Non-numeric columns: {cols_non_numeric}')
print(f'Numeric columns: {cols_numeric}')

In [None]:
# Columns that must not be used as features (the label column).
cols_not_features = ['Delay']

# Feature columns: non-numeric ones are handled as categorical, numeric ones
# are used as they are.
categorical_cols = [name for name in cols_non_numeric if name not in cols_not_features]
non_categorical_cols = [name for name in cols_numeric if name not in cols_not_features]

# Output column names for the StringIndexer() and OneHotEncoder() stages,
# one per categorical column.
index_output_cols = [f'{name} Index' for name in categorical_cols]
ohe_output_cols = [f'{name} OHE' for name in categorical_cols]


In [None]:
# Feature-preparation stages used by the algorithm:
#   StringIndexer   - maps each categorical column to an index column
#   OneHotEncoder   - encodes each index column as a one-hot vector column
#   VectorAssembler - concatenates all inputs into the single "features" vector
string_indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_output_cols, handleInvalid="skip")
ohe_encoder = OneHotEncoder(inputCols=index_output_cols, outputCols=ohe_output_cols)

# Put all input features into a single vector, by using a transformer.
# handleInvalid="skip" mirrors the StringIndexer setting: a row with a null/NaN
# value in any input column is dropped instead of aborting the whole job,
# which is what the default ("error") would do.
assembler_inputs = ohe_output_cols + non_categorical_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features", handleInvalid="skip")

print(f'Input features to be used (OHE were categorical):\n {assembler_inputs}')

In [None]:
# Randomly partition the data into a training DataFrame and a validation
# DataFrame with weights 70% / 30%, using a fixed seed (42).
train_weight, validation_weight = 0.7, 0.3
df_train, df_validation = dados_limitados.randomSplit([train_weight, validation_weight], seed=42)

In [None]:

# Persist both splits as Parquet, replacing the output of any previous run.
TRAIN_PATH = "trans-train_total"
VALIDATION_PATH = "trans-val_total"
df_train.write.mode('overwrite').parquet(TRAIN_PATH)
df_validation.write.mode('overwrite').parquet(VALIDATION_PATH)

# Reload the splits from the files just written. DataFrames are lazy, so
# without this every later action (fit, transform, evaluate, ...) would
# re-evaluate limit() + randomSplit() from the source data instead of using
# these exact rows.
# NOTE: after this point df_train/df_validation read from the paths above, so
# re-run the split cell before re-running this one (Spark refuses to overwrite
# a path it is reading from).
spark_session = SparkSession.builder.getOrCreate()
df_train = spark_session.read.parquet(TRAIN_PATH)
df_validation = spark_session.read.parquet(VALIDATION_PATH)

In [None]:
# The train/validation split has already been made, so the name of the
# pre-split DataFrame (dados_limitados) is released here.
# NOTE(review): `del` only removes the Python reference; confirm whether any
# cached data actually needs to be released as well.
del dados_limitados

In [None]:
# Linear support-vector classifier (estimator) that learns the 'Delay' label
lsvc = LinearSVC(
    labelCol='Delay',
    maxIter=10,
    regParam=0.1,
)

In [None]:
# ML pipeline configuration: the four stages set up previously, in execution order.
pipeline_stages = [
    string_indexer,  # 1. index the categorical columns
    ohe_encoder,     # 2. one-hot encode the index columns
    vec_assembler,   # 3. assemble all features into a single vector
    lsvc,            # 4. the ML estimator
]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Save the (unfitted) pipeline for further use, should it be required.
# write().overwrite() replaces the output of a previous run; a plain save()
# raises an error when the target path already exists, which would break
# re-running the notebook.
pipeline.write().overwrite().save('pipeline-LinearSVM_total')

In [None]:
# Fit the whole pipeline (feature stages and the LinearSVC estimator) on the
# training split; the result is the fitted pipeline model.
model = pipeline.fit(df_train)

In [None]:
# Save the fitted model for further use, should it be required.
# write().overwrite() replaces the output of a previous run; a plain save()
# raises an error when the target path already exists, which would break
# re-running the notebook.
model.write().overwrite().save('model-LinearSVM_total')

In [None]:
# Make predictions by applying the fitted model (a transformer) to the
# validation split
df_predictions = model.transform(df_validation)

In [None]:
# Keep only the columns of interest: features, rawPrediction, prediction and
# the Delay label.
eval_columns = ['features', 'rawPrediction', 'prediction', 'Delay']
df_predictions_eval = df_predictions.select(*eval_columns)

# Evaluate the raw predictions against the label with the area under the ROC curve.
binary_evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='Delay',
)
area_under_ROC = binary_evaluator.evaluate(df_predictions_eval)

# Print out result
print(f'Metric areaUnderROC = {area_under_ROC}')

In [None]:
# Count the rows for every (prediction, Delay) combination that occurs
outcome_keys = ['prediction', 'Delay']
df_confusion_matrix = df_predictions_eval.groupBy(*outcome_keys).count()

# Display the counts
df_confusion_matrix.show()

In [None]:
# Compute the confusion matrix.
# The grouped DataFrame holds at most four rows (one per prediction/label
# combination), so it is collected once and looked up locally instead of
# launching a separate Spark job per cell of the matrix.
outcome_rows = {
    (int(row['prediction']), int(row['Delay'])): row
    for row in df_confusion_matrix.collect()
}

# Each name holds the matching Row, or None when that outcome never occurred.
# Keys are (prediction, Delay).
tp = outcome_rows.get((1, 1))
tn = outcome_rows.get((0, 0))
fp = outcome_rows.get((1, 0))
fn = outcome_rows.get((0, 1))

# Counts as floats; outcomes that never occurred count as 0.0.
confmat = {
    label: float(row['count']) if row else 0.0
    for label, row in (('TP', tp), ('TN', tn), ('FP', fp), ('FN', fn))
}

confmat

In [None]:
# Based on the confusion matrix, compute the evaluation metrics:
#   accuracy, precision, recall, specificity and F1 score


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    A zero denominator happens, for instance, when the model never predicts
    the positive class (TP + FP == 0); the plain division would then raise
    ZeroDivisionError. 0.0 is reported for such undefined ratios.
    """
    return numerator / denominator if denominator else 0.0


total = confmat['TP'] + confmat['TN'] + confmat['FP'] + confmat['FN']
accuracy = _safe_ratio(confmat['TP'] + confmat['TN'], total)
precision = _safe_ratio(confmat['TP'], confmat['TP'] + confmat['FP'])
recall = _safe_ratio(confmat['TP'], confmat['TP'] + confmat['FN'])
specificity = _safe_ratio(confmat['TN'], confmat['TN'] + confmat['FP'])
f1score = _safe_ratio(2 * (precision * recall), precision + recall)


print('Evaluation metrics based on the confusion matrix:')
print(f' Accuracy = {accuracy}')
print(f' Precision = {precision}')
print(f' Recall = {recall}')
print(f' Specifity = {specificity}')
print(f' F1 score = {f1score}')