---
# Imports

In [1]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.clustering import KMeans
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
import plotly.express as px
import seaborn as sns
import pandas as pd
import numpy as np
import pyspark

In [2]:
spark = SparkSession.builder.appName("ProjetoABD").getOrCreate()

In [3]:
#dados = spark.read.load('avioes/raw',format='csv',sep=',',inferSchema=True, header=True)

dados = spark.read.load('/home/jovyan/code/raw',format='csv',sep=',',inferSchema=True, header=True)

In [None]:
dados.printSchema()
dados.show()
dados.count()

-----
# Data Cleaning

## 1.Removing duplicates if any

In [None]:
print(f'Dados: number of rows: {dados.count()}, after dropduplicates: {dados.dropDuplicates().count() }')

## 2.Removing useless columns

In [4]:
cols_to_dismiss = [
    # Informações duplicadas ou derivadas
    'DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups',
    'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups',
    'DepTimeBlk','ArrTimeBlk','DestAirportID','DestAirportSeqID',
    'DestCityMarketID','OriginAirportID','OriginAirportSeqID',
    'OriginCityMarketID','Marketing_Airline_Network',
    'Operating_Airline','Flight_Number_Marketing_Airline',
    'FlightDate','OriginWac','DestWac','Flights',
    'Duplicate',"OriginStateFips","DestStateFips",'OriginState',
    "DestState",

    # Diverted flights – detalhes específicos desnecessários
    'DivAirportLandings', 'DivReachedDest', 'DivActualElapsedTime',
    'DivArrDelay', 'DivDistance',

    # Todas as colunas relacionadas com aeroportos desviados 
    'Div1Airport', 'Div1AirportID', 'Div1AirportSeqID', 'Div1WheelsOn',
    'Div1TotalGTime', 'Div1LongestGTime', 'Div1WheelsOff', 'Div1TailNum',
    'Div2Airport', 'Div2AirportID', 'Div2AirportSeqID', 'Div2WheelsOn',
    'Div2TotalGTime', 'Div2LongestGTime', 'Div2WheelsOff', 'Div2TailNum',
    'Div3Airport', 'Div3AirportID', 'Div3AirportSeqID', 'Div3WheelsOn',
    'Div3TotalGTime', 'Div3LongestGTime', 'Div3WheelsOff', 'Div3TailNum',
    'Div4Airport', 'Div4AirportID', 'Div4AirportSeqID', 'Div4WheelsOn',
    'Div4TotalGTime', 'Div4LongestGTime', 'Div4WheelsOff', 'Div4TailNum',
    'Div5Airport', 'Div5AirportID', 'Div5AirportSeqID', 'Div5WheelsOn',
    'Div5TotalGTime', 'Div5LongestGTime', 'Div5WheelsOff', 'Div5TailNum',

    # Identificadores redundantes
    'DOT_ID_Marketing_Airline', 'DOT_ID_Operating_Airline','DayofMonth',
    'IATA_Code_Marketing_Airline', 'IATA_Code_Operating_Airline','Tail_Number',

    #dados com mts nulls
    'CancellationCode','Originally_Scheduled_Code_Share_Airline','DOT_ID_Originally_Scheduled_Code_Share_Airline',
    'IATA_Code_Originally_Scheduled_Code_Share_Airline','Flight_Num_Originally_Scheduled_Code_Share_Airline','CarrierDelay',
    'WeatherDelay','NASDelay','SecurityDelay','FirstDepTime','TotalAddGTime','LongestAddGTime','_c119','LateAircraftDelay'
]

dados = dados.drop(*cols_to_dismiss)

In [None]:
dados.printSchema()
dados.show()

## Data transformation

In [5]:
# Converter hhmm para minutos
def hhmm_to_minutes(col):
    return (F.floor(col / 100) * 60) + (col % 100)

# Tempos de partida
dados = dados.withColumn("DepTime", hhmm_to_minutes(F.col("DepTime")))
dados = dados.withColumn("CRSDepTime", hhmm_to_minutes(F.col("CRSDepTime")))
dados = dados.withColumn("DepDelay",F.col("DepTime") - F.col("CRSDepTime"))

# Tempos de chegada
dados = dados.withColumn("ArrTime", hhmm_to_minutes(F.col("ArrTime")))
dados = dados.withColumn("CRSArrTime", hhmm_to_minutes(F.col("CRSArrTime")))
dados = dados.withColumn("ArrDelay", F.col("ArrTime") - F.col("CRSArrTime"))
# F.greatest(F.col("ArrTime") - F.col("CRSArrTime"), F.lit(0))

In [6]:
# Meter a localizaçao da partida e destino numa só tabela
dados = dados.withColumn('OriginCityNameState',
                          F.concat_ws(
                              ',',
                                F.split(F.col('OriginCityName'),',')[0],
                                F.col("OriginStateName")
    ))

dados = dados.withColumn('DestCityNameState',
                          F.concat_ws(
                              ',',
                                F.split(F.col('DestCityName'),',')[0],
                                F.col("DestStateName")
    ))

cols_to_dismiss = ["OriginCityName","OriginStateName","DestCityName","DestStateName"]
dados = dados.drop(*cols_to_dismiss)

In [7]:
dados = dados.withColumnRenamed("Operating_Airline ","Operating_Airline")

In [None]:
dados.printSchema()
dados.show()

## 3.Handling missing values

In [None]:
# Ver em que colunas estão os nulos (by columns)
nulls = {col: dados.filter(dados[col].isNull()).count() for col in dados.columns}
nulls

# valores corridos ao dataset todo sem fazer nada aos nulls
'''
{'Year': 0,
 'Quarter': 0,
 'Month': 0,
 'DayOfWeek': 0,
 'Operated_or_Branded_Code_Share_Partners': 0,
 'Operating_Airline ': 0,
 'Flight_Number_Operating_Airline': 0,
 'Origin': 0,
 'Dest': 0,
 'CRSDepTime': 0,
 'DepTime': 761652,
 'DepDelay': 761652,
 'TaxiOut': 780561,
 'WheelsOff': 780551,
 'WheelsOn': 793133,
 'TaxiIn': 793143,
 'CRSArrTime': 0,
 'ArrTime': 786177,
 'ArrDelay': 786177,
 'Cancelled': 0,
 'Diverted': 0,
 'CRSElapsedTime': 22,
 'ActualElapsedTime': 845637,
 'AirTime': 852561,
 'Distance': 0,
 'DistanceGroup': 0,
 'OriginCityNameState': 0,
 'DestCityNameState': 0}
'''

In [8]:
# remover nulls
dados = dados.dropna(how='any')

In [10]:
dados.printSchema()
dados.show()
dados.count()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- Operating_Airline: string (nullable = true)
 |-- Flight_Number_Operating_Airline: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: long (nullable = true)
 |-- DepDelay: long (nullable = true)
 |-- TaxiOut: double (nullable = true)
 |-- WheelsOff: integer (nullable = true)
 |-- WheelsOn: integer (nullable = true)
 |-- TaxiIn: double (nullable = true)
 |-- CRSArrTime: long (nullable = true)
 |-- ArrTime: long (nullable = true)
 |-- ArrDelay: long (nullable = true)
 |-- Cancelled: double (nullable = true)
 |-- Diverted: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- AirTime: 

28341220

-----
# Gráficos

### Matriz de correlação

In [None]:
# colunas numericas
input_cols_num = [
    "Year",
    "Quarter",
    "Month",
    "DayOfWeek",
    "Flight_Number_Operating_Airline",
    "CRSDepTime",
    "DepTime",
    "DepDelay",
    "TaxiOut",
    "WheelsOff",
    "Distance",
    "DistanceGroup",
    "AirTime",
    "CRSArrTime",
    "ArrTime",
    "ArrDelay",
    "WheelsOn",
    "TaxiIn",
    "Cancelled",
    "Diverted",
    "CRSElapsedTime",
    "ActualElapsedTime"
]


# colunas categoricas
input_cols_str = [
    "Operated_or_Branded_Code_Share_Partners",
    "Operating_Airline",
    "Origin",
    "Dest",
    "OriginCityNameState",
    "DestCityNameState"
]

In [None]:
assembler = VectorAssembler(inputCols=input_cols_num, outputCol="numeric_features")
numeric_df = assembler.transform(dados).select("numeric_features")

correlation_matrix = Correlation.corr(numeric_df, "numeric_features", "pearson").head()[0].toArray()

corr_df = pd.DataFrame(correlation_matrix, columns=input_cols_num, index=input_cols_num)

plt.figure(figsize=(15, 12))
sns.heatmap(corr_df, annot=True, cmap="coolwarm", fmt=".2f", square=True)
plt.title("Matriz de correlação das features numericas")
plt.tight_layout()
plt.show()

### cenas do dia da semana

In [None]:
df_small = dados.limit(1000000)
df = df_small.toPandas()

sns.set(style="whitegrid")



plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='DayOfWeek', y='ArrDelay')
plt.title('Atraso na Chegada por Dia da Semana')
plt.show()

df_filtrado = df[df['ArrDelay'] > 15]
contagem = df_filtrado.groupby('DayOfWeek').size().reset_index(name='QtdAtrasos')

plt.figure(figsize=(8, 5))
sns.barplot(data=contagem, x='DayOfWeek', y='QtdAtrasos', palette='Blues_d')
plt.title('Voos com Atraso na Chegada > 15 min por Dia da Semana')
plt.xlabel('Dia da Semana')
plt.ylabel('Quantidade de Voos')
plt.show()



plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='DayOfWeek', y='DepDelay')
plt.title('Atraso na Partida por Dia da Semana')
plt.show()


df_filtrado = df[df['DepDelay'] > 15]
contagem = df_filtrado.groupby('DayOfWeek').size().reset_index(name='QtdAtrasos')

plt.figure(figsize=(8, 5))
sns.barplot(data=contagem, x='DayOfWeek', y='QtdAtrasos', palette='Blues_d')
plt.title('Voos com Atraso na partida > 15 min por Dia da Semana')
plt.xlabel('Dia da Semana')
plt.ylabel('Quantidade de Voos')
plt.show()

### KMeans

In [None]:
# Encode das colunas categoricas
available_cols = set(dados.columns)
indexers = [StringIndexer(inputCol=col, outputCol=col + "_Index", handleInvalid='keep')
            for col in input_cols_str if col in available_cols]

# juntar as colunas
assembler = VectorAssembler(
    inputCols=[
    "Year","Quarter","Month","DayOfWeek","Flight_Number_Operating_Airline","CRSDepTime","DepTime","TaxiOut",
    "WheelsOff","Distance","DistanceGroup","AirTime","CRSArrTime","ArrTime","WheelsOn","TaxiIn","Cancelled",
    "Diverted","CRSElapsedTime","ActualElapsedTime","Origin_Index", "Dest_Index","Operating_Airline_Index",
    "Operated_or_Branded_Code_Share_Partners_Index", "OriginCityNameState_Index","DestCityNameState_Index"
    ],
    outputCol="features_unscaled"
)

scaler = StandardScaler(inputCol="features_unscaled", outputCol="features", withStd=True, withMean=True)
kmeans = KMeans(k=5, seed=1, featuresCol="features", predictionCol="cluster")
pipeline = Pipeline(stages=indexers + [assembler, scaler, kmeans])

# correr o modelo
model = pipeline.fit(dados)
clustered_df = model.transform(dados)

print("KMeans feito")

# Ver a média do delay
clustered_df.groupBy("cluster").agg(
    {"DepDelay": "avg", "ArrDelay": "avg"}
).show()

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))

delay_summary = clustered_df.groupBy("cluster").agg(
    {"DepDelay": "avg", "ArrDelay": "avg"}
).toPandas()

x = delay_summary["cluster"]
width = 0.35

ax.bar(x - width/2, delay_summary["avg(DepDelay)"], width, label="Avg DepDelay", color='skyblue')
ax.bar(x + width/2, delay_summary["avg(ArrDelay)"], width, label="Avg ArrDelay", color='salmon')

ax.set_xlabel("Cluster")
ax.set_ylabel("Delay (minutos)")
ax.set_title("Média do ArrDelay e DepDelay por cluster")
ax.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# features dos clusters mais os clusters
cols_to_collect = [
    "Year","Quarter","Month","DayOfWeek","Flight_Number_Operating_Airline","CRSDepTime",
    "DepTime","TaxiOut","WheelsOff","Distance","DistanceGroup","AirTime","CRSArrTime",
    "ArrTime","WheelsOn","TaxiIn","Cancelled","Diverted","CRSElapsedTime","ActualElapsedTime",
    "Origin_Index","Dest_Index","Operating_Airline_Index","Operated_or_Branded_Code_Share_Partners_Index",
    "OriginCityNameState_Index","DestCityNameState_Index","cluster"
]

sampled = clustered_df.select(*cols_to_collect).sample(fraction=0.05, seed=42)
pdf = sampled.toPandas()

global_mean = pdf.drop(columns=["cluster"]).mean()
cluster_means = pdf.groupby("cluster").mean()

differences = cluster_means - global_mean

plt.figure(figsize=(20, 10))
sns.heatmap(differences.T, cmap="coolwarm", center=0, annot=True, fmt=".2f")
plt.title("Diferenças das médias das features por cluster")
plt.xlabel("Cluster")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

### Relevância das features

In [None]:
categorical_cols = [
    #"Operated_or_Branded_Code_Share_Partners",
    "Operating_Airline",
    "Origin",
    "Dest",
    #"OriginCityNameState",
    #"DestCityNameState"
]

indexers = [StringIndexer(inputCol=c, outputCol=c + "_Index", handleInvalid="keep") 
            for c in categorical_cols]


all_features = [
    "CRSDepTime", "DepTime", "TaxiOut", "WheelsOff", "WheelsOn", "TaxiIn",
    "CRSArrTime", "ArrTime", "AirTime", "Distance", "CRSElapsedTime", "ActualElapsedTime",
    "Year","Quarter","Month", "DayOfWeek"
] + [c + "_Index" for c in categorical_cols]

assembler = VectorAssembler(inputCols=all_features, outputCol="features")

In [None]:
rf = RandomForestRegressor(featuresCol="features", labelCol="DepDelay", numTrees=40, seed=42,maxBins=400)
pipeline = Pipeline(stages=indexers + [assembler, rf])

rf_model = pipeline.fit(dados)
importances = rf_model.stages[-1].featureImportances.toArray()

In [None]:
feature_names = all_features
importance_series = pd.Series(importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=importance_series.values, y=importance_series.index, palette="magma")
plt.title("Importancia dos features no DepDelay")
plt.tight_layout()
plt.show()

In [None]:
rf = RandomForestRegressor(featuresCol="features", labelCol="ArrDelay", numTrees=40, seed=42,maxBins=400)
pipeline = Pipeline(stages=indexers + [assembler, rf])

rf_model = pipeline.fit(dados)
importances = rf_model.stages[-1].featureImportances.toArray()

In [None]:
feature_names = all_features
importance_series = pd.Series(importances, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=importance_series.values, y=importance_series.index, palette="magma")
plt.title("Importancia dos features no ArrDelay")
plt.tight_layout()
plt.show()

-----
# Modelagem Bruno

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [10]:
dados_limitados=dados.limit(10000000)
#dados_limitados.count()


In [None]:
dados_limitados = ( dados_limitados
            .withColumn("Delay", 
                                F.when((F.col("ArrDelay") > 15), 1).otherwise(0))
)


In [12]:
dados_limitados = dados_limitados.drop('Operated_or_Branded_Code_Share_Partners','DepDelay','ArrDelay'
                                       ,'ArrTime' ,'TaxiIn','WheelsOn')
dados_limitados.show(10)

+----+-------+-----+---------+-----------------+-------------------------------+------+----+----------+-------+-------+---------+----------+---------+--------+--------------+-----------------+-------+--------+-------------+--------------------+--------------------+-----+
|Year|Quarter|Month|DayOfWeek|Operating_Airline|Flight_Number_Operating_Airline|Origin|Dest|CRSDepTime|DepTime|TaxiOut|WheelsOff|CRSArrTime|Cancelled|Diverted|CRSElapsedTime|ActualElapsedTime|AirTime|Distance|DistanceGroup| OriginCityNameState|   DestCityNameState|Delay|
+----+-------+-----+---------+-----------------+-------------------------------+------+----+----------+-------+-------+---------+----------+---------+--------+--------------+-----------------+-------+--------+-------------+--------------------+--------------------+-----+
|2020|      4|   10|        4|               YV|                           6023|   CLT| IAD|       885|    884|   53.0|     1537|       981|      0.0|     0.0|          96.0|          

In [13]:
import pyspark.sql.types as T

# The columns at stake
cols_non_numeric = [field.name for field in dados_limitados.schema.fields if isinstance(
    field.dataType, T.TimestampType) or isinstance(field.dataType, T.StringType)]
cols_numeric = [col for col in dados_limitados.columns if col not in cols_non_numeric]

# Recall columns at stake
print(f'Non-numeric columns: {cols_non_numeric}')
print(f'Numeric columns: {cols_numeric}')

Non-numeric columns: ['Operating_Airline', 'Origin', 'Dest', 'OriginCityNameState', 'DestCityNameState']
Numeric columns: ['Year', 'Quarter', 'Month', 'DayOfWeek', 'Flight_Number_Operating_Airline', 'CRSDepTime', 'DepTime', 'TaxiOut', 'WheelsOff', 'CRSArrTime', 'Cancelled', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Distance', 'DistanceGroup', 'Delay']


In [14]:
# Set which columns not to be used as features. 
cols_not_features = ['Delay']

# Set columns to be used by StringIndexer() and OneHotEncoder()

categorical_cols = [i for i in cols_non_numeric if i not in cols_not_features]
non_categorical_cols = [i for i in cols_numeric if i not in cols_not_features]
index_output_cols = [x + ' Index' for x in categorical_cols]
ohe_output_cols = [x + ' OHE' for x in categorical_cols]


In [15]:
# Assembling an array with the features to be used by the algorithm,
# with the help of StringIndexer(), OneHotEncoder() and vectorAssembler()
string_indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_output_cols, handleInvalid="skip")
ohe_encoder = OneHotEncoder(inputCols=index_output_cols, outputCols=ohe_output_cols)

# Put all input features into a single vector, by using a transformer
assembler_inputs = ohe_output_cols + non_categorical_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

print(f'Input features to be used (OHE were categorical):\n {assembler_inputs}')

Input features to be used (OHE were categorical):
 ['Operating_Airline OHE', 'Origin OHE', 'Dest OHE', 'OriginCityNameState OHE', 'DestCityNameState OHE', 'Year', 'Quarter', 'Month', 'DayOfWeek', 'Flight_Number_Operating_Airline', 'CRSDepTime', 'DepTime', 'TaxiOut', 'WheelsOff', 'CRSArrTime', 'Cancelled', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Distance', 'DistanceGroup']


In [16]:
# Train/validation split
# Two dataframes for training and validation respectively, with a split size of 70/30 (%)
df_train, df_validation = dados_limitados.randomSplit([0.7, 0.3], 42)

#print(f'There are {df_train.count()} rows in the training set and {df_validation.count()} rows in the validation set.')



In [18]:
# Save the train/validation sets as parquet files 
# Recall that, because it is a sampling, there is not guarantee of 
# getting the same data split when using the code in a different computer/time. 
# And we may want to reproduce or share the experiments.

df_train.write.mode('overwrite').parquet("trans-train_total")
df_validation.write.mode('overwrite').parquet("trans-val_total")

In [17]:
# As we already got the data split, delete df_clean to free memory space
del dados_limitados

In [18]:
# Linear SVC algorithm
lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol='Delay')

In [19]:
# Set up a ML pipeline configuration, holding the sequence of the four stages previously set:
# 1. string_indexer
# 2. ohe_encoder
# 3. vec_assembler (related to assembling features into vector)
# 4. lsvc (related to ML estimator)

pipeline = Pipeline(stages=[string_indexer,ohe_encoder,vec_assembler,lsvc])

In [20]:
# Save in the pipeline for further use, should it be required
pipeline.save('pipeline-LinearSVM_total')

In [21]:
# Create the model by fitting the pipeline to the training data
# Notice that the model will be a transformer
#
# Note: in case there are running problems in your computer, set 
# a lower number of rows to be used in model training

# A
model = pipeline.fit(df_train)
# B
#limit_rows = 100000
#model = pipeline.fit(df_train.limit(limit_rows))

In [22]:
# Save the model for further use, should it be required.
model.save('model-LinearSVM_total')


 ## 5.
Evaluate the model 

In [23]:
# Make predictions by applying the verification data to the transformer
df_predictions = model.transform(df_validation)

In [24]:
# Compute the evaluation metrics 
# - areaUnderROC using BinaryClassificationEvaluator
# - accuracy, precision, recall, and f1Measure, using MultilabelClassificationEvaluator

# Using BinaryClassificationEvaluator
# Regardless of using default values or not, it is good practice to
# explicitly specify them, at the least the important ones

# areaUnderROC relates to sensitivity (TP rate) and specificity (FP rate)

# Columns of interest: features, rawPrediction, prediction, Fraud
df_predictions_eval = df_predictions.select('features', 
                    'rawPrediction', 'prediction', 'Delay')

binary_evaluator = BinaryClassificationEvaluator(labelCol='Delay',
                                                 rawPredictionCol='rawPrediction',
                                                 metricName='areaUnderROC')
    
area_under_ROC = binary_evaluator.evaluate(df_predictions_eval)

# Print out result
print(f'Metric areaUnderROC = {area_under_ROC}')
#df_predictions_eval.count()

Metric areaUnderROC = 0.722571973652999


In [25]:
# Counting of the kind of predictions made
df_confusion_matrix = df_predictions_eval.groupBy('prediction','Delay').count()
df_confusion_matrix.show()

+----------+-----+-------+
|prediction|Delay|  count|
+----------+-----+-------+
|       0.0|    0|2415752|
|       0.0|    1| 581636|
|       1.0|    1|     61|
|       1.0|    0|     11|
+----------+-----+-------+



In [26]:
# Compute the confusion matrix
tp = df_confusion_matrix.filter((F.col('prediction')==1.0) & (F.col('Delay')==1)).first()
tn = df_confusion_matrix.filter((F.col('prediction')==0.0) & (F.col('Delay')==0)).first()
fp = df_confusion_matrix.filter((F.col('prediction')==1.0) & (F.col('Delay')==0)).first()
fn = df_confusion_matrix.filter((F.col('prediction')==0.0) & (F.col('Delay')==1)).first()

confmat = {'TP': 0.0, 'TN': 0.0, 'FP': 0.0, 'FN': 0.0}
if (tp):
    confmat['TP'] = tp['count'] * 1.0
if (tn):
    confmat['TN'] = tn['count'] * 1.0
if (fp):
    confmat['FP'] = fp['count'] * 1.0
if (fn):
    confmat['FN'] = fn['count'] * 1.0

confmat

{'TP': 61.0, 'TN': 2415752.0, 'FP': 11.0, 'FN': 581636.0}

In [28]:
# Based on the confusion matrix, computed the evaluation matrics:
#   accuracy, precision, recall, specifity and F1 score

# PS: Check divisons by 0.0
accuracy = (confmat['TP'] + confmat['TN']) / (confmat['TP'] + confmat['TN'] + confmat['FP'] + confmat['FN'])
precision = confmat['TP'] / (confmat['TP'] + confmat['FP'])
recall = confmat['TP'] / (confmat['TP'] + confmat['FN'])
specificity = confmat['TN'] / (confmat['TN'] + confmat['FP'])
f1score = 2 * (precision * recall) / (precision + recall)


print('Evaluation metrics based on the confusion matrix:')
print(f' Accuracy = {accuracy}')
print(f' Precision = {precision}')
print(f' Recall = {recall}')
print(f' Specifity = {specificity}')
print(f' F1 score = {f1score}')

Evaluation metrics based on the confusion matrix:
 Accuracy = 0.8059533738565319
 Precision = 0.8472222222222222
 Recall = 0.00010486559153648721
 Specifity = 0.9999954465731945
 F1 score = 0.00020970522664493988
