---
# Imports

In [1]:
import pyspark 
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import numpy as np
#!pip install kagglehub
#import kagglehub

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ProjetoABD")
    .getOrCreate()
)

In [3]:
# Load the CSV dataset at 'avioes/csv': the header row supplies the column
# names and the column types are inferred from the data.
# Alternatives kept from development: spark.read.parquet('avioes/pq') as the
# source, and/or appending .limit(50000) to work on a sample.
dados = (
    spark.read
    .format('csv')
    .options(sep=',', inferSchema=True, header=True)
    .load('avioes/csv')
)

In [4]:
# Inspect the raw load: the inferred schema, a preview of the first rows, and
# the total row count (last expression, so it is the cell's displayed value).
dados.printSchema()
dados.show(n=20, truncate=True)
dados.count()

root
 |-- FlightDate: date (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Marketing_Airline_Network: string (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |

29193782

-----
# Data Cleaning

## 1. Removing duplicates, if any

In [None]:
# Cleaning plan (notes only; this cell contains no executable code):
# drop Year|Quarter|Month|DayofMonth
# merge OriginCityName|OriginState|OriginStateName|OriginStateFips into a single column
# DestCityName|DestState|DestStateName|DestStateFips: same as above
# drop OriginCityMarketID, since we already have the city
# drop OriginAirportID|OriginAirportSeqID, since we already have the airport name
# same for DestAirportID|DestAirportSeqID|DestCityMarketID
# Marketing_Airline_Network and IATA_Code_Marketing_Airline are identical
# Operating_Airline and IATA_Code_Operating_Airline are identical
# Flight_Number_Marketing_Airline and Flight_Number_Operating_Airline are identical
# drop DepDelayMinutes and DepDel15 and fix up DepDelay | do the same for ArrDelay, ArrDelayMinutes, ArrDel15

In [5]:
# Columns removed up front: airport / city-market identifiers, the date-part
# columns, duplicated airline and flight-number fields, and the derived
# delay-minutes / 15-minute-flag / time-block columns.
cols_to_dismiss = [
    'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID',
    'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID',
    'Year', 'Quarter', 'Month', 'DayofMonth',
    'Marketing_Airline_Network', 'Operating_Airline',
    'Flight_Number_Marketing_Airline',
    'ArrDelayMinutes', 'DepDelayMinutes', 'DepDel15', 'ArrDel15',
    'DepTimeBlk', 'ArrTimeBlk',
]
dados = dados.drop(*cols_to_dismiss)

In [6]:
# Add OriginCityNameState: the part of OriginCityName before the first comma,
# joined with OriginStateName using "," as the separator.
dados = dados.withColumn(
    'OriginCityNameState',
    F.concat_ws(
        ',',
        F.split(F.col('OriginCityName'), ',').getItem(0),
        F.col("OriginStateName"),
    ),
)

In [7]:
# Add DestCityNameState: the part of DestCityName before the first comma,
# joined with DestStateName using "," as the separator.
dados = dados.withColumn(
    'DestCityNameState',
    F.concat_ws(
        ',',
        F.split(F.col('DestCityName'), ',').getItem(0),
        F.col("DestStateName"),
    ),
)

In [8]:
# Drop the individual origin / destination city and state fields.
cols_to_dismiss = [
    "OriginCityName", "OriginState", "OriginStateName", "OriginStateFips",
    "DestCityName", "DestState", "DestStateName", "DestStateFips",
]
dados = dados.drop(*cols_to_dismiss)

In [9]:
def _hhmm_to_minutes(hhmm):
    """Convert an hhmm clock-time Column (e.g. 1435 for 14:35) to minutes after midnight."""
    return F.floor(hhmm / 100) * 60 + hhmm % 100


def _delay_minutes(actual, scheduled):
    """Return (actual - scheduled) in minutes for two hhmm clock-time Columns.

    Subtracting the raw hhmm values is not a duration: 1005 - 955 gives 50,
    while the real gap is 10 minutes. Both sides are therefore converted to
    minutes after midnight first.

    Neither column carries a date, so a gap of more than half a day is read as
    a midnight crossing and shifted by 24 hours (scheduled 2350, actual 0010
    -> +20). Delays longer than 12 hours cannot be told apart from that case
    with these columns alone.

    Null inputs yield a null result.
    """
    minutes_per_day = 24 * 60
    half_day = minutes_per_day // 2
    diff = _hhmm_to_minutes(actual) - _hhmm_to_minutes(scheduled)
    return (
        F.when(diff < -half_day, diff + minutes_per_day)
        .when(diff > half_day, diff - minutes_per_day)
        .otherwise(diff)
    )


# NOTE(review): assumes DepTime/CRSDepTime/ArrTime/CRSArrTime are hhmm local
# clock times - confirm against the dataset's documentation.
dados = dados.withColumn("DepDelay", _delay_minutes(F.col("DepTime"), F.col("CRSDepTime")))

dados = dados.withColumn("ArrDelay", _delay_minutes(F.col("ArrTime"), F.col("CRSArrTime")))
# Quick check:
# dados.select("ArrTime","CRSArrTime","ArrDelay","DepDelay","DepTime","CRSDepTime").show()

In [None]:
# Compare the row count with and without exact duplicate rows; nothing is
# removed here, the two numbers are only reported.
total_rows = dados.count()
distinct_rows = dados.dropDuplicates().count()
print(f'Dados: number of rows: {total_rows}, after dropduplicates: {distinct_rows}')

## 2. Handling missing values

In [None]:
# See which columns contain nulls (null count per column)
#{col: dados.filter(dados[col].isNull()).count() for col in dados.columns}


# Recorded result of the line above, run over the whole dataset before any null handling
"""
{'FlightDate': 0,
 'Airline': 0,
 'Origin': 0,
 'Dest': 0,
 'Cancelled': 0,
 'Diverted': 0,
 'CRSDepTime': 0,
 'DepTime': 761652,
 'DepDelayMinutes': 763084,
 'DepDelay': 763084,
 'ArrTime': 786177,
 'ArrDelayMinutes': 846183,
 'AirTime': 852561,
 'CRSElapsedTime': 22,
 'ActualElapsedTime': 845637,
 'Distance': 0,
 'DayOfWeek': 0,
 'Operated_or_Branded_Code_Share_Partners': 0,
 'DOT_ID_Marketing_Airline': 0,
 'IATA_Code_Marketing_Airline': 0,
 'DOT_ID_Operating_Airline': 0,
 'IATA_Code_Operating_Airline': 0,
 'Tail_Number': 267613,
 'Flight_Number_Operating_Airline': 0,
 'OriginWac': 0,
 'DestStateName': 0,
 'DestWac': 0,
 'DepDel15': 763084,
 'DepartureDelayGroups': 763084,
 'DepTimeBlk': 0,
 'TaxiOut': 780561,
 'WheelsOff': 780551,
 'WheelsOn': 793133,
 'TaxiIn': 793143,
 'CRSArrTime': 0,
 'ArrDelay': 846183,
 'ArrDel15': 846183,
 'ArrivalDelayGroups': 846183,
 'ArrTimeBlk': 0,
 'DistanceGroup': 0,
 'DivAirportLandings': 90,
 'OriginCityNameState': 0,
 'DestCityNameState': 0}"""


In [10]:
# Remove nulls: discard every row that has a null in any column (no threshold,
# all columns considered).
dados = dados.dropna(how='any', thresh=None, subset=None)

In [11]:
# Re-inspect after cleaning: schema, a preview of the first rows, and the
# remaining row count (last expression, so it is the cell's displayed value).
dados.printSchema()
dados.show(n=20, truncate=True)
dados.count()

root
 |-- FlightDate: date (nullable = true)
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Operated_or_Branded_Code_Share_Partners: string (nullable = true)
 |-- DOT_ID_Marketing_Airline: integer (nullable = true)
 |-- IATA_Code_Marketing_Airline: string (nullable = true)
 |-- DOT_ID_Operating_Airline: integer (nullable = true)
 |-- IATA_Code_Operating_Airline: string (nullable = true)
 |-- Tail_Number: string (nullable = true)
 |-- Flight_Number_Operating_Airline

28339510

-----
# Data transformation