# Data Preprocessing #
In the next section, we will perform data cleaning techniques, such as removing rows with missing values, removing duplicates and removing extreme outliers.

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Data Preprocessing + Delay Pipeline")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

#  Google Storage File Path
delay_path = 'gs://de_as2_data/flights_delaydata.csv' 
time_path = 'gs://de_as2_data/flights_timedata.csv' 
info_path = 'gs://de_as2_data/flights_infodata.csv'
# Create data frame
df_delay = spark.read.format("csv") \
       .option("header", "true") \
       .option("inferSchema", "true") \
       .load(delay_path)
df_delay.printSchema()

df_time = spark.read.format("csv") \
       .option("header", "true") \
       .option("inferSchema", "true") \
       .load(time_path)
df_time.printSchema()

df_info = spark.read.format("csv") \
       .option("header", "true") \
       .option("inferSchema", "true") \
       .load(info_path)
df_info.printSchema()

# Inferschema makes all data types adjust dynamically
# APPOINT WORKER TO EACH PIPELINE

root
 |-- id: integer (nullable = true)
 |-- dep_time: integer (nullable = true)
 |-- sched_dep_time: integer (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: integer (nullable = true)
 |-- sched_arr_time: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- time_hour: string (nullable = true)

root
 |-- id: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- distance: integer (nullable = true)
 |-- name: string (nullable = true)



In [2]:
# Show original data count
print("Original row count flights_delaydata:", df_delay.count())
print("Original row count flights_timedata:", df_time.count())
print("Original row count flights_infodata:", df_info.count())

Original row count flights_delaydata: 336776
Original row count flights_timedata: 336776
Original row count flights_infodata: 336776


We drop rows with missing values to ensure consistency.

In [3]:
# Remove rows with any missing values
df_delay = df_delay.dropna()
df_time = df_time.dropna()
df_info = df_info.dropna()

Next, we remove rows containing duplicate information.

In [4]:
# Remove rows containing duplicate information
df_delay = df_delay.dropDuplicates()
df_time = df_time.dropDuplicates()
df_info = df_info.dropDuplicates()

Lastly, we remove extreme outliers using the Interquartile Range (IQR) statistical method, as it is robust at detecting outliers. This is only neccessary for the attribute 'arr_delay' in flights_delaydata.csv

In [5]:
from pyspark.sql.functions import col

# Remove extreme outliers in arr_delay column in delaydata
column = "arr_delay"
q1, q3 = df_delay.approxQuantile(column, [0.25, 0.75], 0.01)
iqr = q3 - q1

# Calculate lower- and upper bound respectively
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

df_delay = df_delay.filter((col(column) >= lower_bound) & (col(column) <= upper_bound))

In [6]:
# Show cleaned data count
print("Cleaned row count flights_delaydata:", df_delay.count())
print("Cleaned row count flights_timedata:", df_time.count())
print("Cleaned row count flights_infodata:", df_info.count())

Cleaned row count flights_delaydata: 299466
Cleaned row count flights_timedata: 327346
Cleaned row count flights_infodata: 334264


# New Variable Creation: delay_bin #

Create new categorical variable 'arr_delay_category', where if arr_delay is < 0 then 0 (early), if arr_delay = 0 (on time) then 1 and if arr_delay > 0 (late) then 2

In [7]:
from pyspark.sql.functions import col, when

# Create a new column 'delay_bin' with bins based on arr_delay
df_delay_new = df_delay.withColumn(
    "delay_bin",
    when(col("arr_delay") < 0, "Early")
    .when((col("arr_delay") >= 0) & (col("arr_delay") <= 15), "On Time")
    .when((col("arr_delay") > 15) & (col("arr_delay") <= 60), "Moderate Delay")
    .otherwise("Severe Delay")  # arr_delay > 60
)

# Show the updated DataFrame
df_delay_new.select("arr_delay", "delay_bin").show(50)

+---------+--------------+
|arr_delay|     delay_bin|
+---------+--------------+
|      -28|         Early|
|      -13|         Early|
|      -11|         Early|
|       -6|         Early|
|       43|Moderate Delay|
|      -10|         Early|
|      -15|         Early|
|        3|       On Time|
|      -10|         Early|
|       -4|         Early|
|       22|Moderate Delay|
|      -20|         Early|
|      -53|         Early|
|      -17|         Early|
|      -10|         Early|
|       -6|         Early|
|      -30|         Early|
|       27|Moderate Delay|
|       31|Moderate Delay|
|      -17|         Early|
|       12|       On Time|
|       11|       On Time|
|      -24|         Early|
|       47|Moderate Delay|
|        1|       On Time|
|       21|Moderate Delay|
|        6|       On Time|
|      -14|         Early|
|      -22|         Early|
|      -32|         Early|
|       -2|         Early|
|      -36|         Early|
|      -16|         Early|
|        8|       On Time|
|

# Join Dataframes #

In [9]:
# Assuming df1, df2, and df3 are your three DataFrames
df1 = df_delay_new
df2 = df_info
df3 = df_time

joined_df_delay = df1.join(df2, on="id", how="inner") \
                    .join(df3, on="id", how="inner")



In [10]:
joined_df_delay.show(10)

+---+--------+--------------+---------+--------+--------------+---------+--------------+-------+------+-------+------+----+--------+--------------------+----+-----+---+--------+----+------+----------------+
| id|dep_time|sched_dep_time|dep_delay|arr_time|sched_arr_time|arr_delay|     delay_bin|carrier|flight|tailnum|origin|dest|distance|                name|year|month|day|air_time|hour|minute|       time_hour|
+---+--------+--------------+---------+--------+--------------+---------+--------------+-------+------+-------+------+----+--------+--------------------+----+-----+---+--------+----+------+----------------+
|  1|     533|           529|        4|     850|           830|       20|Moderate Delay|     UA|  1714| N24211|   LGA| IAH|    1416|United Air Lines ...|2013|    1|  1|     227|   5|    29|01/01/2013 05:00|
|  3|     544|           545|       -1|    1004|          1022|      -18|         Early|     B6|   725| N804JB|   JFK| BQN|    1576|     JetBlue Airways|2013|    1|  1|    

# Upload new Dataframe to bucket #

In [11]:
joined_df_delay.write.csv("gs://de_as2_data/joined_flights_delaydata_final", header=True)

AnalysisException: [PATH_ALREADY_EXISTS] Path gs://de_as2_data/joined_flights_delaydata_final already exists. Set mode as "overwrite" to overwrite the existing path.

# Upload to BigQuery #

In [12]:
import sys
!{sys.executable} -m pip install --upgrade google-cloud-bigquery

Collecting google-cloud-bigquery
  Downloading google_cloud_bigquery-3.27.0-py2.py3-none-any.whl.metadata (8.6 kB)
Collecting google-api-core<3.0.0dev,>=2.11.1 (from google-api-core[grpc]<3.0.0dev,>=2.11.1->google-cloud-bigquery)
  Downloading google_api_core-2.23.0-py3-none-any.whl.metadata (3.0 kB)
Collecting google-auth<3.0.0dev,>=2.14.1 (from google-cloud-bigquery)
  Downloading google_auth-2.36.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-cloud-core<3.0.0dev,>=2.4.1 (from google-cloud-bigquery)
  Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-resumable-media<3.0dev,>=2.0.0 (from google-cloud-bigquery)
  Downloading google_resumable_media-2.7.2-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core<3.0.0dev,>=2.11.1->google-api-core[grpc]<3.0.0dev,>=2.11.1->google-cloud-bigquery)
  Downloading googleapis_common_protos-1.66.0-py2.py3-none-any.whl.metadata (1.5 kB)


In [13]:
from google.cloud import bigquery

# upload to BigQuery
client = bigquery.Client(project="chromatic-pride-435508-i5")   

dataset_id = "chromatic-pride-435508-i5.flights"
table_id = f"{dataset_id}.Delay_Pipeline"

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1 # ignore the header
job_config.autodetect = True


gcs_path = "gs://de_as2_data/joined_flights_delaydata_final/*.csv"  # *.csv combines all files to result in a whole dataframe

load_job = client.load_table_from_uri(gcs_path, table_id, job_config=job_config)

# Wait for the job to complete
load_job.result()
print("Data successfully uploaded to BigQuery.")

Data successfully uploaded to BigQuery.


### Stop spark session ###

In [15]:
spark.stop()