In [0]:
# Notebook configuration.
# Accessing Kaggle datasets requires an API token: download 'kaggle.json' from
# your Kaggle account settings and set its path below.
kaggle_json_path = "kaggle.json"


# Human-readable dataset name and the Kaggle API endpoint that serves its archive.
dataset_name = "Flight Delay and Cancellation Dataset (2019-2023)"
dataset_url = "https://www.kaggle.com/api/v1/datasets/download/patrickzel/flight-delay-and-cancellation-dataset-2019-2023"
# In a regular Databricks environment this could be a workspace or other DFS path.
download_dir = "/tmp"

In [0]:
# Read the Kaggle username and API key from the JSON token file at kaggle_json_path.
import json

with open(kaggle_json_path, 'r') as token_file:
    kaggle_creds = json.loads(token_file.read())

# Both keys must be present in the token file; a missing one raises KeyError.
KAGGLE_USERNAME = kaggle_creds['username']
KAGGLE_KEY = kaggle_creds['key']

In [0]:
# Download the dataset archive through the Kaggle API and unpack it into download_dir.
import os
import zipfile

import requests

os.makedirs(download_dir, exist_ok=True)

zip_path = os.path.join(download_dir, dataset_name + ".zip")
download_chunk_bytes = 10 * 1024 * 1024  # stream the body to disk in 10 MiB pieces
# (connect, read) timeouts in seconds, so a stalled connection fails instead of
# blocking the notebook forever (requests applies no timeout by default).
request_timeout = (10, 300)

# Context managers release the session and the streamed connection even when the
# download fails part-way.
with requests.Session() as session:
    session.auth = (KAGGLE_USERNAME, KAGGLE_KEY)
    with session.get(dataset_url, stream=True, timeout=request_timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download file, status code: {response.status_code}")

        with open(zip_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=download_chunk_bytes):
                if chunk:  # skip empty chunks
                    f.write(chunk)

print(f"File downloaded to {zip_path}")

# Unpack every archive member next to the downloaded zip file.
with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(download_dir)

print(f"Files extracted to {download_dir}")

In [0]:
# Load the flights CSV from the download directory into a pandas DataFrame.
# Imports come first so the cell does not rely on `os` leaking in from an earlier cell.
import os

import pandas as pd

flights_name = "flights_sample_3m.csv"
flights_path = os.path.join(download_dir, flights_name)

df_flights = pd.read_csv(flights_path)
print("Shape of the df: ", df_flights.shape)

In [0]:
# df_flights is a pandas DataFrame, which has no .display() method; use the
# Databricks notebook builtin display() to render it as a table.
display(df_flights)

In [0]:
# Target table name. It is written unqualified here; a Unity Catalog catalog and
# schema prefix (e.g. "main.bronze") could be added in front of it.
flights_permanent_name = "flights_permanent"

# Remove any previous version of the table before rewriting it.
spark.sql(f"DROP TABLE IF EXISTS {flights_permanent_name}")

# Convert the pandas frame to a Spark DataFrame and persist it as a Delta table.
spark_df_flights = spark.createDataFrame(df_flights)
delta_writer = spark_df_flights.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(flights_permanent_name)

In [0]:
%sql
-- Preview ten rows of the raw flights table.
SELECT *
FROM flights_permanent
LIMIT 10

In [0]:
%sql
-- Cleaned flights table: typed flight date, core attributes, scheduled/actual
-- departure and arrival timestamps built from the HHMM columns, and 0/1 flags.
CREATE OR REPLACE TABLE flights_clean AS
WITH t AS (
  SELECT
    *,
    -- Normalise HHMM values to four characters, e.g. 5, 945, 1530 -> 0005, 0945, 1530.
    -- The value is cast to INT before the string cast: if a column was loaded as
    -- DOUBLE (pandas infers float for numeric columns with missing values), a direct
    -- string cast yields '945.0', LPAD shortens that to '945.', and the timestamp
    -- parse below then silently returns NULL for every time earlier than 10:00.
    -- TRY_CAST turns values that cannot be converted into NULL instead of failing.
    LPAD(CAST(TRY_CAST(CRS_DEP_TIME AS INT) AS STRING), 4, '0') AS crs_dep_str,
    LPAD(CAST(TRY_CAST(DEP_TIME     AS INT) AS STRING), 4, '0') AS dep_str,
    LPAD(CAST(TRY_CAST(CRS_ARR_TIME AS INT) AS STRING), 4, '0') AS crs_arr_str,
    LPAD(CAST(TRY_CAST(ARR_TIME     AS INT) AS STRING), 4, '0') AS arr_str
  FROM flights_permanent
)
SELECT
  -- flight date
  TO_DATE(FL_DATE, 'yyyy-MM-dd') AS fl_date,

  -- basic attributes
  AIRLINE_CODE,
  AIRLINE,
  ORIGIN,
  ORIGIN_CITY,
  DEST,
  DEST_CITY,
  DEP_DELAY,
  ARR_DELAY,
  CANCELLED,
  CANCELLATION_CODE,

  -- timestamps (unparseable values become NULL thanks to try_to_timestamp)
  -- NOTE(review): every timestamp is built on FL_DATE, so an arrival after
  -- midnight is not shifted to the following day - confirm this is acceptable.
  try_to_timestamp(
    CONCAT(FL_DATE, ' ',
           SUBSTRING(crs_dep_str, 1, 2), ':',
           SUBSTRING(crs_dep_str, 3, 2)),
    'yyyy-MM-dd HH:mm'
  ) AS crs_dep_ts,

  try_to_timestamp(
    CONCAT(FL_DATE, ' ',
           SUBSTRING(dep_str, 1, 2), ':',
           SUBSTRING(dep_str, 3, 2)),
    'yyyy-MM-dd HH:mm'
  ) AS dep_ts,

  try_to_timestamp(
    CONCAT(FL_DATE, ' ',
           SUBSTRING(crs_arr_str, 1, 2), ':',
           SUBSTRING(crs_arr_str, 3, 2)),
    'yyyy-MM-dd HH:mm'
  ) AS crs_arr_ts,

  try_to_timestamp(
    CONCAT(FL_DATE, ' ',
           SUBSTRING(arr_str, 1, 2), ':',
           SUBSTRING(arr_str, 3, 2)),
    'yyyy-MM-dd HH:mm'
  ) AS arr_ts,

  -- simple 0/1 flags
  CASE WHEN DEP_DELAY > 15 THEN 1 ELSE 0 END AS is_dep_delayed_15m,
  CASE WHEN ARR_DELAY > 15 THEN 1 ELSE 0 END AS is_arr_delayed_15m,
  CASE WHEN CANCELLED = 1 THEN 1 ELSE 0 END AS is_cancelled

FROM t;


In [0]:
%sql
-- Silver layer: flights enriched with airport names and coordinates, parsed
-- departure/arrival timestamps, and 0/1 delay and cancellation flags.
-- NOTE(review): airports_data is not created in the visible cells of this
-- notebook; it must already exist when this cell runs.
CREATE OR REPLACE TABLE silver_flights AS
WITH formatted_strings AS (
  SELECT
    *,
    -- Standardise times to a 4-character HHMM string (e.g. '5' -> '0005').
    -- The value is cast to INT before the string cast: if a column was loaded as
    -- DOUBLE (pandas infers float for numeric columns with missing values), a direct
    -- string cast yields '945.0', LPAD shortens that to '945.', and the timestamp
    -- parse below then silently returns NULL for every time earlier than 10:00.
    -- TRY_CAST turns values that cannot be converted into NULL instead of failing.
    LPAD(CAST(TRY_CAST(CRS_DEP_TIME AS INT) AS STRING), 4, '0') AS crs_dep_str,
    LPAD(CAST(TRY_CAST(DEP_TIME     AS INT) AS STRING), 4, '0') AS dep_str,
    LPAD(CAST(TRY_CAST(CRS_ARR_TIME AS INT) AS STRING), 4, '0') AS crs_arr_str,
    LPAD(CAST(TRY_CAST(ARR_TIME     AS INT) AS STRING), 4, '0') AS arr_str
  FROM flights_permanent
)
SELECT
  -- 1. Keys and dates
  f.FL_DATE,
  TO_DATE(f.FL_DATE, 'yyyy-MM-dd') AS fl_date_iso,
  
  -- 2. Airlines
  f.AIRLINE_CODE,
  f.AIRLINE AS airline_name, 

  -- 3. Departure airport (ORIGIN) - joined with airports_data
  f.ORIGIN AS origin_code,
  -- fall back to the city name when the airport is not found in airports_data
  COALESCE(ap_org.name, f.ORIGIN_CITY) AS origin_airport_name,
  f.ORIGIN_CITY AS origin_city,
  ap_org.lat AS origin_lat,
  ap_org.lon AS origin_lon,

  -- 4. Arrival airport (DEST) - joined with airports_data
  f.DEST AS dest_code,
  COALESCE(ap_dst.name, f.DEST_CITY) AS dest_airport_name,
  f.DEST_CITY AS dest_city,
  ap_dst.lat AS dest_lat,
  ap_dst.lon AS dest_lon,

  -- 5. Timestamps (string -> timestamp; unparseable values become NULL)
  -- NOTE(review): every timestamp is built on FL_DATE, so an arrival after
  -- midnight is not shifted to the following day - confirm this is acceptable.
  try_to_timestamp(CONCAT(f.FL_DATE, ' ', SUBSTRING(f.crs_dep_str, 1, 2), ':', SUBSTRING(f.crs_dep_str, 3, 2)), 'yyyy-MM-dd HH:mm') AS crs_dep_ts,
  try_to_timestamp(CONCAT(f.FL_DATE, ' ', SUBSTRING(f.dep_str, 1, 2), ':', SUBSTRING(f.dep_str, 3, 2)), 'yyyy-MM-dd HH:mm') AS dep_ts,
  try_to_timestamp(CONCAT(f.FL_DATE, ' ', SUBSTRING(f.crs_arr_str, 1, 2), ':', SUBSTRING(f.crs_arr_str, 3, 2)), 'yyyy-MM-dd HH:mm') AS crs_arr_ts,
  try_to_timestamp(CONCAT(f.FL_DATE, ' ', SUBSTRING(f.arr_str, 1, 2), ':', SUBSTRING(f.arr_str, 3, 2)), 'yyyy-MM-dd HH:mm') AS arr_ts,

  -- 6. Metrics and flags (0/1)
  f.DEP_DELAY,
  f.ARR_DELAY,
  f.CANCELLED,
  f.CANCELLATION_CODE,
  CASE WHEN f.DEP_DELAY > 15 THEN 1 ELSE 0 END AS is_dep_delayed_15m,
  CASE WHEN f.ARR_DELAY > 15 THEN 1 ELSE 0 END AS is_arr_delayed_15m,
  CASE WHEN f.CANCELLED = 1 THEN 1 ELSE 0 END AS is_cancelled

FROM formatted_strings f
-- Join on the IATA code (key to getting correct airport names).
-- NOTE(review): assumes airports_data.iata is unique; duplicate codes would
-- multiply flight rows - TODO confirm.
LEFT JOIN airports_data ap_org ON f.ORIGIN = ap_org.iata
LEFT JOIN airports_data ap_dst ON f.DEST = ap_dst.iata;

In [0]:
%sql
-- Gold layer: per-origin-airport flight volume, departure delays and cancellations.
CREATE OR REPLACE TABLE gold_airport_stats AS
SELECT
  origin_code                       AS airport_code,
  origin_airport_name,
  origin_city,
  origin_lat,   -- coordinates are required for the map in Power BI
  origin_lon,

  COUNT(*)                          AS total_flights,
  SUM(is_dep_delayed_15m)           AS delayed_count,
  SUM(is_cancelled)                 AS cancelled_count,

  -- "Problem rate" (delays + cancellations), used to sort the ranking
  ROUND((SUM(is_dep_delayed_15m) + SUM(is_cancelled)) / COUNT(*), 4) AS problem_rate,

  -- Delay and cancellation rates (means of the 0/1 flags, 4 decimals)
  ROUND(AVG(is_dep_delayed_15m), 4) AS delay_pct,
  ROUND(AVG(is_cancelled), 4)       AS cancel_pct

FROM silver_flights
GROUP BY
  origin_code,
  origin_airport_name,
  origin_city,
  origin_lat,
  origin_lon;

In [0]:
%sql
-- Gold layer: per-airline flight volume, average arrival delay and
-- arrival-delay / cancellation rates.
CREATE OR REPLACE TABLE gold_airline_stats AS
SELECT
  AIRLINE_CODE,
  airline_name,

  COUNT(*)                          AS total_flights,
  AVG(ARR_DELAY)                    AS avg_arr_delay_min,

  -- For airlines the arrival delay matters more (passenger experience)
  ROUND(AVG(is_arr_delayed_15m), 4) AS arrival_delay_pct,
  ROUND(AVG(is_cancelled), 4)       AS cancel_pct

FROM silver_flights
GROUP BY
  AIRLINE_CODE,
  airline_name;

In [0]:
%sql
-- Gold layer: departure-delay rate and a risk label per origin/destination route.
CREATE OR REPLACE TABLE gold_route_risk AS
WITH route_agg AS (
  -- Aggregate once per route so the delay rate is computed in a single place.
  SELECT
    origin_code,
    origin_city,
    dest_code,
    dest_city,
    COUNT(*)                AS total_flights,
    AVG(is_dep_delayed_15m) AS delay_rate
  FROM silver_flights
  GROUP BY
    origin_code,
    origin_city,
    dest_code,
    dest_city
  -- Filter out routes with a negligible number of flights
  HAVING COUNT(*) > 10
)
SELECT
  origin_code,
  origin_city,
  dest_code,
  dest_city,

  total_flights,
  ROUND(delay_rate, 4) AS delay_pct,

  -- Business logic of the risk model (persisted in the table)
  CASE
    WHEN delay_rate >= 0.30 THEN 'High Risk'
    WHEN delay_rate >= 0.15 THEN 'Medium Risk'
    ELSE 'Low Risk'
  END AS risk_category,

  -- Ready-made label for charts (e.g. "JFK -> LHR")
  CONCAT(origin_code, ' -> ', dest_code) AS route_label

FROM route_agg;