<a href="https://colab.research.google.com/github/TylerWichman/mgmt467-analytics-portfolio/blob/main/Assignment%202/individual/Unit2_Tyler_BQML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install and Authenticate

In [33]:
!pip install --quiet google-cloud-bigquery bigquery-magics

from google.colab import auth
auth.authenticate_user()
print("‚úÖ Authenticated")

‚úÖ Authenticated


Set Project

In [34]:
PROJECT_ID = "mgmt-labs-unit-two"

from google.cloud import bigquery
client = bigquery.Client(project=PROJECT_ID)
print("‚úÖ Project set: ", PROJECT_ID)

‚úÖ Project set:  mgmt-labs-unit-two


Create schema to store model

In [35]:
%%bigquery --project $PROJECT_ID

CREATE SCHEMA IF NOT EXISTS `modelsAssignmentTwo`
OPTIONS(location="US");

Query is running:   0%|          |

Build table to use for modeling

In [36]:
%%bigquery --project $PROJECT_ID

CREATE OR REPLACE TABLE `mgmt-labs-unit-two.modelsAssignmentTwo.base` AS
WITH temp AS (
  SELECT
    IF(SAFE_CAST(DivAirportLandings AS INT64) > 0, 1, 0) AS diverted,

    SAFE_CAST(Reporting_Airline AS STRING) AS carrier,
    CONCAT(CAST(Origin AS STRING), '-', CAST(Dest AS STRING)) AS route,
    SAFE_CAST(Distance AS FLOAT64) AS distance,
    EXTRACT(DAYOFWEEK FROM FlightDate) AS day_of_week,
    EXTRACT(MONTH FROM FlightDate) AS month,

    SAFE_CAST(DepDelay AS FLOAT64) AS dep_delay_raw,

    CASE
      WHEN DepTime IS NULL THEN NULL
      ELSE CAST(SUBSTR(LPAD(CAST(DepTime AS STRING), 4, '0'), 1, 2) AS INT64)
    END AS hour_of_day

  FROM `mgmt-labs-unit-two.flights_data.flights_raw`
  WHERE Origin IS NOT NULL AND Dest IS NOT NULL
)

SELECT
  *,
  CASE
    WHEN dep_delay_raw IS NULL THEN 'unknown'
    WHEN dep_delay_raw <= -5 THEN 'early'
    WHEN dep_delay_raw <= 5 THEN 'on_time'
    WHEN dep_delay_raw <= 20 THEN 'minor'
    WHEN dep_delay_raw <= 60 THEN 'moderate'
    ELSE 'major'
  END AS dep_delay_bucket
FROM temp;


Query is running:   0%|          |

Model A ‚Äî Pre-departure Logistic Regression

In [37]:
%%bigquery --project $PROJECT_ID
CREATE OR REPLACE MODEL `mgmt-labs-unit-two.modelsAssignmentTwo.model_a_global`
OPTIONS(
  MODEL_TYPE='logistic_reg',
  INPUT_LABEL_COLS=['diverted'],
  DATA_SPLIT_METHOD='AUTO_SPLIT'
) AS
SELECT diverted, carrier, route, distance, day_of_week, month
FROM `mgmt-labs-unit-two.modelsAssignmentTwo.base`;

Query is running:   0%|          |

Evaluate Model A

In [38]:
%%bigquery --project $PROJECT_ID
SELECT * FROM ML.EVALUATE(MODEL `mgmt-labs-unit-two.modelsAssignmentTwo.model_a_global`);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,0.0,0.0,0.991698,0.0,0.047804,0.572394


Model A - Confusion Matrix

In [39]:
%%bigquery --project $PROJECT_ID

WITH pred AS (
  SELECT
    diverted AS actual,
    predicted_diverted_probs[OFFSET(1)].prob AS prob
  FROM ML.PREDICT(
      MODEL `mgmt-labs-unit-two.modelsAssignmentTwo.model_a_global`,
      (SELECT * FROM `mgmt-labs-unit-two.modelsAssignmentTwo.base`)
  )
)
SELECT
  SUM(CASE WHEN prob >= 0.5 AND actual = 1 THEN 1 ELSE 0 END) AS TP,
  SUM(CASE WHEN prob >= 0.5 AND actual = 0 THEN 1 ELSE 0 END) AS FP,
  SUM(CASE WHEN prob <  0.5 AND actual = 1 THEN 1 ELSE 0 END) AS FN,
  SUM(CASE WHEN prob <  0.5 AND actual = 0 THEN 1 ELSE 0 END) AS TN
FROM pred;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,TP,FP,FN,TN
0,11748,1646511,0,0


In [40]:
%%bigquery --project $PROJECT_ID

WITH pred AS (
  SELECT
    diverted AS actual,
    predicted_diverted_probs[OFFSET(1)].prob AS prob
  FROM ML.PREDICT(
      MODEL `mgmt-labs-unit-two.modelsAssignmentTwo.model_a_global`,
      (SELECT * FROM `mgmt-labs-unit-two.modelsAssignmentTwo.base`)
  )
),
binned AS (
  SELECT
    NTILE(10) OVER (ORDER BY prob) AS bucket,
    prob,
    actual
  FROM pred
)
SELECT
  bucket,
  ROUND(AVG(prob), 4) AS avg_predicted_probability,
  ROUND(AVG(actual), 4) AS observed_diversion_rate,
  COUNT(*) AS flights_in_bucket
FROM binned
GROUP BY bucket
ORDER BY bucket;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,bucket,avg_predicted_probability,observed_diversion_rate,flights_in_bucket
0,1,0.9862,0.019,165826
1,2,0.9899,0.0117,165826
2,3,0.9907,0.0095,165826
3,4,0.9913,0.0081,165826
4,5,0.9918,0.007,165826
5,6,0.9922,0.0056,165826
6,7,0.9926,0.0044,165826
7,8,0.993,0.0032,165826
8,9,0.9935,0.0017,165826
9,10,0.9941,0.0006,165825


Calibration: Calibration across probability deciles shows that predictions decrease smoothly from ~0.019% to ~0.0006% diversion likelihood, but the observed diversion rate stays extremely low in every bucket. The model effectively predicts ‚Äúno diversion‚Äù for all flights, resulting in zero true positives and more than 11,000 missed diversion events. This indicates that schedule-only features lack the predictive signal needed to detect diversion risk.

Hand Off: Model A establishes a clean baseline using only pre-departure manifest fields (carrier, route, distance, day of week, month). While interpretable and stable, it fails to identify any true diversions because diversions are driven by operational disruptions (e.g., delays, congestion, weather) that happen after scheduling. The next model (Model B) introduces real-time departure delay buckets to quantify uplift and reduce false negatives, demonstrating the value of operational features over schedule-only information.

In [None]:
%%bigquery --project $PROJECT_ID
SELECT
  column_name,
  data_type
FROM `mgmt-labs-unit-two.flights_data.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'flights_raw';

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,column_name,data_type
0,Year,INT64
1,Quarter,INT64
2,Month,INT64
3,DayofMonth,INT64
4,DayOfWeek,INT64
...,...,...
106,Div5LongestGTime,STRING
107,Div5WheelsOff,STRING
108,Div5TailNum,STRING
109,string_field_109,STRING


In [41]:
from google.colab import drive
drive.mount('/content/drive')

import nbformat
import uuid

# üîß UPDATE THIS to match the path of the notebook you're working on
path = "/content/drive/MyDrive/Colab Notebooks/Unit2_Tyler_BQML.ipynb"

# Load the notebook
nb = nbformat.read(path, as_version=nbformat.NO_CONVERT)

# Fix invalid IDs & remove widget metadata
for cell in nb.cells:
    # generate valid cell IDs
    if not cell.get("id") or not cell["id"].replace("-", "").replace("_", "").isalnum():
        cell["id"] = uuid.uuid4().hex

    # clean widget metadata (prevents "state missing" errors)
    if "metadata" in cell and "widgets" in cell["metadata"]:
        cell["metadata"].pop("widgets", None)

# Save cleaned notebook
nbformat.write(nb, path)
print("‚úÖ Notebook cleaned and fixed for GitHub upload:", path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Notebook cleaned and fixed for GitHub upload: /content/drive/MyDrive/Colab Notebooks/Unit2_Tyler_BQML.ipynb
