In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
original_data = pd.read_csv("Airline_Delay_Cause 3.csv")

In [3]:
dataset_mlr = original_data.copy()
df_clean = dataset_mlr.dropna()

# Step 1: Calculate derived metrics
df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
df_clean['cancel_ratio'] = df_clean['arr_cancelled'] / df_clean['arr_flights']  # Cancellation rate
df_clean['divert_ratio'] = df_clean['arr_diverted'] / df_clean['arr_flights']  # Diversion rate

# Step 2: Define reliability levels
def classify_reliability(row):
    if row['delay_ratio'] > 0.5 or row['avg_delay'] > 30 or row['cancel_ratio'] > 0.1 or row['divert_ratio'] > 0.05:
        return 'Low'
    elif 0.2 <= row['delay_ratio'] <= 0.5 or 10 <= row['avg_delay'] <= 30:
        return 'Medium'
    else:
        return 'High'

df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)

# Step 3: Encode ordinal classes
reliability_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)

# Step 4: Print class distribution
print(df_clean['reliability'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


reliability
Medium    78261
High      77759
Low       15203
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)


In [4]:
# Calculate derived metrics
df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
df_clean['cancel_ratio'] = df_clean['arr_cancelled'] / df_clean['arr_flights']  # Cancellation rate
df_clean['divert_ratio'] = df_clean['arr_diverted'] / df_clean['arr_flights']  # Diversion rate

# Calculate delay contributions
total_delay = df_clean[['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']].sum(axis=1)
df_clean['carrier_delay_ratio'] = df_clean['carrier_ct'] / total_delay
df_clean['weather_delay_ratio'] = df_clean['weather_ct'] / total_delay
df_clean['nas_delay_ratio'] = df_clean['nas_ct'] / total_delay
df_clean['security_delay_ratio'] = df_clean['security_ct'] / total_delay
df_clean['late_aircraft_delay_ratio'] = df_clean['late_aircraft_ct'] / total_delay

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# Step 3: Create a composite reliability score
# Assign weights to each factor (adjust weights based on domain knowledge)
weights = {
    'delay_ratio': 0.75,
    'avg_delay': 0.2,
    'cancel_ratio': 0.25,
    'divert_ratio': 0.1,
    'carrier_delay_ratio': 0.50,
    'weather_delay_ratio': 0.30,
    'nas_delay_ratio': 0.05,
    'security_delay_ratio': 0.1,
    'late_aircraft_delay_ratio': 0.1
}

df_clean['reliability_score'] = (
    weights['delay_ratio'] * df_clean['delay_ratio'] +
    weights['avg_delay'] * (df_clean['avg_delay'] / df_clean['avg_delay'].max()) +  # Normalize avg_delay
    weights['cancel_ratio'] * df_clean['cancel_ratio'] +
    weights['divert_ratio'] * df_clean['divert_ratio'] +
    weights['carrier_delay_ratio'] * df_clean['carrier_delay_ratio'] +
    weights['weather_delay_ratio'] * df_clean['weather_delay_ratio'] +
    weights['nas_delay_ratio'] * df_clean['nas_delay_ratio'] +
    weights['security_delay_ratio'] * df_clean['security_delay_ratio'] +
    weights['late_aircraft_delay_ratio'] * df_clean['late_aircraft_delay_ratio']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_score'] = (


In [6]:
# Step 4: Define reliability levels based on the composite score
def classify_reliability(row):
    if row['reliability_score'] > 0.4:  # Adjust thresholds as needed
        return 'Low'
    elif 0.2 <= row['reliability_score'] <= 0.4:
        return 'Medium'
    else:
        return 'High'

df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)

# Step 5: Encode ordinal classes
reliability_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)

# Step 6: Store data for dashboard
# Include airport, month, airline, and reliability score
dashboard_data = df_clean.copy()
dashboard_data.to_csv('dashboard_data_final.csv', index=False)

# Step 7: Print class distribution
print(df_clean['reliability'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)


reliability
Medium    81043
Low       78717
High      11463
Name: count, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import pandas as pd

# Step 1: Define target variable (e.g., reliability_ordinal)
y = df_clean['reliability_ordinal']

# Step 2: Select independent variables (include derived metrics and categorical variables)
X = df_clean[[
    'month', 'carrier_name', 'airport',  # Categorical variables
    'delay_ratio', 'avg_delay', 'cancel_ratio', 'divert_ratio',  # Derived metrics
    'carrier_delay_ratio', 'weather_delay_ratio', 'nas_delay_ratio',
    'security_delay_ratio', 'late_aircraft_delay_ratio'  # Delay contributions
]]

# Step 3: Preprocess categorical variables (e.g., one-hot encoding)
X = pd.get_dummies(X, columns=['month', 'carrier_name', 'airport'], drop_first=True)

# Step 4: Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.960227770477442

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.96     15819
           1       0.95      0.97      0.96     16111
           2       1.00      0.90      0.95      2315

    accuracy                           0.96     34245
   macro avg       0.97      0.94      0.96     34245
weighted avg       0.96      0.96      0.96     34245



In [24]:
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, Markdown
import calendar

# STEP 1: Clean and prepare month column
df_clean = df_clean.copy()

# Safely convert month numbers to names only if not already named
if df_clean["month"].dtype != "object" or df_clean["month"].str[0].str.isnumeric().all():
    df_clean["month"] = df_clean["month"].astype(int).map(lambda x: calendar.month_name[x])

# STEP 2: Set up label mapping for prediction output
ordinal_map = {0: "Low", 1: "Medium", 2: "High"}

# STEP 3: Create dropdowns
airport_dd = widgets.Dropdown(
    options=sorted(df_clean["airport"].unique()),
    description="Airport:"
)

carrier_dd = widgets.Dropdown(
    options=sorted(df_clean["carrier_name"].unique()),
    description="Carrier:"
)

month_dd = widgets.Dropdown(
    options=sorted(
        df_clean["month"].unique(),
        key=lambda x: list(calendar.month_name).index(x) if x in calendar.month_name else 0
    ),
    description="Month:"
)

output = widgets.Output()

# STEP 4: Define prediction function
def predict_reliability_from_model(*args):
    output.clear_output()
    with output:
        # Try to get actual row
        real_row = df_clean[
            (df_clean["airport"] == airport_dd.value) &
            (df_clean["carrier_name"] == carrier_dd.value) &
            (df_clean["month"] == month_dd.value)
        ]

        if not real_row.empty:
            row = real_row.iloc[0:1].copy()
            display(Markdown(f"ℹ️ Using actual data for selected combination."))
        else:
            # Build input row using mean metrics
            base_numeric = df_clean[[
                'delay_ratio', 'avg_delay', 'cancel_ratio', 'divert_ratio',
                'carrier_delay_ratio', 'weather_delay_ratio', 'nas_delay_ratio',
                'security_delay_ratio', 'late_aircraft_delay_ratio'
            ]].mean()
            row = pd.DataFrame([base_numeric])
            row["month"] = month_dd.value
            row["carrier_name"] = carrier_dd.value
            row["airport"] = airport_dd.value
            display(Markdown("⚠️ No exact row found — using averaged numerical data."))

        # One-hot encode input
        row_encoded = pd.get_dummies(row, columns=["month", "carrier_name", "airport"], drop_first=True)

        # Add missing columns
        missing_cols = list(set(X.columns) - set(row_encoded.columns))
        missing_df = pd.DataFrame(0, index=row_encoded.index, columns=missing_cols)
        row_encoded = pd.concat([row_encoded, missing_df], axis=1)
        row_encoded = row_encoded[X.columns]

        # Impute and predict
        row_encoded = pd.DataFrame(imputer.transform(row_encoded), columns=X.columns)
        pred_class = model.predict(row_encoded)[0]
        pred_label = ordinal_map[pred_class]

        # Predict confidence
        proba = model.predict_proba(row_encoded)[0]
        proba_fmt = f"""
- **Low:** {proba[2]:.2%}
- **Medium:** {proba[1]:.2%}
- **High:** {proba[0]:.2%}
        """

        # Display results
        display(Markdown(f"### ✈️ Prediction using Random Forest"))
        display(Markdown(f"**Predicted Reliability Level:** `{pred_label}`"))
        display(Markdown("### 🔍 Confidence Levels:"))
        display(Markdown(proba_fmt))

# STEP 5: Attach interaction and show dashboard
for dropdown in [airport_dd, carrier_dd, month_dd]:
    dropdown.observe(predict_reliability_from_model, names='value')

display(widgets.VBox([airport_dd, carrier_dd, month_dd]), output)
predict_reliability_from_model()

VBox(children=(Dropdown(description='Airport:', options=('ABE', 'ABI', 'ABQ', 'ABR', 'ABY', 'ACK', 'ACT', 'ACV…

Output()

In [25]:
!pip install voila

Collecting voila
  Downloading voila-0.5.8-py3-none-any.whl.metadata (9.5 kB)
Collecting jupyter-client<9,>=7.4.4 (from voila)
  Downloading jupyter_client-8.6.3-py3-none-any.whl.metadata (8.3 kB)
Collecting jupyter-server<3,>=1.18 (from voila)
  Downloading jupyter_server-2.15.0-py3-none-any.whl.metadata (8.4 kB)
Collecting jupyterlab-server<3,>=2.3.0 (from voila)
  Downloading jupyterlab_server-2.27.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jupyter-events>=0.11.0 (from jupyter-server<3,>=1.18->voila)
  Downloading jupyter_events-0.12.0-py3-none-any.whl.metadata (5.8 kB)
Collecting jupyter-server-terminals>=0.4.4 (from jupyter-server<3,>=1.18->voila)
  Downloading jupyter_server_terminals-0.5.3-py3-none-any.whl.metadata (5.6 kB)
Collecting overrides>=5.0 (from jupyter-server<3,>=1.18->voila)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting json5>=0.9.0 (from jupyterlab-server<3,>=2.3.0->voila)
  Downloading json5-0.12.0-py3-none-any.whl.metadata (36 kB