In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [1]:
original_data = pd.read_csv("airline_delay_data.csv")

NameError: name 'pd' is not defined

In [4]:
dataset_mlr = original_data.copy()
df_clean = dataset_mlr.dropna()

# Step 1: Calculate derived metrics
df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
df_clean['cancel_ratio'] = df_clean['arr_cancelled'] / df_clean['arr_flights']  # Cancellation rate
df_clean['divert_ratio'] = df_clean['arr_diverted'] / df_clean['arr_flights']  # Diversion rate

# Step 2: Define reliability levels
def classify_reliability(row):
    if row['delay_ratio'] > 0.5 or row['avg_delay'] > 30 or row['cancel_ratio'] > 0.1 or row['divert_ratio'] > 0.05:
        return 'Low'
    elif 0.2 <= row['delay_ratio'] <= 0.5 or 10 <= row['avg_delay'] <= 30:
        return 'Medium'
    else:
        return 'High'

df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)

# Step 3: Encode ordinal classes
reliability_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)

# Step 4: Print class distribution
print(df_clean['reliability'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


reliability
High      53753
Medium    53286
Low       12375
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)


In [5]:
# Calculate derived metrics
df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
df_clean['cancel_ratio'] = df_clean['arr_cancelled'] / df_clean['arr_flights']  # Cancellation rate
df_clean['divert_ratio'] = df_clean['arr_diverted'] / df_clean['arr_flights']  # Diversion rate

# Calculate delay contributions
total_delay = df_clean[['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']].sum(axis=1)
df_clean['carrier_delay_ratio'] = df_clean['carrier_ct'] / total_delay
df_clean['weather_delay_ratio'] = df_clean['weather_ct'] / total_delay
df_clean['nas_delay_ratio'] = df_clean['nas_ct'] / total_delay
df_clean['security_delay_ratio'] = df_clean['security_ct'] / total_delay
df_clean['late_aircraft_delay_ratio'] = df_clean['late_aircraft_ct'] / total_delay

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['delay_ratio'] = df_clean['arr_del15'] / df_clean['arr_flights']  # Proportion of delayed flights
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['avg_delay'] = df_clean['arr_delay'] / df_clean['arr_flights']  # Average delay per flight
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# Step 3: Create a composite reliability score
# Assign weights to each factor (adjust weights based on domain knowledge)
weights = {
    'delay_ratio': 0.75,
    'avg_delay': 0.2,
    'cancel_ratio': 0.25,
    'divert_ratio': 0.1,
    'carrier_delay_ratio': 0.50,
    'weather_delay_ratio': 0.30,
    'nas_delay_ratio': 0.05,
    'security_delay_ratio': 0.1,
    'late_aircraft_delay_ratio': 0.1
}

df_clean['reliability_score'] = (
    weights['delay_ratio'] * df_clean['delay_ratio'] +
    weights['avg_delay'] * (df_clean['avg_delay'] / df_clean['avg_delay'].max()) +  # Normalize avg_delay
    weights['cancel_ratio'] * df_clean['cancel_ratio'] +
    weights['divert_ratio'] * df_clean['divert_ratio'] +
    weights['carrier_delay_ratio'] * df_clean['carrier_delay_ratio'] +
    weights['weather_delay_ratio'] * df_clean['weather_delay_ratio'] +
    weights['nas_delay_ratio'] * df_clean['nas_delay_ratio'] +
    weights['security_delay_ratio'] * df_clean['security_delay_ratio'] +
    weights['late_aircraft_delay_ratio'] * df_clean['late_aircraft_delay_ratio']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_score'] = (


In [8]:
# Step 4: Define reliability levels based on the composite score
def classify_reliability(row):
    if row['reliability_score'] > 0.4:  # Adjust thresholds as needed
        return 'Low'
    elif 0.2 <= row['reliability_score'] <= 0.4:
        return 'Medium'
    else:
        return 'High'

df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)

# Step 5: Encode ordinal classes
reliability_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)

# Step 6: Store data for dashboard
# Include airport, month, airline, and reliability score
dashboard_data = df_clean.copy()
dashboard_data.to_csv('dashboard_data_final.csv', index=False)

# Step 7: Print class distribution
print(df_clean['reliability'].value_counts())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability'] = df_clean.apply(classify_reliability, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['reliability_ordinal'] = df_clean['reliability'].map(reliability_mapping)


reliability
Low       55595
Medium    54591
High       9228
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import pandas as pd

# Step 1: Define target variable (e.g., reliability_ordinal)
y = df_clean['reliability_ordinal']  

# Step 2: Select independent variables (include derived metrics and categorical variables)
X = df_clean[[
    'month', 'carrier_name', 'airport',  # Categorical variables
    'delay_ratio', 'avg_delay', 'cancel_ratio', 'divert_ratio',  # Derived metrics
    'carrier_delay_ratio', 'weather_delay_ratio', 'nas_delay_ratio', 
    'security_delay_ratio', 'late_aircraft_delay_ratio'  # Delay contributions
]]

# Step 3: Preprocess categorical variables (e.g., one-hot encoding)
X = pd.get_dummies(X, columns=['month', 'carrier_name', 'airport'], drop_first=True)

# Step 4: Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Install if not already
from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output
import plotly.express as px
import pandas as pd

# Load your data
df = pd.read_csv("dashboard_data2.csv")

# Create the app
app = JupyterDash(__name__)

# App layout
app.layout = html.Div([
    html.Label("Airport"),
    dcc.Dropdown(
        options=[{"label": a, "value": a} for a in df["airport"].unique()],
        value=df["airport"].unique()[0],
        id="airport"
    ),
    html.Label("Carrier"),
    dcc.Dropdown(
        options=[{"label": c, "value": c} for c in df["carrier_name"].unique()],
        value=df["carrier_name"].unique()[0],
        id="carrier"
    ),
    html.Label("Month"), 
    dcc.Dropdown(
        options=[{"label": m, "value": m} for m in df["month"].unique()],
        value=df["month"].unique()[0],
        id="month"
    ),
    dcc.Graph(id="histogram")
])

# Callback
@app.callback(
    Output("histogram", "figure"),
    Input("airport", "value"),
    Input("carrier", "value"), 
    Input("month", "value")
)
def update(airport, carrier, month):
    dff = df[(df.airport == airport) & (df.carrier_name == carrier) & (df.month == month)]
    if dff.empty:
        return px.histogram(title="No data available")
    return px.histogram(
        dff, x="arr_delay", nbins=30,
        title=f"{carrier} @ {airport} in Month {month}"
    )

# Run the app inline
app.run(mode='inline', port=8050)



In [None]:
print("Hello")