In [None]:
!pip install seaborn



In [None]:
import pandas as pd

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Load the dataset
df = pd.read_csv('Airline_Delay_Cause.csv')

# Inspect the dataset
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing Values:\n", df.isnull().sum())

# Define the new target variable
target = 'arr_del15'

# Check if the chosen target column exists
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset. Please verify the column name.")

print(f"\nTarget Variable Unique Values for '{target}':\n", df[target].unique())
print(f"Target Variable Value Counts for '{target}':\n", df[target].value_counts())

# Step 1: No specific time-based feature engineering like hour/day is possible
# with the current dataset's time columns. 'year' and 'month' will be used directly.

# Step 2: Encode Categorical Variables
# One-hot encode new nominal variables
nominal_cols = ['carrier', 'carrier_name', 'airport', 'airport_name']
nominal_cols = [col for col in nominal_cols if col in df.columns] # Ensure they exist

if nominal_cols:
    df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)
    print("\nCategorical columns after One-Hot Encoding:", [col for col in df.columns if any(nc in col for nc in nominal_cols)])
else:
    print("\nNo nominal columns found for one-hot encoding based on the predefined list.")

# No LabelEncoder for 'Status' or 'Category' as they are not in the dataset

# Step 3: Define Features and Target
# Columns to drop - these are either the target itself or represent delay outcomes (leakage)
# or are not suitable as direct features for predicting a delay occurrence.
drop_cols = [
    'arr_del15',        # This is our target, so it must be dropped from features (X)
    'arr_delay',        # Total arrival delay - leakage for arr_del15
    'carrier_ct',       # Carrier delay count - leakage
    'weather_ct',       # Weather delay count - leakage
    'nas_ct',           # NAS delay count - leakage
    'security_ct',      # Security delay count - leakage
    'late_aircraft_ct', # Late aircraft delay count - leakage
    'carrier_delay',    # Specific carrier delay - leakage
    'weather_delay',    # Specific weather delay - leakage
    'nas_delay',        # Specific NAS delay - leakage
    'security_delay',   # Specific security delay - leakage
    'late_aircraft_delay', # Specific late aircraft delay - leakage
    'arr_cancelled',    # Number of cancelled arrivals - could be an outcome of severe delay
    'arr_diverted'      # Number of diverted arrivals - could be an outcome of severe delay
]

# Ensure only existing columns are in the drop_cols list
drop_cols = [col for col in drop_cols if col in df.columns]

X = df.drop(columns=drop_cols)
y = df[target]

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("First 5 rows of X after dropping and encoding:\n", X.head())
print("First 5 rows of y:\n", y.head())


# Step 4: Scale Numerical Features
numerical_cols = [
    'year',
    'month',
    'arr_flights' # Number of scheduled flights
]

# Ensure all numerical_cols are actually in X's columns before scaling
numerical_cols = [col for col in numerical_cols if col in X.columns]

if numerical_cols: # Only scale if there are numerical columns
    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    print(f"\nScaled numerical columns: {numerical_cols}")
    print("First 5 rows of X after scaling:\n", X.head())
else:
    print("No numerical columns found to scale based on the predefined list.")
    scaler = None # Define scaler as None if no scaling is done

# Save the preprocessed DataFrame (features X and target y are now defined)
# For consistency, saving X and y separately might be more conventional post-preprocessing.
# However, to match the spirit of saving 'df' previously, we'll recreate a preprocessed df here
# Note: This df will be very wide due to one-hot encoding.
preprocessed_df = pd.concat([X, y], axis=1)
preprocessed_df.to_csv('/content/Preprocessed_Airline_Delay_Data.csv', index=False)
print("\nPreprocessed data (features and target combined) saved to '/content/Preprocessed_Airline_Delay_Data.csv'")

Dataset Shape: (171666, 21)
Columns: ['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name', 'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
Missing Values:
 year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircraft_ct       240
arr_cancelled          240
arr_diverted           240
arr_delay              240
carrier_delay          240
weather_delay          240
nas_delay              240
security_delay         240
late_aircraft_delay    240
dtype: int64

Target Variable Unique Values for 'arr_del15':
 [  13.   1

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
# Correcting the file path to directly access the uploaded file
df = pd.read_csv('Airline_Delay_Cause.csv')

# Inspect the dataset
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Missing Values (before dropping NaNs in target):\n", df.isnull().sum())

# Define the new target variable
target = 'arr_del15'

# Check if the chosen target column exists
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset. Please verify the column name.")

# Handle missing values in the target variable by dropping rows
initial_rows = df.shape[0]
df.dropna(subset=[target], inplace=True)
rows_after_dropping_target_nan = df.shape[0]
print(f"\nDropped {initial_rows - rows_after_dropping_target_nan} rows with NaN values in the target column '{target}'.")
print(f"Dataset Shape after dropping target NaNs: {df.shape}")


print(f"\nTarget Variable Unique Values for '{target}':\n", df[target].unique())
print(f"Target Variable Value Counts for '{target}':\n", df[target].value_counts())

# It's a good practice to ensure the target is int if it should be discrete classes.
# The previous diagnostic showed it has many unique values, which is why we convert to int.
df[target] = df[target].astype(int)

# --- End of Block 1 ---

Dataset Shape: (171666, 21)
Columns: ['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name', 'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
Missing Values (before dropping NaNs in target):
 year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircraft_ct       240
arr_cancelled          240
arr_diverted           240
arr_delay              240
carrier_delay          240
weather_delay          240
nas_delay              240
security_delay         240
late_aircraft_delay    240
dtype: int64

Dropped 443 rows with NaN

In [None]:
# Block 2: Encode Categorical Variables
# Step 1: No specific time-based feature engineering like hour/day is possible
# with the current dataset's time columns. 'year' and 'month' will be used directly.

# Step 2: Encode Categorical Variables
# One-hot encode new nominal variables
nominal_cols = ['carrier', 'carrier_name', 'airport', 'airport_name']
nominal_cols = [col for col in nominal_cols if col in df.columns] # Ensure they exist

if nominal_cols:
    # Convert numerical columns to string type if they are to be treated as categorical for one-hot encoding
    for col in nominal_cols:
        if df[col].dtype == 'int64' or df[col].dtype == 'float64':
            df[col] = df[col].astype(str)

    df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)
    print("\nCategorical columns after One-Hot Encoding:", [col for col in df.columns if any(nc in col for nc in nominal_cols)])
else:
    print("\nNo nominal columns found for one-hot encoding based on the predefined list.")

# --- End of Block 2 ---


Categorical columns after One-Hot Encoding: ['carrier_ct', 'carrier_delay', 'carrier_AA', 'carrier_AS', 'carrier_B6', 'carrier_DL', 'carrier_EV', 'carrier_F9', 'carrier_FL', 'carrier_G4', 'carrier_HA', 'carrier_MQ', 'carrier_NK', 'carrier_OH', 'carrier_OO', 'carrier_QX', 'carrier_UA', 'carrier_US', 'carrier_VX', 'carrier_WN', 'carrier_YV', 'carrier_YX', 'carrier_name_Alaska Airlines Inc.', 'carrier_name_Allegiant Air', 'carrier_name_American Airlines Inc.', 'carrier_name_American Eagle Airlines Inc.', 'carrier_name_Delta Air Lines Inc.', 'carrier_name_Endeavor Air Inc.', 'carrier_name_Envoy Air', 'carrier_name_ExpressJet Airlines Inc.', 'carrier_name_ExpressJet Airlines LLC', 'carrier_name_Frontier Airlines Inc.', 'carrier_name_Hawaiian Airlines Inc.', 'carrier_name_Horizon Air', 'carrier_name_JetBlue Airways', 'carrier_name_Mesa Airlines Inc.', 'carrier_name_PSA Airlines Inc.', 'carrier_name_Republic Airline', 'carrier_name_SkyWest Airlines Inc.', 'carrier_name_Southwest Airlines Co.

In [None]:
# Block 3: Define Features (X), Target (y), and Scale Numerical Features

# Step 3: Define Features and Target
# Columns to drop - these are either the target itself or represent delay outcomes (leakage)
# or are not suitable as direct features for predicting a delay occurrence.
drop_cols = [
    'arr_del15',        # This is our target, so it must be dropped from features (X)
    'arr_delay',        # Total arrival delay - leakage for arr_del15
    'carrier_ct',       # Carrier delay count - leakage
    'weather_ct',       # Weather delay count - leakage
    'nas_ct',           # NAS delay count - leakage
    'security_ct',      # Security delay count - leakage
    'late_aircraft_ct', # Late aircraft delay count - leakage
    'carrier_delay',    # Specific carrier delay - leakage
    'weather_delay',    # Specific weather delay - leakage
    'nas_delay',        # Specific NAS delay - leakage
    'security_delay',   # Specific security delay - leakage
    'late_aircraft_delay', # Specific late aircraft delay - leakage
    'arr_cancelled',    # Number of cancelled arrivals - could be an outcome of severe delay
    'arr_diverted'      # Number of diverted arrivals - could be an outcome of severe delay
]

# Ensure only existing columns are in the drop_cols list
drop_cols = [col for col in drop_cols if col in df.columns]

X = df.drop(columns=drop_cols)
y = df[target] # 'target' is already defined from Block 1

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print("First 5 rows of X after dropping and encoding:\n", X.head())
print("First 5 rows of y:\n", y.head())

# Step 4: Scale Numerical Features
numerical_cols = [
    'year',
    'month',
    'arr_flights' # Number of scheduled flights
]

# Ensure all numerical_cols are actually in X's columns before scaling
numerical_cols = [col for col in numerical_cols if col in X.columns]

if numerical_cols: # Only scale if there are numerical columns
    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
    print(f"\nScaled numerical columns: {numerical_cols}")
    print("First 5 rows of X after scaling:\n", X.head())
else:
    print("No numerical columns found to scale based on the predefined list.")
    scaler = None # Define scaler as None if no scaling is done

# --- End of Block 3 ---


Features (X) shape: (171223, 849)
Target (y) shape: (171223,)
First 5 rows of X after dropping and encoding:
    year  month  arr_flights  carrier_AA  carrier_AS  carrier_B6  carrier_DL  \
0  2023      8         89.0       False       False       False       False   
1  2023      8         62.0       False       False       False       False   
2  2023      8         62.0       False       False       False       False   
3  2023      8         66.0       False       False       False       False   
4  2023      8         92.0       False       False       False       False   

   carrier_EV  carrier_F9  carrier_FL  ...  \
0       False       False       False  ...   
1       False       False       False  ...   
2       False       False       False  ...   
3       False       False       False  ...   
4       False       False       False  ...   

   airport_name_Williston, ND: Sloulin Field International  \
0                                              False         
1            

In [None]:
# Block 4: Train & Evaluate the Model

# Step 5: Train the Model
print("\n--- Training the Model ---")
# Split the data into training and testing sets
# Ensure X also does not contain any NaNs or non-numeric values that would cause issues.
# After one-hot encoding, X might have new NaN if original categorical columns had NaNs and were not handled.
# A final dropna for X before splitting to be safe.
X.dropna(inplace=True)
y = y.loc[X.index] # Ensure y aligns with X after dropping NaNs from X

# *** DIAGNOSTIC STEP: Check value counts of y right before splitting ***
print(f"\nValue counts of target variable (y) before train_test_split:\n{y.value_counts()}")

# Check if any class has only 1 member before splitting
if (y.value_counts() == 1).any():
    print("\nWarning: One or more classes in the target variable have only 1 member. Stratified split cannot be performed reliably.")
    print("Proceeding without stratification for main train_test_split.")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Removed stratify=y
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) # Stratify for balanced classes


print(f"Original Training features shape: {X_train.shape}")
print(f"Original Training target shape: {y_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Testing target shape: {y_test.shape}")

# --- Downsample the training data to reduce memory usage for fitting ---
# We'll take 20% of the X_train and y_train for actual model training
# Use a new variable name to avoid confusion with the full X_train
# Removed stratify=y_train from this downsampling step to avoid "least populated class" error
_, X_train_subset, _, y_train_subset = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(f"\nDownsampled Training features shape: {X_train_subset.shape}")
print(f"Downsampled Training target shape: {y_train_subset.shape}")

# Initialize and train a RandomForestClassifier model
# Using the downsampled subset for training
model = RandomForestClassifier(n_estimators=20, random_state=42, class_weight='balanced')
model.fit(X_train_subset, y_train_subset) # Fit on the subset

print("\nModel training complete.")

# Step 6: Evaluate the Model
print("\n--- Evaluating the Model ---")
# Make predictions on the full X_test (not the downsampled part)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


--- Training the Model ---

Value counts of target variable (y) before train_test_split:
arr_del15
0       6585
1       6525
2       6288
4       6178
3       6106
        ... 
4024       1
2121       1
3206       1
1814       1
1264       1
Name: count, Length: 1781, dtype: int64

Proceeding without stratification for main train_test_split.
Original Training features shape: (136978, 849)
Original Training target shape: (136978,)
Testing features shape: (34245, 849)
Testing target shape: (34245,)

Downsampled Training features shape: (27396, 849)
Downsampled Training target shape: (27396,)

Model training complete.

--- Evaluating the Model ---
Accuracy: 0.06173163965542415

Classification Report:
               precision    recall  f1-score   support

           0       0.34      0.41      0.37      1335
           1       0.16      0.18      0.17      1249
           2       0.12      0.14      0.13      1261
           3       0.10      0.12      0.11      1227
           4       0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
# Assuming sklearn components (StandardScaler, RandomForestClassifier) are already imported from previous blocks

# --- 1. Define your new input data ---
# Create a DataFrame for your new flight(s).
# Make sure the column names match your original dataset's feature columns before one-hot encoding.
# Example data (you can change these values):
new_data = pd.DataFrame([{
    'year': 2024,
    'month': 5,
    'carrier': 'AA',
    'carrier_name': 'American Airlines',
    'airport': 'LAX',
    'airport_name': 'Los Angeles International',
    'arr_flights': 100 # Example number of flights
    # Only include feature columns that were used in X, not target or leakage columns
}])

print("--- New Input Data ---")
print(new_data)

# --- 2. Apply the same preprocessing steps to the new data ---

# A. Handle nominal categorical columns (One-Hot Encoding)
# Ensure nominal_cols is defined from your previous Block 2
nominal_cols_for_new_data = ['carrier', 'carrier_name', 'airport', 'airport_name']
nominal_cols_for_new_data = [col for col in nominal_cols_for_new_data if col in new_data.columns]

if nominal_cols_for_new_data:
    for col in nominal_cols_for_new_data:
        if new_data[col].dtype == 'int64' or new_data[col].dtype == 'float64':
            new_data[col] = new_data[col].astype(str)
    new_data_processed = pd.get_dummies(new_data, columns=nominal_cols_for_new_data, drop_first=True)
else:
    new_data_processed = new_data.copy()

# B. Align columns with the training data (X)
# This is crucial because your training data (X) had 849 columns after one-hot encoding.
# You need to ensure the new_data_processed has the exact same columns in the ew data and get a prediction, you need to follow these steps:

Prepare your new input data: It must have the same structxact same order.
# We will use the columns from your original X DataFrame (before train_test_split and downsampling).
# Assuming 'X' (the full preprocessed features from Block 3) is still available in your session.
# If not, you might need to run Block 3 first or re-extract X.columns from there.
if 'X' in locals(): # Check if X is defined in the current environment
    expected_columns = X.columns # Use the columns from the X DataFrame created in Block 3
else:
    print("Warning: 'X' DataFrame not found. Cannot ensure exact column alignment. Please run Block 3 first.")
    # Fallback (less robust, but might work if X is not available)
    # You might need to manually list all 849 columns if X is not in memory.
    # For now, we'll assume X is available.
    raise NameError("X DataFrame not found. Please run Block 3 to define X before predicting.")


# Reindex new_data_processed to match the columns of the training data
# Any columns in expected_columns not in new_data_processed will be added as False/0
# Any columns in new_data_processed not in expected_columns will be dropped
new_data_aligned = new_data_processed.reindex(columns=expected_columns, fill_value=0)

# C. Scale numerical features
# Ensure numerical_cols and scaler are defined from your previous Block 3
numerical_cols_for_scaling = [
    'year',
    'month',
    'arr_flights'
]
numerical_cols_for_scaling = [col for col in numerical_cols_for_scaling if col in new_data_aligned.columns]

if numerical_cols_for_scaling:
    # Use the *fitted* scaler object from training
    new_data_aligned[numerical_cols_for_scaling] = scaler.transform(new_data_aligned[numerical_cols_for_scaling])
else:
    print("No numerical columns found in new data to scale.")

print("\n--- Processed Input Data for Prediction (aligned and scaled) ---")
print(new_data_aligned)


# --- 3. Make Prediction ---
# Ensure 'model' is defined from your previous Block 4 (after it was trained)
if 'model' not in locals():
    raise NameError("Model not found. Please run Block 4 to train the model first.")

prediction = model.predict(new_data_aligned)

# --- 4. Interpret the Output ---
print("\n--- Prediction Results ---")
for i, pred_val in enumerate(prediction):
    if pred_val > 0:
        status = "Delayed"
    else:
        status = "Not Delayed"
    print(f"Flight {i+1} (arr_del15 prediction: {pred_val}): {status}")

--- New Input Data ---
   year  month carrier       carrier_name airport               airport_name  \
0  2024      5      AA  American Airlines     LAX  Los Angeles International   

   arr_flights  
0          100  

--- Processed Input Data for Prediction (aligned and scaled) ---
       year     month  arr_flights  carrier_AA  carrier_AS  carrier_B6  \
0  1.885052 -0.434455    -0.264692           0           0           0   

   carrier_DL  carrier_EV  carrier_F9  carrier_FL  ...  \
0           0           0           0           0  ...   

   airport_name_Williston, ND: Sloulin Field International  \
0                                                  0         

   airport_name_Williston, ND: Williston Basin International  \
0                                                  0           

   airport_name_Wilmington, DE: New Castle  \
0                                        0   

   airport_name_Wilmington, NC: Wilmington International  \
0                                          