In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# Read the 2018 featured Parquet file into `df`.
# NOTE(review): "/content/..." is an absolute path (looks Colab-specific) —
# confirm the file exists there before running in another environment.
data_path = "/content/2018featured.snappy.parquet"
df = pd.read_parquet(data_path)

In [3]:
# Bucket the scheduled departure hour into four 6-hour parts of the day.
# With right=False every bin is left-closed / right-open:
#   [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening.
# Hours outside [0, 24) (for example a value of 24) come out as NaN.
bins = list(range(0, 25, 6))  # [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
df['SchdDepTimeOfDay'] = pd.cut(
    df['SchdDepHour'],
    bins=bins,
    labels=labels,
    right=False,
)

In [4]:
# Separate the label column from the rest of the frame.
target_col = "DelayCategory"
y = df[target_col]                 # Target
X = df.drop(columns=[target_col])  # Features: every remaining column

In [5]:
# Columns that the preprocessor one-hot encodes (order fixes the output column order).
categorical_cols = [
    'Airline',
    'Origin',
    'Dest',
    'OriginStateName',
    'DestStateName',
    'SchdDepTimeOfDay',
]

In [6]:
# Columns that the preprocessor standardises (order fixes the output column order).
# NOTE(review): TaxiOut / WheelsOff / WheelsOn / TaxiIn look like values that are
# only known once the flight has operated — confirm they are legitimate inputs
# for predicting DelayCategory and do not leak the target.
numerical_cols = [
    "SchdDepHour",
    "Year",
    "Quarter",
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "TaxiOut",
    "WheelsOff",
    "WheelsOn",
    "TaxiIn",
    "SchdArrTime",
    "IsWeekend",
]

In [7]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Columns listed in neither group are dropped, which is
# ColumnTransformer's default (remainder="drop").
numeric_scaler = StandardScaler()
# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros instead of raising at transform time.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, numerical_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Apply preprocessing
# The transformer is fitted on the training split only; the test split is then
# transformed with what was learned from the training data.
# NOTE(review): both names are rebound from the raw splits to the transformed
# output, so re-running this cell without first re-running the split cell feeds
# already-transformed data back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [11]:
# Encode the target classes as consecutive integers starting at 0.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)  # class -> integer mapping is learned from the training labels only
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)  # raises ValueError for a label absent from training

In [12]:
# Install third-party packages into the running kernel's environment.
# NOTE(review): versions are unpinned, and pandas / scikit-learn were already
# imported by earlier cells, so this cell cannot change what those imports
# loaded in the current session — consider moving it to the top of the
# notebook and pinning versions.
%pip install xgboost pandas scikit-learn



In [13]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

# Define the ANN model
def build_ann_model(input_shape, num_classes):
    """Build an uncompiled feed-forward classifier.

    Architecture: Dense(128) -> Dense(64) -> Dropout(0.2) -> Dense(32) ->
    Dropout(0.2) -> output head. All hidden layers use ReLU.

    Args:
        input_shape: Number of input features (an int, not a tuple).
        num_classes: Number of target classes. Exactly 2 produces a single
            sigmoid unit; any other value produces a softmax layer with
            ``num_classes`` units.

    Returns:
        A ``Sequential`` model. The caller is responsible for compiling it.
    """
    model = Sequential()

    # Declare the input with an explicit Input object. Passing `input_shape=`
    # to the first Dense layer is discouraged by Keras for Sequential models
    # and emits a UserWarning.
    model.add(tf.keras.Input(shape=(input_shape,)))

    # First hidden layer (intentionally not followed by dropout)
    model.add(Dense(128, activation='relu'))

    # Second hidden layer
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))

    # Third hidden layer
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))

    # Output layer
    if num_classes == 2:
        model.add(Dense(1, activation='sigmoid'))  # Binary classification
    else:
        model.add(Dense(num_classes, activation='softmax'))  # Multi-class classification

    return model

# Define input shape and number of classes
input_shape = X_train.shape[1]  # Number of features
num_classes = len(np.unique(y_train))  # Number of unique classes in the target variable

# Seed the Python, NumPy and TensorFlow RNGs before the model is created so
# that weight initialisation, dropout masks and batch shuffling are repeatable
# from run to run (same seed value as the train/test split).
tf.keras.utils.set_random_seed(42)

# Build the model
ann_model = build_ann_model(input_shape, num_classes)

# Compile the model. The loss must match the output head built above:
# one sigmoid unit for two classes, softmax over integer labels otherwise.
loss_name = 'binary_crossentropy' if num_classes == 2 else 'sparse_categorical_crossentropy'
ann_model.compile(optimizer=Adam(learning_rate=0.001), loss=loss_name, metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data. With a single epoch
# and no callbacks it does not influence training, but it would bias the final
# test metrics if early stopping or hyper-parameter tuning were added later.
history = ann_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=1,
    batch_size=32,
    verbose=1
)

# Evaluate the model
y_pred_ann = ann_model.predict(X_test)
if num_classes == 2:
    # Threshold the sigmoid output and flatten the (n, 1) column to 1-D so the
    # predictions have the same shape as y_test.
    y_pred_ann = (y_pred_ann > 0.5).astype(int).ravel()
else:
    y_pred_ann = y_pred_ann.argmax(axis=1)  # Convert probabilities to class labels

# Calculate accuracy
accuracy_ann = accuracy_score(y_test, y_pred_ann)
print(f"ANN Accuracy: {accuracy_ann:.4f}")

# Print classification report (classes are shown as their encoded integers)
print(classification_report(y_test, y_pred_ann))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


139466/139466 ━━━━━━━━━━━━━━━━━━━━ 1506s 11ms/step - accuracy: 0.9050 - loss: 0.2507 - val_accuracy: 0.9392 - val_loss: 0.1495
34867/34867 ━━━━━━━━━━━━━━━━━━━━ 151s 4ms/step
ANN Accuracy: 0.9392
              precision    recall  f1-score   support

           0       0.64      0.47      0.54     15427
           1       0.75      0.57      0.65     44524
           2       0.98      0.98      0.98    898467
           3       0.78      0.86      0.82    145066
           4       0.95      0.72      0.82     12240

    accuracy                           0.94   1115724
   macro avg       0.82      0.72      0.76   1115724
weighted avg       0.94      0.94      0.94   1115724



In [20]:
from sklearn.metrics import f1_score

# F1 averaged over classes, each class weighted by its support in y_test.
# Use 'macro' or 'micro' as needed.
f1 = f1_score(y_true=y_test, y_pred=y_pred_ann, average='weighted')
f1

0.9377682799242122

In [16]:
# Persist the trained network to disk as an HDF5 (.h5) file in the working directory.
# NOTE(review): only the Keras model is written here; nothing in this cell saves
# the fitted preprocessor or label encoder.
ann_model.save(filepath='ann_model.h5')
print("Model saved as 'ann_model.h5'")



Model saved as 'ann_model.h5'
