<a href="https://colab.research.google.com/github/Rashimanish/USA-Flight-Prediction/blob/main/Google%20Colab/Multiclassifcation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install joblib



In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Mount Google Drive at /content/drive so the files stored there are readable.
drive.mount('/content/drive')

# Location of the cleaned dataset CSV on the mounted drive.
file_path = '/content/drive/My Drive/Cleaned_Data/dataset09.csv'
# Load the CSV into `df`; the following cells transform `df` step by step.
df = pd.read_csv(file_path)

Mounted at /content/drive


In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(1802634, 24)

In [None]:
def drop_cols(df):
    """Return a copy of ``df`` without the columns this notebook does not model on.

    Besides the listed schedule/delay/weather columns this also removes
    ``Unnamed: 0``, which appears to be the row index that was written into
    the CSV; left in place it would end up in the feature matrix because every
    non-target column is later used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without the listed columns. Listed columns that are absent
        from ``df`` are skipped silently (``errors='ignore'``).
    """
    columns_to_drop = [
        'Unnamed: 0',
        'YEAR',
        'QUARTER',
        'DEP_TIME',
        'ARR_TIME',
        'DEP_DEL15',
        'CRS_ARR_TIME',
        'ORIGIN_AIRPORT_ID',
        'DEST_AIRPORT_ID',
        'DEP_DELAY_NEW',
        'Maximum temperature',
        'Minimum temperature',
        'Precipitation',
        'Snowfall',
        'Average wind speed'
    ]

    return df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Rebind `df` to the trimmed frame returned by drop_cols.
df = drop_cols(df)

In [None]:
# Preview the first rows to check which columns remain.
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,FL_DATE,ORIGIN,DEST,DISTANCE,ORIGIN_CARRIER,CRS_DEP_TIME,ARR_DELAY_NEW,ARR_DEL15
0,1,1,2022-01-01,ATL,CLT,226,DL,645,0.0,0.0
1,1,1,2022-01-01,MCO,ORD,1005,SW,1429,0.0,0.0
2,1,1,2022-01-01,MCO,ORD,1005,SW,1210,0.0,0.0
3,1,1,2022-01-01,MCO,ORD,1005,SW,1117,0.0,0.0
4,1,1,2022-01-01,MCO,ORD,1005,SW,1010,67.0,1.0


In [None]:
# Keep only rows flagged ARR_DEL15 == 1 that also have a positive ARR_DELAY_NEW;
# .copy() gives an independent frame for the column assignments that follow.
df = df[df['ARR_DEL15'].eq(1) & df['ARR_DELAY_NEW'].gt(0)].copy()

In [None]:
def classify(num):
    """Map a delay value to an ordinal class label from 0 to 6.

    Upper bounds are inclusive: values <= 15 give 0, <= 20 give 1, <= 60 give 2,
    <= 120 give 3, <= 180 give 4 and <= 240 give 5. Anything that satisfies
    none of these comparisons (larger values, and NaN) gives 6.
    The delay is presumably expressed in minutes - confirm against the dataset.
    """
    upper_bounds = (15, 20, 60, 120, 180, 240)
    for label, bound in enumerate(upper_bounds):
        if num <= bound:
            return label
    return len(upper_bounds)

# Replace each raw ARR_DELAY_NEW value with its class label from classify (stored as int32).
# Note: this overwrites the column, so running the cell a second time would
# re-classify the labels themselves (all of them are <= 15 and map to 0).
df['ARR_DELAY_NEW'] = df['ARR_DELAY_NEW'].map(classify).astype('int32')

In [None]:
# ARR_DEL15 equals 1 for every remaining row after the filter above, so drop it.
df = df.drop('ARR_DEL15', axis=1)

In [None]:
# Preview the rows now that ARR_DELAY_NEW holds class labels instead of raw delays.
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,FL_DATE,ORIGIN,DEST,DISTANCE,ORIGIN_CARRIER,CRS_DEP_TIME,ARR_DELAY_NEW
4,1,1,2022-01-01,MCO,ORD,1005,SW,1010,3
6,1,1,2022-01-01,MCO,ORD,1005,SW,930,2
9,1,1,2022-01-01,MCO,MIA,192,SW,2315,4
18,1,1,2022-01-01,MCO,LAX,2218,SW,1935,4
27,1,1,2022-01-01,MCO,LAS,2039,SW,2246,5


## **Feature Engineering**

In [None]:
def feature_engineering(df, encoders=None):
    """Derive calendar, departure-time and distance features and encode categoricals.

    Added columns: ``DAY_OF_WEEK`` (``FL_DATE.dt.dayofweek``), ``MONTH``
    (recomputed from ``FL_DATE``), ``HOUR_DEP`` / ``MIN_DEP`` (``CRS_DEP_TIME``
    split with ``// 100`` and ``% 100``), ``DISTANCE_BIN``,
    ``DISTANCE_DAY_INTERACTION`` and ``MONTH_DAY_INTERACTION``.
    ``FL_DATE`` and ``CRS_DEP_TIME`` are dropped afterwards, and ``ORIGIN``,
    ``DEST`` and ``ORIGIN_CARRIER`` are label-encoded to integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``FL_DATE``, ``CRS_DEP_TIME``, ``DISTANCE``, ``ORIGIN``,
        ``DEST`` and ``ORIGIN_CARRIER``. The frame is not modified.
    encoders : dict, optional
        Mapping of column name to a fitted ``LabelEncoder``. A column that has
        an entry is transformed with that encoder; otherwise a new encoder is
        fitted and, when a dict was supplied, stored in it under the column
        name. Pass an empty dict while training and persist it so the same
        encoding can be applied to new data. With the default ``None`` the
        encoders are fitted and discarded.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered and encoded columns.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])
    df['DAY_OF_WEEK'] = df['FL_DATE'].dt.dayofweek
    df['MONTH'] = df['FL_DATE'].dt.month
    df['HOUR_DEP'] = df['CRS_DEP_TIME'] // 100
    df['MIN_DEP'] = df['CRS_DEP_TIME'] % 100

    # labels=False yields the bin position directly (0 for the first bin, ...),
    # so the code of a bin no longer depends on which bins occur in the data.
    # The final open-ended bin and include_lowest keep distances above 3000
    # and a distance of exactly 0 from turning into NaN.
    df['DISTANCE_BIN'] = pd.cut(
        df['DISTANCE'],
        bins=[0, 500, 1000, 1500, 2000, 3000, np.inf],
        labels=False,
        include_lowest=True,
    )

    # Create interaction features
    df['DISTANCE_DAY_INTERACTION'] = df['DISTANCE'] * df['DAY_OF_WEEK']
    df['MONTH_DAY_INTERACTION'] = df['MONTH'] * df['DAY_OF_WEEK']
    df = df.drop(columns=['FL_DATE', 'CRS_DEP_TIME'], errors='ignore')

    # Convert categorical features to numeric using label encoding,
    # one encoder per column so each fitted mapping can be kept.
    for col in ('ORIGIN', 'DEST', 'ORIGIN_CARRIER'):
        encoder = encoders.get(col) if encoders is not None else None
        if encoder is None:
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col])
            if encoders is not None:
                encoders[col] = encoder
        else:
            df[col] = encoder.transform(df[col])

    return df

In [None]:
# Rebind `df` to the frame returned by feature_engineering.
df = feature_engineering(df)

In [None]:
# Preview the engineered and encoded columns.
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,ORIGIN,DEST,DISTANCE,ORIGIN_CARRIER,ARR_DELAY_NEW,DAY_OF_WEEK,HOUR_DEP,MIN_DEP,DISTANCE_BIN,DISTANCE_DAY_INTERACTION,MONTH_DAY_INTERACTION
4,1,1,9,11,1005,3,3,5,10,10,2,5025,5
6,1,1,9,11,1005,3,2,5,9,30,2,5025,5
9,1,1,9,10,192,3,4,5,23,15,0,960,5
18,1,1,9,8,2218,3,4,5,19,35,4,11090,5
27,1,1,9,7,2039,3,5,5,22,46,4,10195,5


## **Training**

In [None]:
def sampling(X, y, seed=42):
    """Oversample ``(X, y)`` with SMOTE and return the resampled pair.

    ``seed`` is passed to SMOTE as ``random_state``.
    """
    return SMOTE(random_state=seed).fit_resample(X, y)

In [None]:
def evaluate_model(model, X, y, sampler=None):
    """Cross-validate ``model`` and report one-vs-rest accuracy for every class.

    Uses a shuffled 3-fold ``StratifiedKFold`` (``random_state=35``). For each
    fold and each class, labels and predictions are binarised as
    "is this class" / "is not this class" and scored with ``accuracy_score``.
    Because that score also counts correctly rejected rows of all other
    classes, it can be high even for a class the model rarely predicts.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on every fold; on return it holds the fit of the last
        fold only.
    X, y : array-like
        Features and class labels. Converted with ``np.asarray`` so DataFrame
        or Series input works with the positional fold indices.
    sampler : callable, optional
        ``sampler(X_train, y_train) -> (X_resampled, y_resampled)``, applied to
        the training part of each fold only (for example the notebook's
        ``sampling`` helper). Passing the un-resampled data together with a
        sampler keeps synthetic rows out of the test folds. Default ``None``
        trains on the folds as they are.

    Returns
    -------
    tuple
        ``(avg_accuracy_class, avg_avg_accuracy)``: a dict mapping class label
        to its fold-averaged one-vs-rest accuracy, and the mean over folds of
        the per-fold mean across classes.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=35)
    # The label set is the same for every fold, so compute it once.
    class_labels = np.unique(y)
    fold_scores = []

    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        if sampler is not None:
            X_train, y_train = sampler(X_train, y_train)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # One row per fold, one column per class (same order as class_labels).
        fold_scores.append([
            accuracy_score(
                (y_test == class_label).astype(int),
                (y_pred == class_label).astype(int),
            )
            for class_label in class_labels
        ])

    scores = np.asarray(fold_scores)
    avg_accuracy_class = dict(zip(class_labels, scores.mean(axis=0)))
    avg_avg_accuracy = scores.mean(axis=1).mean()

    return avg_accuracy_class, avg_avg_accuracy

In [None]:
# Separate features and target variable
X = df.drop('ARR_DELAY_NEW', axis=1).values
y = df['ARR_DELAY_NEW'].values

In [None]:
# Seed forwarded to SMOTE (as random_state) inside sampling.
seed = 35
# NOTE(review): the whole dataset is resampled here, before evaluate_model
# splits it into folds, so synthetic rows in a training fold can be derived
# from rows that land in the test fold - the reported scores are likely optimistic.
X_bal, y_bal = sampling(X, y, seed)

In [None]:
# LightGBM classifier configured for a multiclass objective with log-loss metric;
# the number of classes is taken from the distinct labels present in y_bal.
model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(np.unique(y_bal)),
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05
)

In [None]:
# Cross-validate the model on the resampled data; evaluate_model fits `model` in place.
avg_accuracy_class, avg_avg_accuracy = evaluate_model(model, X_bal, y_bal)

# Report the fold-averaged score of each class, then the overall average.
print('Class-wise Accuracy:')
for class_label, accuracy in avg_accuracy_class.items():
    print(f'Class {class_label}: {accuracy:.2f}')
print(f'Average Accuracy: {avg_avg_accuracy:.2f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054317 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 699
[LightGBM] [Info] Number of data points in the train set: 966429, number of used features: 12
[LightGBM] [Info] Start training from score -1.945905
[LightGBM] [Info] Start training from score -1.945912
[LightGBM] [Info] Start training from score -1.945912
[LightGBM] [Info] Start training from score -1.945912
[LightGBM] [Info] Start training from score -1.945905
[LightGBM] [Info] Start training from score -1.945912
[LightGBM] [Info] Start training from score -1.945912
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041906 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 706
[LightGBM]

In [None]:
# Define the Google Drive path
joblib_file = '/content/drive/My Drive/2024_FLIGHT/FLIGHT/mcl_model.pkl'

# Save the model
joblib.dump(model, joblib_file)
print(f"Model saved to {joblib_file}")

Model saved to /content/drive/My Drive/2024_FLIGHT/FLIGHT/mcl_model.pkl
