In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42)

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

csv_to_parquet('./train.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

# Preprocessing
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']
for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    if col in test.columns:
        test[col] = test[col].fillna(mode)

qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']
for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])


train Done.
test Done.


In [3]:
# Feature Engineering
train = train.dropna()
train['is_Delayed'] = np.where(train['Delay']=="Not_Delayed", 1, 0)

# Model Training
X = train.drop(['ID','Delay', 'is_Delayed'], axis=1)
y = train['is_Delayed']
test = test.drop(['ID'], axis=1)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = []
for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
    rf.fit(X_train, y_train)

    val_preds = rf.predict_proba(X_val)[:,1]
    val_score = roc_auc_score(y_val, val_preds)

    print(f"Fold {fold} - ROC AUC: {val_score}")

    models.append(rf)

# Predictions on test data
test_preds = np.zeros((len(test), 2))
for model in models:
    test_preds += model.predict_proba(test) / len(models)

# Create submission file
test1 = pd.read_parquet('./test.parquet')
submission = pd.DataFrame({"ID": test1["ID"], "Not_Delayed": test_preds[:,0], "Delayed": test_preds[:,1]})
submission.to_csv("submission.csv", index=False)
print("Finished")

Fold 0 - ROC AUC: 0.6408576488919575
Fold 1 - ROC AUC: 0.6392094947089948
Fold 2 - ROC AUC: 0.6369136402116402
Fold 3 - ROC AUC: 0.6371019537037038
Fold 4 - ROC AUC: 0.6389656507936508
