In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score

In [None]:
# Load and combine data
# Each monthly file is read on its own and tagged with the year it covers;
# the tag is what the later train/test split filters on.
df_2019 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2019_ontime.csv').assign(year=2019)
df_2020 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2020_ontime.csv').assign(year=2020)

# ignore_index=True: both frames carry read_csv's default 0..n-1 index, so a
# plain concat would leave every label duplicated. A fresh RangeIndex keeps
# row labels unique for any label-based operation later in the notebook.
data = pd.concat([df_2019, df_2020], ignore_index=True)

In [None]:
# Preprocess data
# Columns this notebook actually reads: the raw model inputs, the distance used
# for binning, both delay flags, and the year used for the train/test split.
required_cols = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'DEP_TIME_BLK', 'CANCELLED', 'DIVERTED',
                 'DISTANCE', 'DEP_DEL15', 'ARR_DEL15', 'year']

# Drop incomplete rows based on those columns only. A blanket dropna() also
# throws away rows for gaps in columns that are never used, and would empty
# the whole frame if any single column were entirely NaN.
data = data.dropna(subset=required_cols)

# Quartile edges are learned from the 2019 rows only (the rows used for
# training), so the held-out 2020 distances cannot influence the binning.
# The outer edges are opened to +/-inf so that 2020 distances falling outside
# the 2019 range still land in the first/last bin instead of becoming NaN.
_, distance_edges = pd.qcut(data.loc[data['year'] == 2019, 'DISTANCE'], q=4, retbins=True)
distance_edges[0], distance_edges[-1] = float('-inf'), float('inf')

data = data.assign(
    DISTANCE_cat=pd.cut(data['DISTANCE'], bins=distance_edges),
    DEP_DEL15=data['DEP_DEL15'].astype('category'),
    ARR_DEL15=data['ARR_DEL15'].astype('category'),
)
categorical_cols = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'DEP_TIME_BLK', 'CANCELLED', 'DIVERTED', 'DISTANCE_cat']

# One-hot encode categorical variables
# Plain NumPy boolean masks: they index both the sparse encoder output and
# DataFrame.loc positionally, with no dependence on the frame's index labels.
is_train = (data['year'] == 2019).to_numpy()
is_test = (data['year'] == 2020).to_numpy()

# The category vocabulary is learned from the training year only, so nothing
# about the held-out year is baked into the feature space.
# handle_unknown='ignore' encodes a category first seen in 2020 as an all-zero
# group instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(data.loc[is_train, categorical_cols])
cat_encoded = encoder.transform(data[categorical_cols])

# Split data into train (2019) and test (2020)
X_train = cat_encoded[is_train]
X_test = cat_encoded[is_test]
y_train = data.loc[is_train, 'ARR_DEL15']
y_test = data.loc[is_test, 'ARR_DEL15']

# Train Logistic Regression model
# fit() returns the fitted estimator itself, so construction and training are
# chained; random_state is pinned so repeated runs configure the solver alike.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Evaluate model
# Column 1 of predict_proba is the probability of the second entry in
# model.classes_; predict() gives the corresponding hard labels.
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Compute the metrics first, then report them in the same order as before.
report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_pred_prob)

print("Classification Report:")
print(report_text)
print(f"AUC: {auc_value:.4f}")

# Precision-Recall tradeoff by manipulating threshold
# The cut-off is applied to decision_function scores, not to probabilities.
# predict() effectively cuts at 0, so a negative threshold flags more flights
# as positive than the default does.
y_scores = model.decision_function(X_test)
threshold = -3
y_pred_threshold = y_scores > threshold

new_precision = precision_score(y_test, y_pred_threshold)
new_recall = recall_score(y_test, y_pred_threshold)
print(f"New Precision: {new_precision:.4f}")
print(f"New Recall: {new_recall:.4f}")