In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [2]:
# Load datasets

train = pd.read_csv('/kaggle/input/flight-delay-prediction/Train.csv')
test = pd.read_csv('/kaggle/input/flight-delay-prediction/Test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
train.head()
train['target'].describe()


Train shape: (107833, 10)
Test shape: (9333, 9)


count    107833.000000
mean         48.733013
std         117.135562
min           0.000000
25%           0.000000
50%          14.000000
75%          43.000000
max        3451.000000
Name: target, dtype: float64

In [3]:
# Separate features and target
X = train.drop(['target', 'id'], axis=1, errors='ignore')
y = train['target']
X_test = test.drop(['id'], axis=1, errors='ignore')

# Identify column types
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"\ Colonnes numériques: {len(numeric_cols)}")
print(f"  {numeric_cols}")
print(f"\ Colonnes catégorielles: {len(categorical_cols)}")
print(f"  {categorical_cols}")

\ Colonnes numériques: 0
  []
\ Colonnes catégorielles: 9
  ['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS', 'AC']


In [4]:
# Handle missing values for numeric columns
for col in numeric_cols:
    if X[col].isnull().sum() > 0:
        median_val = X[col].median()
        X[col].fillna(median_val, inplace=True)
        X_test[col].fillna(median_val, inplace=True)
        print(f"  • {col}: rempli avec médiane ({median_val})")

In [5]:
# Encode categorical variables

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    
    # Combine train and test to ensure all categories are seen
    combined = pd.concat([X[col], X_test[col]]).astype(str)
    le.fit(combined)
    
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    
    label_encoders[col] = le
    print(f"  • {col}: {len(le.classes_)} catégories")

# Final check for any remaining NaN
X = X.fillna(0)
X_test = X_test.fillna(0)

print(f"  • X train: {X.shape}")
print(f"  • X test: {X_test.shape}")
print(f"  • Features: {X.columns.tolist()}")

  • ID: 117166 catégories
  • DATOP: 1096 catégories
  • FLTID: 1912 catégories
  • DEPSTN: 134 catégories
  • ARRSTN: 130 catégories
  • STD: 88709 catégories
  • STA: 92423 catégories
  • STATUS: 5 catégories
  • AC: 70 catégories
  • X train: (107833, 9)
  • X test: (9333, 9)
  • Features: ['ID', 'DATOP', 'FLTID', 'DEPSTN', 'ARRSTN', 'STD', 'STA', 'STATUS', 'AC']


In [7]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\n Split:")
print(f"  • Train: {X_train.shape}")
print(f"  • Validation: {X_val.shape}")



 Split:
  • Train: (86266, 9)
  • Validation: (21567, 9)


In [8]:
models = {}
predictions = {}
rmse_scores = {}

# LightGBM
DEF: 
**LightGBM** is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

* Faster training speed and higher efficiency.
* Lower memory usage.
* Better accuracy.
* Support of parallel, distributed, and GPU learning.
* Capable of handling large-scale data.

In [9]:

print("\n1️⃣ LightGBM...")
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)
lgb_model.fit(
    X_train, y_train, 
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
)
lgb_pred_val = lgb_model.predict(X_val)
lgb_rmse = np.sqrt(mean_squared_error(y_val, lgb_pred_val))
print(f" Validation RMSE: {lgb_rmse:.2f}")

models['LightGBM'] = lgb_model
predictions['LightGBM'] = lgb_model.predict(X_test)
rmse_scores['LightGBM'] = lgb_rmse


1️⃣ LightGBM...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 11390.4
 Validation RMSE: 106.73


# 2-XGBoost
In machine learning, XGBoost is a supervised ensemble learning algorithm. It combines the predictions of multiple "weak" models—typically shallow decision trees—to create a single "strong" predictive model. 
* **Gradient Boosting**: It builds trees sequentially, where each new tree is specifically designed to correct the errors (residuals) made by all previous trees.
* **Extreme**: The "Extreme" refers to the engineering optimizations that allow it to push the limits of computing power, making it much faster and more scalable than traditional gradient boosting machines (GBM

In [10]:

print("\n2️⃣ XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=0
)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=50,
    verbose=False
)
xgb_pred_val = xgb_model.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred_val))
print(f"   ✓ Validation RMSE: {xgb_rmse:.2f}")

models['XGBoost'] = xgb_model
predictions['XGBoost'] = xgb_model.predict(X_test)
rmse_scores['XGBoost'] = xgb_rmse


2️⃣ XGBoost...




   ✓ Validation RMSE: 105.23


# 3. Random Forest

In [11]:

print("\n3️⃣ Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=0
)
rf_model.fit(X_train, y_train)
rf_pred_val = rf_model.predict(X_val)
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_pred_val))
print(f"   ✓ Validation RMSE: {rf_rmse:.2f}")

models['RandomForest'] = rf_model
predictions['RandomForest'] = rf_model.predict(X_test)
rmse_scores['RandomForest'] = rf_rmse


3️⃣ Random Forest...
   ✓ Validation RMSE: 106.06


# 4. Gradient Boosting

In [12]:
print("\n4️⃣ Gradient Boosting...")
gb_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    random_state=42,
    verbose=0
)
gb_model.fit(X_train, y_train)
gb_pred_val = gb_model.predict(X_val)
gb_rmse = np.sqrt(mean_squared_error(y_val, gb_pred_val))
print(f"   ✓ Validation RMSE: {gb_rmse:.2f}")

models['GradientBoosting'] = gb_model
predictions['GradientBoosting'] = gb_model.predict(X_test)
rmse_scores['GradientBoosting'] = gb_rmse




4️⃣ Gradient Boosting...
   ✓ Validation RMSE: 105.85


In [16]:
# Ensemble predictions
#Calculate weights (inverse of RMSE)
total_inv_rmse = sum(1/score for score in rmse_scores.values())
weights = {model: (1/score)/total_inv_rmse for model, score in rmse_scores.items()}

for model, weight in sorted(weights.items(), key=lambda x: x[1], reverse=True):
    print(f"   {model:20s}: {weight:.3f} (RMSE: {rmse_scores[model]:.2f})")

# Weighted ensemble prediction
ensemble_pred = np.zeros(len(X_test))
for model_name, weight in weights.items():
    ensemble_pred += predictions[model_name] * weight

# Calculate ensemble validation RMSE
ensemble_val_pred = np.zeros(len(X_val))
for model_name, weight in weights.items():
    ensemble_val_pred += models[model_name].predict(X_val) * weight

ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_val_pred))
print(f"\n Validation RMSE: {ensemble_rmse:.2f}")


   XGBoost             : 0.252 (RMSE: 105.23)
   GradientBoosting    : 0.250 (RMSE: 105.85)
   RandomForest        : 0.250 (RMSE: 106.06)
   LightGBM            : 0.248 (RMSE: 106.73)

 Validation RMSE: 104.17


In [17]:
# Create submission file
submission = pd.DataFrame({
    'ID': test['ID'],
    'target': ensemble_pred.round().astype(int)
})

submission.to_csv('submission.csv', index=False)

# Feature importance from best model
best_model_name = min(rmse_scores, key=rmse_scores.get)
best_model = models[best_model_name]

if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    for idx, row in feature_importance.head(15).iterrows():
        print(f"   {row['feature']:30s} {row['importance']:.4f}")


print(f" Fichier de soumission: submission.csv")


   STATUS                         0.3731
   STA                            0.1000
   FLTID                          0.0914
   STD                            0.0855
   DATOP                          0.0783
   DEPSTN                         0.0721
   AC                             0.0715
   ARRSTN                         0.0666
   ID                             0.0617
 Fichier de soumission: submission.csv
