# Random Forest

Random Forest Regressor

In [None]:
#!/usr/bin/env python
# coding: utf-8

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Fill missing features
train_df[feature_cols] = train_df[feature_cols].fillna(-1)

# Sort training data by time for potential time-based validation
train_df = train_df.sort_values(by='ref_time')
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols]
y_train = tr[target_cols]

X_valid = va[feature_cols]
y_valid = va[target_cols]

# -------------------------------------
# Train a RandomForest Multi-Output Model
# -------------------------------------
base_model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)

val_preds = model.predict(X_valid)
val_rmse = np.sqrt(((y_valid.values - val_preds)**2).mean())
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

# The ID format: AIRPORT_YYMMDD_HHMM_OFFSET
# We need to group by the base time (AIRPORT, YYMMDD, HHMM)
# and predict the intervals (15,30,...180 min ahead).

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    # split by '_'
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])

    # Construct datetime from YYMMDD and HHMM
    # Assume year 20YY
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

# We'll create a dict to store all predictions
id_to_pred = {}

# Extract unique (airport, base_dt) pairs from submission_format
submission_format['parsed'] = submission_format['ID'].apply(parse_id)
# (airport, base_dt, offset)
# Group by (airport, base_dt)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

# We need to predict once for each (airport, base_dt) pair:
# Input data: [base_dt - 1h, base_dt)
# Predict for [base_dt, base_dt+3h] in 15-min intervals.

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

for (airport, base_dt), indices in grouped.groups.items():
    # Find test data rows in [base_dt - 1 hour, base_dt)
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        # No data found for this hour, fill with defaults
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        # Use the last row in this hour-block as the input state
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict intervals
    preds = model.predict(input_features)[0]  # shape (12,)

    # Fill id_to_pred
    # submission_format has rows with offsets 15,30,...180 for this (airport, base_dt).
    # We know offset = i*15 for intervals i=1..12
    # Let's map these predictions back to each ID
    # IDs in these indices share the same (airport, base_dt)
    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        # interval i where offset = i*15 => i = offset/15
        i = offset // 15
        # i runs from 1 to 12
        pred_val = preds[i-1]
        id_to_pred[row['ID']] = pred_val

# Map predictions to submission
submission_format['Value'] = submission_format['ID'].map(id_to_pred)
# If some IDs are missing predictions, fill with 0
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].round().astype(int)

# Drop the 'parsed' column before saving
submission_format = submission_format[['ID', 'Value']]

submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


# CatBoost Model

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
#!/usr/bin/env python
# coding: utf-8

# If CatBoost is not installed, uncomment the following line:
# !pip install catboost

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from catboost import CatBoostRegressor
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_CatBoost.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Fill missing features
train_df[feature_cols] = train_df[feature_cols].fillna(-1)

# Sort training data by time for potential time-based validation
train_df = train_df.sort_values(by='ref_time')
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols]
y_train = tr[target_cols]
X_valid = va[feature_cols]
y_valid = va[target_cols]

# -------------------------------------
# Train One CatBoost Model per Target Interval
# -------------------------------------
models = {}
for i, tcol in enumerate(target_cols, start=1):
    print(f"Training model for {tcol}...")
    # Create the CatBoostRegressor
    # We set eval_metric='RMSE', use early stopping (od_wait=50), and best model selection.
    cat_model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        eval_metric='RMSE',
        random_seed=42,
        od_wait=50,
        use_best_model=True,
        verbose=False
    )

    # Fit the model on this particular target
    cat_model.fit(
        X_train, y_train[tcol],
        eval_set=(X_valid, y_valid[tcol])
    )
    models[tcol] = cat_model

# Evaluate on validation set by predicting all intervals
val_preds = np.zeros(y_valid.shape)
for i, tcol in enumerate(target_cols):
    val_preds[:, i] = models[tcol].predict(X_valid)

val_rmse = np.sqrt(((y_valid.values - val_preds)**2).mean())
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

id_to_pred = {}

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained CatBoost models
    preds = []
    for i, tcol in enumerate(target_cols, start=1):
        p = models[tcol].predict(input_features)[0]
        preds.append(p)
    preds = np.array(preds)

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        i = offset // 15
        pred_val = preds[i-1]
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].round().astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)







Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
Training model for interval_1_Arrival...
Training model for interval_2_Arrival...
Training model for interval_3_Arrival...
Training model for interval_4_Arrival...
Training model for interval_5_Arrival...
Training model for interval_6_Arrival...
Training model for interval_7_Arrival...
Training model for interval_8_Arrival...
Training model for interval_9_Arrival...
Training model for interval_10_Arrival...
Training model for interval_11_Arrival...
Training model for interval_12_Arrival...
Validation RMSE: 20.64216254389649
Submission saved to NASAsubmission_CatBoost.csv


# Improved CatBoost

In [None]:
!pip install catboost --upgrade

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
#!/usr/bin/env python
# coding: utf-8

# If CatBoost is not installed, uncomment the following line:
# !pip install catboost --upgrade

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from catboost import CatBoostRegressor
from datetime import datetime, timedelta
import math

# -------------------------------------
# Mount Google Drive
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_CatBoost_Improved.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Feature Engineering
# -------------------------------------
def add_time_features(df):
    # Extract basic time features
    df['hour'] = df['ref_time'].dt.hour
    df['day_of_week'] = df['ref_time'].dt.dayofweek
    df['month'] = df['ref_time'].dt.month

    # Cyclical encoding for hour_of_day
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)

    # Cyclical encoding for day_of_week (0-6)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7.0)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7.0)

    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# -------------------------------------
# Add Lag Features
# Lag features capture recent historical arrivals which can help predict future arrivals.
# We will use interval_1_Arrival as a proxy for short-term trend.
# You can add more lags or lags for other intervals if desired.
for lag in [1, 2, 3]:  # 3 previous steps of 15-min intervals
    train_df[f'lag_{lag}_arr'] = train_df.groupby('airport_id')['interval_1_Arrival'].shift(lag)
    test_df[f'lag_{lag}_arr'] = test_df.groupby('airport_id')['interval_1_Arrival'].shift(lag)

# Fill missing lag values with -1
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Sort training data by time for time-based validation
train_df = train_df.sort_values(by='ref_time')

# -------------------------------------
# Prepare Training and Validation Sets
# -------------------------------------
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols].copy()
y_train = tr[target_cols].copy()
X_valid = va[feature_cols].copy()
y_valid = va[target_cols].copy()

# Apply log transform to targets to stabilize variance
y_train_trans = np.log1p(y_train)
y_valid_trans = np.log1p(y_valid)

# -------------------------------------
# Train a Single CatBoost Model for All Targets
# CatBoost supports multi-target regression using `loss_function='MultiRMSE'`
# -------------------------------------
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=7,
    eval_metric='MultiRMSE',
    random_seed=42,
    od_wait=100,
    use_best_model=True,
    verbose=50,
    loss_function='MultiRMSE'
)

cat_model.fit(
    X_train, y_train_trans,
    eval_set=(X_valid, y_valid_trans)
)

# Evaluate on validation set
val_preds_trans = cat_model.predict(X_valid)
# Transform predictions back
val_preds = np.expm1(val_preds_trans)

val_rmse = np.sqrt(np.mean((y_valid.values - val_preds)**2))
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])

# Ensure no missing features in test
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

id_to_pred = {}

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained CatBoost model
    preds_trans = cat_model.predict(input_features)  # shape (1,12)
    preds_final = np.expm1(preds_trans)  # revert log transform

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        # offset is multiple of 15 minutes: 15->interval_1,30->2,...180->12
        i = offset // 15
        pred_val = preds_final[0, i-1]
        # Round and ensure int
        pred_val = int(round(pred_val))
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Mounted at /content/drive
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
0:	learn: 5.3488974	test: 5.6287032	best: 5.6287032 (0)	total: 209ms	remaining: 6m 58s
50:	learn: 2.9234501	test: 3.3427249	best: 3.3427249 (50)	total: 7.8s	remaining: 4m 57s
100:	learn: 2.4490455	test: 2.9778211	best: 2.9778211 (100)	total: 15.1s	remaining: 4m 44s
150:	learn: 2.2634019	test: 2.8676636	best: 2.8676636 (150)	total: 22.7s	remaining: 4m 38s
200:	learn: 2.1537741	test: 2.7948338	best: 2.7948338 (200)	total: 30.1s	remaining: 4m 29s
250:	learn: 2.0725161	test: 2.7473446	best: 2.7473446 (250)	total: 37.7s	remaining: 4m 22s
300:	learn: 2.0086270	test: 2.7196582	best: 2.7196582 (300)	total: 45.1s	remaining: 4m 14s
350:	learn: 1.9555522	test: 2.6981976	best: 2.6981976 (350)	total: 52.6s	remaining: 4m 7s
400:	learn: 1.9082601	test: 2.6830220	best: 2.6825472 (396)	total: 1m	remaining: 3m 59s
450:	learn: 1.8706027	test: 2.6650825	best: 2.6650825 (450)	total: 1m 7s	remaining: 3m 52s
500:

In [None]:
#!/usr/bin/env python
# coding: utf-8

# If CatBoost is not installed, uncomment the following line:
# !pip install catboost --upgrade

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from catboost import CatBoostRegressor
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive (Adjust as needed)
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_CatBoost_Improved_V3.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Feature Engineering: Time-based features (optional, but often helpful)
# -------------------------------------
def add_time_features(df):
    # Extract basic time features
    df['hour'] = df['ref_time'].dt.hour
    df['day_of_week'] = df['ref_time'].dt.dayofweek
    df['month'] = df['ref_time'].dt.month

    # Cyclical encoding for hour_of_day
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)

    # Cyclical encoding for day_of_week (0-6)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7.0)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7.0)
    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# -------------------------------------
# Add Lag and Rolling Features
# -------------------------------------
# We assume that data is recorded at 15-minute intervals.
# We'll create lag features for interval_1_Arrival for the last 4 intervals (1 hour).
# We'll also create rolling mean and std over the last hour of interval_1_Arrival.

def add_lag_rolling_features(df):
    df = df.sort_values(['airport_id', 'ref_time'])

    # Create lag features using transform
    g = df.groupby('airport_id')['interval_1_Arrival']
    df['lag_1_arr'] = g.transform(lambda x: x.shift(1))
    df['lag_2_arr'] = g.transform(lambda x: x.shift(2))
    df['lag_3_arr'] = g.transform(lambda x: x.shift(3))
    df['lag_4_arr'] = g.transform(lambda x: x.shift(4))

    # Rolling features using transform as well
    df['roll_1h_mean_arr'] = g.transform(lambda x: x.shift(1).rolling(4).mean())
    df['roll_1h_std_arr'] = g.transform(lambda x: x.shift(1).rolling(4).std())

    return df


train_df = add_lag_rolling_features(train_df)
test_df = add_lag_rolling_features(test_df)

# Fill missing values introduced by shifting and rolling
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Sort training data by time for time-based validation
train_df = train_df.sort_values(by='ref_time')

# -------------------------------------
# Prepare Training and Validation Sets
# -------------------------------------
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols].copy()
y_train = tr[target_cols].copy()
X_valid = va[feature_cols].copy()
y_valid = va[target_cols].copy()

# Apply log transform to targets to stabilize variance
y_train_trans = np.log1p(y_train)
y_valid_trans = np.log1p(y_valid)

# -------------------------------------
# Train a Single CatBoost Model for All Targets
# -------------------------------------
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=7,
    eval_metric='MultiRMSE',
    random_seed=42,
    od_wait=200,
    use_best_model=True,
    verbose=50,
    loss_function='MultiRMSE'
)

cat_model.fit(
    X_train, y_train_trans,
    eval_set=(X_valid, y_valid_trans)
)

# Evaluate on validation set
val_preds_trans = cat_model.predict(X_valid)
val_preds = np.expm1(val_preds_trans)
val_rmse = np.sqrt(np.mean((y_valid.values - val_preds)**2))
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

id_to_pred = {}

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        # Take the last row of available data before prediction time
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained CatBoost model
    preds_trans = cat_model.predict(input_features)  # shape (1,12)
    preds_final = np.expm1(preds_trans)  # revert log transform

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        # offset is multiple of 15 minutes: 15->interval_1,30->2,...180->12
        i = offset // 15
        pred_val = preds_final[0, i-1]
        pred_val = int(round(pred_val))
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
0:	learn: 5.3501075	test: 5.6297443	best: 5.6297443 (0)	total: 161ms	remaining: 5m 21s
50:	learn: 2.8488866	test: 3.1625870	best: 3.1625870 (50)	total: 8.04s	remaining: 5m 7s
100:	learn: 2.3658794	test: 2.7508969	best: 2.7508969 (100)	total: 15.6s	remaining: 4m 52s
150:	learn: 2.1874053	test: 2.6356379	best: 2.6356379 (150)	total: 23.4s	remaining: 4m 46s
200:	learn: 2.0806689	test: 2.5961678	best: 2.5890694 (188)	total: 31s	remaining: 4m 37s
250:	learn: 2.0046565	test: 2.5558866	best: 2.5558866 (250)	total: 38.9s	remaining: 4m 31s
300:	learn: 1.9470762	test: 2.5399908	best: 2.5399908 (300)	total: 46.6s	remaining: 4m 23s
350:	learn: 1.8998575	test: 2.5224719	best: 2.5224719 (350)	total: 54.7s	remaining: 4m 17s
400:	learn: 1.8592359	test: 2.5190570	best: 2.5187098 (359)	total: 1m 2s	remainin

# CatBoost Improved with lag and rolling features with weather

1.   Added lag and rolling features for selected weather columns.
2.   Added additional lag and rolling windows for arrival data.
3.   Introduced simple binary flags for severe weather conditions.
4.   Included lags for runway features, as these might influence operational capacity and thus arrivals.
5.   As always, after adding new features, we fill in missing values.



In [None]:
!pip install catboost --upgrade

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
#!/usr/bin/env python
# coding: utf-8

# If CatBoost is not installed, uncomment the following line:
# !pip install catboost --upgrade

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from catboost import CatBoostRegressor
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive (Adjust as needed)
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_CatBoost_Improved_Weather_V4.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Time-Based Features
# -------------------------------------
def add_time_features(df):
    df['hour'] = df['ref_time'].dt.hour
    df['day_of_week'] = df['ref_time'].dt.dayofweek
    df['month'] = df['ref_time'].dt.month

    # Cyclical encoding for hour_of_day
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)

    # Cyclical encoding for day_of_week (0-6)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7.0)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7.0)
    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# -------------------------------------
# Add Lag and Rolling Features
# -------------------------------------
def add_lag_rolling_features(df):
    df = df.sort_values(['airport_id', 'ref_time'])

    # Arrival-based features
    g_arr = df.groupby('airport_id')['interval_1_Arrival']

    # More extensive lags for arrivals (up to 2 hours back = 8 intervals)
    for lag in [1,2,3,4,5,6,7,8]:
        df[f'lag_{lag}_arr'] = g_arr.transform(lambda x: x.shift(lag))

    # Rolling stats for arrivals at different windows
    df['roll_1h_mean_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(4).mean())
    df['roll_1h_std_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(4).std())

    df['roll_30m_mean_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(2).mean())
    df['roll_30m_std_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(2).std())

    df['roll_2h_mean_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(8).mean())
    df['roll_2h_std_arr'] = g_arr.transform(lambda x: x.shift(1).rolling(8).std())

    # Differences to capture trends
    df['diff_arr_1h'] = df['interval_1_Arrival'] - df['lag_4_arr']
    df['diff_arr_2h'] = df['interval_1_Arrival'] - df['lag_8_arr']

    # Weather-related columns (adjust based on available columns)
    weather_cols = ['temperature', 'wind_speed', 'wind_gust', 'visibility', 'cloud_ceiling', 'precip']
    # If you have wind_direction, consider using sin/cos transforms similarly to time.
    # For cloud, if it's categorical, consider encoding it or using it as is.

    for wcol in weather_cols:
        g_w = df.groupby('airport_id')[wcol]

        # Lags for weather features
        for lag in [1,2,4,8]:  # These represent 15 min, 30 min, 1 hr, 2 hrs lags
            df[f'lag_{lag}_{wcol}'] = g_w.transform(lambda x: x.shift(lag))

        # Rolling features for weather
        # 1-hour rolling mean & std
        df[f'roll_1h_mean_{wcol}'] = g_w.transform(lambda x: x.shift(1).rolling(4).mean())
        df[f'roll_1h_std_{wcol}'] = g_w.transform(lambda x: x.shift(1).rolling(4).std())

        # 2-hour rolling mean & max (example)
        df[f'roll_2h_mean_{wcol}'] = g_w.transform(lambda x: x.shift(1).rolling(8).mean())
        df[f'roll_2h_max_{wcol}'] = g_w.transform(lambda x: x.shift(1).rolling(8).max())

    # Binary flags for severe conditions (example thresholds)
    df['low_visibility_flag'] = (df['visibility'] < 2.0).astype(int)
    df['high_wind_gust_flag'] = (df['wind_gust'] > 20.0).astype(int)

    # Runway usage features
    # Lags for arrival_runways and departure_runways
    for lag in [1,4]:
        df[f'lag_{lag}_arrival_runways'] = df.groupby('airport_id')['arrival_runways'].transform(lambda x: x.shift(lag))
        df[f'lag_{lag}_departure_runways'] = df.groupby('airport_id')['departure_runways'].transform(lambda x: x.shift(lag))

    # Rolling mean of runways in the last hour
    df['roll_1h_mean_arrival_runways'] = df.groupby('airport_id')['arrival_runways'].transform(lambda x: x.shift(1).rolling(4).mean())
    df['roll_1h_mean_departure_runways'] = df.groupby('airport_id')['departure_runways'].transform(lambda x: x.shift(1).rolling(4).mean())

    return df

train_df = add_lag_rolling_features(train_df)
test_df = add_lag_rolling_features(test_df)

# Fill missing values introduced by shifting and rolling
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Sort training data by time for time-based validation
train_df = train_df.sort_values(by='ref_time')

# -------------------------------------
# Prepare Training and Validation Sets
# -------------------------------------
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols].copy()
y_train = tr[target_cols].copy()
X_valid = va[feature_cols].copy()
y_valid = va[target_cols].copy()

# Apply log transform to targets to stabilize variance
y_train_trans = np.log1p(y_train)
y_valid_trans = np.log1p(y_valid)

# -------------------------------------
# Train a Single CatBoost Model for All Targets
# -------------------------------------
cat_model = CatBoostRegressor(
    iterations=5000,
    learning_rate=0.01,
    depth=7,
    l2_leaf_reg=5,       # Add regularization
    subsample=0.8,       # Sample 90% of rows for each tree
    rsm=0.8,             # Sample 90% of features at each split
    bootstrap_type='Bernoulli',
    eval_metric='MultiRMSE',
    random_seed=42,
    od_wait=200,
    use_best_model=True,
    verbose=50,
    loss_function='MultiRMSE'
)

cat_model.fit(
    X_train, y_train_trans,
    eval_set=(X_valid, y_valid_trans)
)

# Evaluate on validation set
val_preds_trans = cat_model.predict(X_valid)
val_preds = np.expm1(val_preds_trans)
val_rmse = np.sqrt(np.mean((y_valid.values - val_preds)**2))
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

id_to_pred = {}

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        # Take the last row of available data before prediction time
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained CatBoost model
    preds_trans = cat_model.predict(input_features)  # shape (1,12)
    preds_final = np.expm1(preds_trans)  # revert log transform

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        # offset is multiple of 15 minutes: 15->interval_1,30->2,...180->12
        i = offset // 15
        pred_val = preds_final[0, i-1]
        pred_val = int(round(pred_val))
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Mounted at /content/drive
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
0:	learn: 5.4231967	test: 5.6983092	best: 5.6983092 (0)	total: 261ms	remaining: 21m 45s
50:	learn: 4.0413327	test: 4.3225806	best: 4.3225806 (50)	total: 11s	remaining: 17m 44s
100:	learn: 3.2584720	test: 3.5420610	best: 3.5420610 (100)	total: 21.3s	remaining: 17m 14s
150:	learn: 2.8186425	test: 3.1022905	best: 3.1022905 (150)	total: 31.9s	remaining: 17m 4s
200:	learn: 2.5684776	test: 2.8614348	best: 2.8614348 (200)	total: 42.4s	remaining: 16m 51s
250:	learn: 2.4126476	test: 2.7146611	best: 2.7146611 (250)	total: 52.6s	remaining: 16m 34s
300:	learn: 2.3064522	test: 2.6157189	best: 2.6157189 (300)	total: 1m 3s	remaining: 16m 23s
350:	learn: 2.2293908	test: 2.5447722	best: 2.5447722 (350)	total: 1m 13s	remaining: 16m 13s
400:	learn: 2.1694531	test: 2.4917004	best: 2.4917004 (400)	total: 1m 23s	remaining: 16m 1s
450:	learn: 2.1206004	test: 2.4508010	best: 2.4508010 (450)	total: 1m 34s	remaining

# LightGBM

In [None]:
!pip install --upgrade lightgbm




In [None]:
#!/usr/bin/env python
# coding: utf-8

# If LightGBM is not installed, uncomment the following line:
# !pip install --upgrade lightgbm


from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_lightgbm_V1.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Fill missing features
train_df[feature_cols] = train_df[feature_cols].fillna(-1)

# Sort training data by time for potential time-based validation
train_df = train_df.sort_values(by='ref_time')
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols]
y_train = tr[target_cols]
X_valid = va[feature_cols]
y_valid = va[target_cols]

# -------------------------------------
# Train One LightGBM Model per Target Interval
# -------------------------------------
models = {}
for i, tcol in enumerate(target_cols, start=1):
    print(f"Training model for {tcol}...")
    lgb_model = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1
    )
    lgb_model.fit(X_train, y_train[tcol],
                  eval_set=[(X_valid, y_valid[tcol])],
                  eval_metric='rmse',
                  callbacks=[early_stopping(50)]
    )

    models[tcol] = lgb_model

# Evaluate on validation set by predicting all intervals
val_preds = np.zeros(y_valid.shape)
for i, tcol in enumerate(target_cols):
    val_preds[:, i] = models[tcol].predict(X_valid)

val_rmse = np.sqrt(((y_valid.values - val_preds)**2).mean())
print("Validation RMSE:", val_rmse)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

id_to_pred = {}

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained LightGBM models
    preds = []
    for i, tcol in enumerate(target_cols, start=1):
        p = models[tcol].predict(input_features)[0]
        preds.append(p)
    preds = np.array(preds)

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        i = offset // 15
        pred_val = preds[i-1]
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].round().astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Mounted at /content/drive
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
Training model for interval_1_Arrival...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10104
[LightGBM] [Info] Number of data points in the train set: 77627, number of used features: 47
[LightGBM] [Info] Start training from score 32.744007
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[198]	valid_0's rmse: 21.3631	valid_0's l2: 456.383
Training model for interval_2_Arrival...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10104
[LightGBM] [Info] Number of data points in the train set: 77627, number of used features: 47
[LightGBM] [Info] Start tr

# To-Be-Improved LightGBM

In [None]:
!pip install --upgrade lightgbm scikit-learn
!pip install dask[dataframe]



In [None]:
#!/usr/bin/env python
# coding: utf-8

# If LightGBM or scikit-learn is not installed or not up-to-date:
# !pip install --upgrade lightgbm scikit-learn

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from datetime import datetime, timedelta
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

# -------------------------------------
# Mount Google Drive
# -------------------------------------
drive.mount('/content/drive')

# Adjust paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_lightgbm_improved_V2.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
original_feature_cols = [c for c in all_cols if c not in exclude_cols]

# -------------------------------------
# Feature Engineering: Time Features
# -------------------------------------
def add_time_features(df):
    df['hour'] = df['ref_time'].dt.hour
    df['dayofweek'] = df['ref_time'].dt.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# -------------------------------------
# Add a simple lag feature: average arrivals in previous hour
# -------------------------------------
# We take the mean of all interval_X_Arrival from the previous row (assuming 1 row per hour)
# If multiple rows per hour exist in your data, consider grouping by hour. Adjust accordingly.
train_df = train_df.sort_values('ref_time')
# Shift all arrival intervals by 4 intervals if each interval is 15 min.
# If your data is hourly, adjust accordingly.
# Here we assume each row is 1-hour block based on the problem statement (please adapt if not).
lag_intervals = train_df[target_cols].shift(1)  # shift by 1 row (assumes 1 row per hour)
train_df['prev_hour_arrivals_mean'] = lag_intervals.mean(axis=1)

# For test data, do the same (no future info is leaked since we only shift)
test_df = test_df.sort_values('ref_time')
lag_intervals_test = test_df[target_cols].shift(1)
test_df['prev_hour_arrivals_mean'] = lag_intervals_test.mean(axis=1)

# Update feature_cols with the new feature
additional_features = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'prev_hour_arrivals_mean']
feature_cols = original_feature_cols + additional_features

# -------------------------------------
# Prepare Training Data
# -------------------------------------
train_df = train_df.dropna(subset=target_cols)  # remove rows with missing targets
train_df[feature_cols] = train_df[feature_cols].fillna(-1)

# Sort by time
train_df = train_df.sort_values(by='ref_time')
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train_full = tr[feature_cols]
y_train_full = tr[target_cols]
X_valid = va[feature_cols]
y_valid = va[target_cols]

# We'll use a fraction of data for tuning
sample_fraction = 0.4
sample_size = int(len(X_train_full)*sample_fraction)
X_tune = X_train_full.iloc[:sample_size]
y_tune = y_train_full.iloc[:sample_size]

# -------------------------------------
# Define RMSE scorer
# -------------------------------------
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
rmse_scorer = make_scorer(rmse, greater_is_better=False)

tuning_target = 'interval_1_Arrival'

# -------------------------------------
# More thorough parameter search
# -------------------------------------
param_distributions = {
    'num_leaves': [31, 63, 127, 255],
    'max_depth': [-1, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_samples': [5, 20, 50, 100],
    'feature_fraction': [0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.7, 0.8, 0.9, 1.0],
    'lambda_l1': [0, 0.1, 1, 2],
    'lambda_l2': [0, 0.1, 1, 2]
}

# Use TimeSeriesSplit for time-based CV
tscv = TimeSeriesSplit(n_splits=3)
search_model = LGBMRegressor(n_estimators=150, random_state=42)

search = RandomizedSearchCV(
    estimator=search_model,
    param_distributions=param_distributions,
    n_iter=10,  # more iterations
    scoring=rmse_scorer,
    cv=tscv,     # time-based CV
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X_tune, y_tune[tuning_target])
best_params = search.best_params_
print("Best params found:", best_params)

# -------------------------------------
# Train Final Models with Best Params
# -------------------------------------
models = {}
for tcol in target_cols:
    print(f"Training final model for {tcol} with best params...")
    final_model = LGBMRegressor(
        n_estimators=600,  # More trees to fit now that we have good params
        random_state=42,
        verbose=-1,
        **best_params
    )
    final_model.fit(
        X_train_full, y_train_full[tcol],
        eval_set=[(X_valid, y_valid[tcol])],
        eval_metric='rmse',
        callbacks=[early_stopping(50), log_evaluation(100)]
    )
    models[tcol] = final_model

# Evaluate on validation set
val_preds = np.zeros(y_valid.shape)
for i, tcol in enumerate(target_cols):
    val_preds[:, i] = models[tcol].predict(X_valid)

val_rmse_score = rmse(y_valid.values, val_preds)
print("Validation RMSE after improved tuning and FE:", val_rmse_score)

# -------------------------------------
# Process Submission Format and Predict on Test
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df[feature_cols] = test_df[feature_cols].fillna(-1)

id_to_pred = {}
for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt
    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    preds = []
    for tcol in target_cols:
        p = models[tcol].predict(input_features)[0]
        preds.append(p)
    preds = np.array(preds)

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        i = offset // 15
        pred_val = preds[i-1]
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].round().astype(int)
submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Mounted at /content/drive
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
Fitting 3 folds for each of 10 candidates, totalling 30 fits




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9879
[LightGBM] [Info] Number of data points in the train set: 31050, number of used features: 52
[LightGBM] [Info] Start training from score 31.608824
Best params found: {'num_leaves': 127, 'min_child_samples': 100, 'max_depth': -1, 'learning_rate': 0.1, 'lambda_l2': 1, 'lambda_l1': 0.1, 'feature_fraction': 0.7, 'bagging_fraction': 1.0}
Training final model for interval_1_Arrival with best params...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 17.1684	valid_0's l2: 294.754
Early stopping, best iteration is:
[130]	valid_0's rmse: 17.1543	valid_0's l2: 294.271
Training final model for interval_2_Arrival with best params...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 17.6362	valid_0's l2: 311.037
Early stopping, best iterati

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Submission saved to NASAsubmission_lightgbm_improved_V2.csv




# CatBoost Improved with Lag and Rolling feature, Tuned with parameters from Improved_V3

In [1]:
!pip install catboost --upgrade

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [4]:
#!/usr/bin/env python
# coding: utf-8

# If CatBoost is not installed, uncomment the following line:
# !pip install catboost --upgrade

from google.colab import drive
import pandas as pd
import glob
import os
import numpy as np
from catboost import CatBoostRegressor
from datetime import datetime, timedelta

# -------------------------------------
# Mount Google Drive (Adjust as needed)
# -------------------------------------
drive.mount('/content/drive')

# Adjust these paths according to your directory structure
train_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/train"
test_path = "drive/MyDrive/Colab Notebooks/dataset/FUSER/test"
submission_format_file = "drive/MyDrive/Colab Notebooks/dataset/FUSER/submission_format.csv"
output_submission_file = "NASAsubmission_CatBoost_Improved_V3_TunedV2.csv"

# -------------------------------------
# Load Training Data
# -------------------------------------
train_files = glob.glob(os.path.join(train_path, "*.csv"))
train_dfs = []
for f in train_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    train_dfs.append(df)

train_df = pd.concat(train_dfs, ignore_index=True)
print("Training DataFrame shape:", train_df.shape)

# -------------------------------------
# Load Test Data
# -------------------------------------
test_files = glob.glob(os.path.join(test_path, "*.csv"))
test_dfs = []
for f in test_files:
    df = pd.read_csv(f, parse_dates=['ref_time'])
    test_dfs.append(df)

test_df = pd.concat(test_dfs, ignore_index=True)
print("Test DataFrame shape:", test_df.shape)

# -------------------------------------
# Feature Engineering: Time-based features (optional, but often helpful)
# -------------------------------------
def add_time_features(df):
    # Extract basic time features
    df['hour'] = df['ref_time'].dt.hour
    df['day_of_week'] = df['ref_time'].dt.dayofweek
    df['month'] = df['ref_time'].dt.month

    # Cyclical encoding for hour_of_day
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24.0)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24.0)

    # Cyclical encoding for day_of_week (0-6)
    df['dow_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7.0)
    df['dow_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7.0)
    return df

train_df = add_time_features(train_df)
test_df = add_time_features(test_df)

# -------------------------------------
# Add Lag and Rolling Features
# -------------------------------------
# We assume that data is recorded at 15-minute intervals.
# We'll create lag features for interval_1_Arrival for the last 4 intervals (1 hour).
# We'll also create rolling mean and std over the last hour of interval_1_Arrival.

def add_lag_rolling_features(df):
    df = df.sort_values(['airport_id', 'ref_time'])

    # Create lag features using transform
    g = df.groupby('airport_id')['interval_1_Arrival']
    df['lag_1_arr'] = g.transform(lambda x: x.shift(1))
    df['lag_2_arr'] = g.transform(lambda x: x.shift(2))
    df['lag_3_arr'] = g.transform(lambda x: x.shift(3))
    df['lag_4_arr'] = g.transform(lambda x: x.shift(4))

    # Rolling features using transform as well
    df['roll_1h_mean_arr'] = g.transform(lambda x: x.shift(1).rolling(4).mean())
    df['roll_1h_std_arr'] = g.transform(lambda x: x.shift(1).rolling(4).std())

    return df


train_df = add_lag_rolling_features(train_df)
test_df = add_lag_rolling_features(test_df)

# Fill missing values introduced by shifting and rolling
train_df.fillna(-1, inplace=True)
test_df.fillna(-1, inplace=True)

# -------------------------------------
# Identify Target and Feature Columns
# -------------------------------------
target_cols = [f'interval_{i}_Arrival' for i in range(1, 13)]
exclude_cols = ['airport_id', 'ref_time'] + target_cols
all_cols = train_df.columns.tolist()
feature_cols = [c for c in all_cols if c not in exclude_cols]

# Drop rows without target in training
train_df = train_df.dropna(subset=target_cols)

# Sort training data by time for time-based validation
train_df = train_df.sort_values(by='ref_time')

# -------------------------------------
# Prepare Training and Validation Sets
# -------------------------------------
train_fraction = 0.8
split_idx = int(len(train_df)*train_fraction)
tr = train_df.iloc[:split_idx]
va = train_df.iloc[split_idx:]

X_train = tr[feature_cols].copy()
y_train = tr[target_cols].copy()
X_valid = va[feature_cols].copy()
y_valid = va[target_cols].copy()

# Apply log transform to targets to stabilize variance
y_train_trans = np.log1p(y_train)
y_valid_trans = np.log1p(y_valid)

# -------------------------------------
# Train a Single CatBoost Model for All Targets
# -------------------------------------
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.015,  # changed from 0.03
    depth=6,
    l2_leaf_reg = 5,   # added
    eval_metric='MultiRMSE',
    random_seed=42,
    od_wait=100,       # changed
    use_best_model=True,
    verbose=50,
    loss_function='MultiRMSE'
)

cat_model.fit(
    X_train, y_train_trans,
    eval_set=(X_valid, y_valid_trans)
)

# Evaluate on validation set
val_preds_trans = cat_model.predict(X_valid)
val_preds = np.expm1(val_preds_trans)
val_rmse = np.sqrt(np.mean((y_valid.values - val_preds)**2))
print("Validation RMSE:", val_rmse)

# Compute the final score using the given transformation
K = 10
val_score = np.exp(-val_rmse / K)
print("Validation Score (exp(-RMSE/10)):", val_score)

# -------------------------------------
# Process Submission Format and Predict
# -------------------------------------
submission_format = pd.read_csv(submission_format_file)

def parse_id(ID):
    # Example: KDEN_220925_0100_15
    parts = ID.split('_')
    airport = parts[0]
    date_str = parts[1]  # YYMMDD
    time_str = parts[2]  # HHMM
    offset = int(parts[3])
    base_dt = datetime.strptime(date_str+time_str, "%y%m%d%H%M")
    return airport, base_dt, offset

submission_format['parsed'] = submission_format['ID'].apply(parse_id)
grouped = submission_format.groupby(lambda i: (submission_format.at[i,'parsed'][0], submission_format.at[i,'parsed'][1]))

test_df = test_df.sort_values(['airport_id','ref_time'])
test_df[feature_cols] = test_df[feature_cols].fillna(-1)

id_to_pred = {}

for (airport, base_dt), indices in grouped.groups.items():
    start_time = base_dt - timedelta(hours=1)
    end_time = base_dt

    condition = (
        (test_df['airport_id'] == airport) &
        (test_df['ref_time'] >= start_time) &
        (test_df['ref_time'] < end_time)
    )
    hour_block = test_df[condition]

    if hour_block.empty:
        input_features = np.array([-1]*len(feature_cols)).reshape(1, -1)
    else:
        # Take the last row of available data before prediction time
        last_row = hour_block.iloc[-1]
        input_features = last_row[feature_cols].values.reshape(1, -1)

    # Predict all intervals using the trained CatBoost model
    preds_trans = cat_model.predict(input_features)  # shape (1,12)
    preds_final = np.expm1(preds_trans)  # revert log transform

    sub_rows = submission_format.loc[indices]
    for idx, row in sub_rows.iterrows():
        offset = row['parsed'][2]
        # offset is multiple of 15 minutes: 15->interval_1,30->2,...180->12
        i = offset // 15
        pred_val = preds_final[0, i-1]
        pred_val = int(round(pred_val))
        id_to_pred[row['ID']] = pred_val

submission_format['Value'] = submission_format['ID'].map(id_to_pred)
submission_format['Value'] = submission_format['Value'].fillna(0)
submission_format['Value'] = submission_format['Value'].astype(int)

submission_format = submission_format[['ID', 'Value']]
submission_format.to_csv(output_submission_file, index=False)
print("Submission saved to", output_submission_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training DataFrame shape: (97034, 61)
Test DataFrame shape: (5280, 61)
0:	learn: 5.4061021	test: 5.6832526	best: 5.6832526 (0)	total: 117ms	remaining: 3m 53s
50:	learn: 3.6709446	test: 3.9712805	best: 3.9712805 (50)	total: 6.13s	remaining: 3m 54s
100:	learn: 2.9486859	test: 3.2505300	best: 3.2505300 (100)	total: 11.9s	remaining: 3m 43s
150:	learn: 2.6308228	test: 2.9546711	best: 2.9546711 (150)	total: 17.7s	remaining: 3m 36s
200:	learn: 2.4575516	test: 2.8317505	best: 2.8317505 (200)	total: 23.7s	remaining: 3m 31s
250:	learn: 2.3508281	test: 2.7647627	best: 2.7647627 (250)	total: 29.5s	remaining: 3m 25s
300:	learn: 2.2732608	test: 2.7053496	best: 2.7053496 (300)	total: 35.5s	remaining: 3m 20s
350:	learn: 2.2135555	test: 2.6665384	best: 2.6665384 (350)	total: 41.3s	remaining: 3m 13s
400:	learn: 2.1648081	test: 2.6431098	best: 2.6431098 (400)	total: 47.2s	remai