In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import gc

# ===================================================================
# 1. Load and Prepare Data
# ===================================================================
print("Loading data...")
df = pd.read_csv("/content/normalized_features.csv")

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.sort_values(by=['Location', 'Timestamp']).reset_index(drop=True)

# ==> CHANGE 1: Apply log transformation to the target variable 'out'
# We create a new column for the transformed target to keep the original for validation.
df['out_transformed'] = np.log1p(df['out'])

le = LabelEncoder()
df['Location_encoded'] = le.fit_transform(df['Location'])


# ===================================================================
# 2. Feature Engineering for Time-Series
# ===================================================================
print("Engineering features...")

def create_time_series_features(df_in):
    df_out = df_in.copy()
    features_to_lag = ['out', 't2m', 'gust', 'cape', 'prate']
    lag_periods = [1, 2, 3, 24, 48]

    for feature in features_to_lag:
        for lag in lag_periods:
            df_out[f'{feature}_lag_{lag}'] = df_out.groupby('Location')[feature].shift(lag)

    df_out['gust_roll_max_3h'] = df_out.groupby('Location')['gust'].transform(
        lambda x: x.shift(1).rolling(window=3).max()
    )
    df_out['hour'] = df_out['Timestamp'].dt.hour
    df_out['dayofweek'] = df_out['Timestamp'].dt.dayofweek
    return df_out

df_features = create_time_series_features(df)

features = [col for col in df_features.columns if col not in [
    'out', 'out_transformed', 'Timestamp', 'Location'
]]


# ===================================================================
# 3. Create a Time-Based Validation Split
# ===================================================================
print("Creating time-based train/validation split...")
validation_start_date = df_features['Timestamp'].max() - pd.Timedelta(days=7)

train_df = df_features[df_features['Timestamp'] < validation_start_date].copy()
val_df = df_features[df_features['Timestamp'] >= validation_start_date].copy()

train_df.dropna(inplace=True)


# ===================================================================
# 4. Train Models with Direct Forecasting Strategy
# ===================================================================
print("Starting model training...")
MAX_HORIZON = 48
models = {}

for h in range(1, MAX_HORIZON + 1):
    print(f"\n===== Training model for horizon t+{h} =====")

    # Use the TRANSFORMED 'out' column as the target
    y_train_h = train_df.groupby('Location')['out_transformed'].shift(-h)
    X_train_h = train_df[features]

    valid_indices = ~y_train_h.isna()
    X_train_h, y_train_h = X_train_h[valid_indices], y_train_h[valid_indices]

    # Create validation set for this horizon
    X_val_h = val_df[features]
    y_val_h = val_df.groupby('Location')['out_transformed'].shift(-h).dropna()
    X_val_h = X_val_h.loc[y_val_h.index]

    # ==> CHANGE 2: Objective is 'regression_l2' (RMSE)
    lgbm = lgb.LGBMRegressor(
        objective='regression_l2',
        n_estimators=2000,
        learning_rate=0.01,
        num_leaves=40,
        random_state=42,
        n_jobs=-1,
        colsample_bytree=0.7,
        subsample=0.7,
    )

    lgbm.fit(
        X_train_h, y_train_h,
        eval_set=[(X_val_h, y_val_h)],
        eval_metric='rmse',
        # ==> CHANGE 3: Callback will now print the loss as it improves
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )

    models[h] = lgbm
    gc.collect()

print("\nModel training complete.")


# ===================================================================
# 5. Generate and Evaluate Predictions
# ===================================================================
print("Generating and evaluating predictions on the validation set...")

validation_predictions = []
for loc_id in val_df['Location'].unique():
    loc_df = val_df[val_df['Location'] == loc_id].copy()

    # Use the features from the first timestamp of the validation set for this location
    X_pred_loc = loc_df[features].iloc[[0]]

    for h in range(1, MAX_HORIZON + 1):
        if h > len(loc_df): continue # Cannot validate if horizon is longer than remaining data

        prediction_transformed = models[h].predict(X_pred_loc)[0]

        # ==> CHANGE 4: Reverse the transformation using expm1
        prediction_original_scale = np.expm1(prediction_transformed)
        prediction_original_scale = max(0, prediction_original_scale)

        # Get the true value from the original 'out' column
        true_value = loc_df['out'].iloc[h-1]

        validation_predictions.append({
            'pred': prediction_original_scale,
            'true': true_value
        })

final_preds_df = pd.DataFrame(validation_predictions)
rmse = np.sqrt(mean_squared_error(final_preds_df['true'], final_preds_df['pred']))
print(f"\n✅ Final Validation RMSE on original scale: {rmse:.4f}")

Loading data...
Engineering features...
Creating time-based train/validation split...
Starting model training...

===== Training model for horizon t+1 =====
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[691]	valid_0's rmse: 1.10068	valid_0's l2: 1.21149

===== Training model for horizon t+2 =====
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[760]	valid_0's rmse: 1.22217	valid_0's l2: 1.49371

===== Training model for horizon t+3 =====
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[606]	valid_0's rmse: 1.30131	valid_0's l2: 1.69341

===== Training model for horizon t+4 =====
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[869]	valid_0's rmse: 1.35171	valid_0's l2: 1.82711

===== Training model for horizon t+5 =====
Training until validation scores don't improve for 100 rounds
Early sto

In [None]:

# ===================================================================
# 4. Generate Predictions for Submission
# ===================================================================
print("Generating predictions for submission files...")

# Get the last row of data for each location from the full feature-engineered dataframe.
# This row contains the most recent lagged features needed for prediction.
last_known_data = df_features.groupby('Location').last().reset_index()

# Load the submission templates to get the required timestamps and locations.
sub_template_24h = pd.read_csv('/content/submission_template_24h.csv')
sub_template_48h = pd.read_csv('/content/submission_template_48h.csv')

sub_template_24h['timestamp'] = pd.to_datetime(sub_template_24h['timestamp'])
sub_template_48h['timestamp'] = pd.to_datetime(sub_template_48h['timestamp'])


all_predictions = []

for loc_encoded, loc_id in enumerate(le.classes_):
    # Get the feature set for the current location
    X_pred_loc = last_known_data[last_known_data['Location'] == loc_id][features]

    # If a location has no data, we can't predict
    if X_pred_loc.empty:
        continue

    for h in range(1, MAX_HORIZON + 1):
        # Predict using the model for horizon 'h'
        prediction = models[h].predict(X_pred_loc)[0]

        # Ensure prediction is non-negative
        prediction = max(0, prediction)

        # The prediction timestamp is h hours after the last known timestamp
        pred_timestamp = last_known_data.loc[last_known_data['Location'] == loc_id, 'Timestamp'].iloc[0] + pd.Timedelta(hours=h)

        all_predictions.append({
            'timestamp': pred_timestamp,
            'location': loc_id,
            'pred': prediction
        })

predictions_df = pd.DataFrame(all_predictions)


# ===================================================================
# 5. Create and Save Submission Files
# ===================================================================
print("Saving submission files...")

# Create 24-hour submission file
submission_24h = sub_template_24h[['timestamp', 'location']].copy()
submission_24h = submission_24h.merge(predictions_df, on=['timestamp', 'location'], how='left')
submission_24h['pred'] = submission_24h['pred'].fillna(0) # Fill any misses with 0

# Create 48-hour submission file
submission_48h = sub_template_48h[['timestamp', 'location']].copy()
submission_48h = submission_48h.merge(predictions_df, on=['timestamp', 'location'], how='left')
submission_48h['pred'] = submission_48h['pred'].fillna(0) # Fill any misses with 0


# Save to CSV in the specified format without the index
submission_24h.to_csv('submission_24h.csv', index=False)
submission_48h.to_csv('submission_48h.csv', index=False)

print(f"✅ Successfully created submission_24h.csv and submission_48h.csv")

Generating predictions for submission files...


  sub_template_24h['timestamp'] = pd.to_datetime(sub_template_24h['timestamp'])
  sub_template_48h['timestamp'] = pd.to_datetime(sub_template_48h['timestamp'])


Saving submission files...
✅ Successfully created submission_24h.csv and submission_48h.csv


In [None]:
import joblib

joblib.dump(models, 'lgbm_direct_forecasting_models.joblib')
print("✅ Models have been saved to lgbm_direct_forecasting_models.joblib")

✅ Models have been saved to lgbm_direct_forecasting_models.joblib


TRYING TO IMPROVE RMSE EVEN FURTHER

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import gc

# ===================================================================
# 1. Load and Prepare Data
# ===================================================================
print("Loading data...")
df = pd.read_csv("/content/normalized_features.csv")

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.sort_values(by=['Location', 'Timestamp']).reset_index(drop=True)

# ==> CHANGE 1: Apply log transformation to the target variable 'out'
# We create a new column for the transformed target to keep the original for validation.
df['out_transformed'] = np.log1p(df['out'])

le = LabelEncoder()
df['Location_encoded'] = le.fit_transform(df['Location'])


# ===================================================================
# 2. Feature Engineering for Time-Series
# ===================================================================
print("Engineering features...")

def create_time_series_features(df_in):
    df_out = df_in.copy()
    features_to_lag = ['out', 't2m', 'gust', 'cape', 'prate']
    lag_periods = [1, 2, 3, 24, 48]

    for feature in features_to_lag:
        for lag in lag_periods:
            df_out[f'{feature}_lag_{lag}'] = df_out.groupby('Location')[feature].shift(lag)

    df_out['gust_roll_max_3h'] = df_out.groupby('Location')['gust'].transform(
        lambda x: x.shift(1).rolling(window=3).max()
    )
    df_out['hour'] = df_out['Timestamp'].dt.hour
    df_out['dayofweek'] = df_out['Timestamp'].dt.dayofweek
    return df_out

df_features = create_time_series_features(df)

features = [col for col in df_features.columns if col not in [
    'out', 'out_transformed', 'Timestamp', 'Location'
]]


# ===================================================================
# 3. Create a Time-Based Validation Split
# ===================================================================
print("Creating time-based train/validation split...")
validation_start_date = df_features['Timestamp'].max() - pd.Timedelta(days=7)

train_df = df_features[df_features['Timestamp'] < validation_start_date].copy()
val_df = df_features[df_features['Timestamp'] >= validation_start_date].copy()

train_df.dropna(inplace=True)


# ===================================================================
# 4. Train Models with Direct Forecasting Strategy
# ===================================================================
print("Starting model training...")
MAX_HORIZON = 48
models = {}

for h in range(1, MAX_HORIZON + 1):
    print(f"\n===== Training model for horizon t+{h} =====")

    # Use the TRANSFORMED 'out' column as the target
    y_train_h = train_df.groupby('Location')['out_transformed'].shift(-h)
    X_train_h = train_df[features]

    valid_indices = ~y_train_h.isna()
    X_train_h, y_train_h = X_train_h[valid_indices], y_train_h[valid_indices]

    # Create validation set for this horizon
    X_val_h = val_df[features]
    y_val_h = val_df.groupby('Location')['out_transformed'].shift(-h).dropna()
    X_val_h = X_val_h.loc[y_val_h.index]

    # ==> CHANGE 2: Objective is 'regression_l2' (RMSE)
    lgbm = lgb.LGBMRegressor(
        objective='regression_l2',
        n_estimators=3000,
        learning_rate=0.01,
        num_leaves=60,
        random_state=42,
        n_jobs=-1,
        colsample_bytree=0.7,
        subsample=0.6,
    )

    lgbm.fit(
        X_train_h, y_train_h,
        eval_set=[(X_val_h, y_val_h)],
        eval_metric='rmse',
        # ==> CHANGE 3: Callback will now print the loss as it improves
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )

    models[h] = lgbm
    gc.collect()

print("\nModel training complete.")


# ===================================================================
# 5. Generate and Evaluate Predictions
# ===================================================================
print("Generating and evaluating predictions on the validation set...")

validation_predictions = []
for loc_id in val_df['Location'].unique():
    loc_df = val_df[val_df['Location'] == loc_id].copy()

    # Use the features from the first timestamp of the validation set for this location
    X_pred_loc = loc_df[features].iloc[[0]]

    for h in range(1, MAX_HORIZON + 1):
        if h > len(loc_df): continue # Cannot validate if horizon is longer than remaining data

        prediction_transformed = models[h].predict(X_pred_loc)[0]

        # ==> CHANGE 4: Reverse the transformation using expm1
        prediction_original_scale = np.expm1(prediction_transformed)
        prediction_original_scale = max(0, prediction_original_scale)

        # Get the true value from the original 'out' column
        true_value = loc_df['out'].iloc[h-1]

        validation_predictions.append({
            'pred': prediction_original_scale,
            'true': true_value
        })

final_preds_df = pd.DataFrame(validation_predictions)
rmse = np.sqrt(mean_squared_error(final_preds_df['true'], final_preds_df['pred']))
print(f"\n✅ Final Validation RMSE on original scale: {rmse:.4f}")

Loading data...
Engineering features...
Creating time-based train/validation split...
Starting model training...

===== Training model for horizon t+1 =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24943
[LightGBM] [Info] Number of data points in the train set: 161269, number of used features: 116
[LightGBM] [Info] Start training from score 0.673712
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[679]	valid_0's rmse: 1.10282	valid_0's l2: 1.21621

===== Training model for horizon t+2 =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.063631 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24943
[LightGBM] [Info] Number of data points in t

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import gc

# ===================================================================
# 1. Load and Prepare Data
# ===================================================================
print("Loading data...")
df = pd.read_csv("/content/train.csv")

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df = df.sort_values(by=['Location', 'Timestamp']).reset_index(drop=True)

# ==> CHANGE 1: Apply log transformation to the target variable 'out'
# We create a new column for the transformed target to keep the original for validation.
df['out_transformed'] = np.log1p(df['out'])

le = LabelEncoder()
df['Location_encoded'] = le.fit_transform(df['Location'])


# ===================================================================
# 2. Feature Engineering for Time-Series
# ===================================================================
print("Engineering features...")

def create_time_series_features(df_in):
    df_out = df_in.copy()
    features_to_lag = ['out', 't2m', 'gust', 'cape', 'prate']
    lag_periods = [1, 2, 3, 24, 48]

    for feature in features_to_lag:
        for lag in lag_periods:
            df_out[f'{feature}_lag_{lag}'] = df_out.groupby('Location')[feature].shift(lag)

    df_out['gust_roll_max_3h'] = df_out.groupby('Location')['gust'].transform(
        lambda x: x.shift(1).rolling(window=3).max()
    )
    df_out['hour'] = df_out['Timestamp'].dt.hour
    df_out['dayofweek'] = df_out['Timestamp'].dt.dayofweek
    return df_out

df_features = create_time_series_features(df)

features = [col for col in df_features.columns if col not in [
    'out', 'out_transformed', 'Timestamp', 'Location'
]]


# ===================================================================
# 3. Create a Time-Based Validation Split
# ===================================================================
print("Creating time-based train/validation split...")
validation_start_date = df_features['Timestamp'].max() - pd.Timedelta(days=7)

train_df = df_features[df_features['Timestamp'] < validation_start_date].copy()
val_df = df_features[df_features['Timestamp'] >= validation_start_date].copy()

train_df.dropna(inplace=True)


# ===================================================================
# 4. Train Models with Direct Forecasting Strategy
# ===================================================================
print("Starting model training...")
MAX_HORIZON = 48
models = {}

for h in range(1, MAX_HORIZON + 1):
    print(f"\n===== Training model for horizon t+{h} =====")

    # Use the TRANSFORMED 'out' column as the target
    y_train_h = train_df.groupby('Location')['out_transformed'].shift(-h)
    X_train_h = train_df[features]

    valid_indices = ~y_train_h.isna()
    X_train_h, y_train_h = X_train_h[valid_indices], y_train_h[valid_indices]

    # Create validation set for this horizon
    X_val_h = val_df[features]
    y_val_h = val_df.groupby('Location')['out_transformed'].shift(-h).dropna()
    X_val_h = X_val_h.loc[y_val_h.index]

    # ==> CHANGE 2: Objective is 'regression_l2' (RMSE)
    lgbm = lgb.LGBMRegressor(
        objective='regression_l2',
        n_estimators=3000,
        learning_rate=0.01,
        num_leaves=60,
        random_state=42,
        n_jobs=-1,
        colsample_bytree=0.7,
        subsample=0.6,
    )

    lgbm.fit(
        X_train_h, y_train_h,
        eval_set=[(X_val_h, y_val_h)],
        eval_metric='rmse',
        # ==> CHANGE 3: Callback will now print the loss as it improves
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )

    models[h] = lgbm
    gc.collect()

print("\nModel training complete.")


# ===================================================================
# 5. Generate and Evaluate Predictions
# ===================================================================
print("Generating and evaluating predictions on the validation set...")

validation_predictions = []
for loc_id in val_df['Location'].unique():
    loc_df = val_df[val_df['Location'] == loc_id].copy()

    # Use the features from the first timestamp of the validation set for this location
    X_pred_loc = loc_df[features].iloc[[0]]

    for h in range(1, MAX_HORIZON + 1):
        if h > len(loc_df): continue # Cannot validate if horizon is longer than remaining data

        prediction_transformed = models[h].predict(X_pred_loc)[0]

        # ==> CHANGE 4: Reverse the transformation using expm1
        prediction_original_scale = np.expm1(prediction_transformed)
        prediction_original_scale = max(0, prediction_original_scale)

        # Get the true value from the original 'out' column
        true_value = loc_df['out'].iloc[h-1]

        validation_predictions.append({
            'pred': prediction_original_scale,
            'true': true_value
        })

final_preds_df = pd.DataFrame(validation_predictions)
rmse = np.sqrt(mean_squared_error(final_preds_df['true'], final_preds_df['pred']))
print(f"\n✅ Final Validation RMSE on original scale: {rmse:.4f}")

Loading data...
Engineering features...
Creating time-based train/validation split...
Starting model training...

===== Training model for horizon t+1 =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059888 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25089
[LightGBM] [Info] Number of data points in the train set: 161269, number of used features: 116
[LightGBM] [Info] Start training from score 0.673712
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[676]	valid_0's rmse: 1.10536	valid_0's l2: 1.22183

===== Training model for horizon t+2 =====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25089
[LightGBM] [Info] Number of data points in the train set: 161186, number of used features: 116
[LightGBM] [I