In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import warnings

warnings.filterwarnings('ignore')

# ------------------ HELPER FUNCTIONS ------------------
def train_and_evaluate_models(X_train, y_train, X_test, y_test, target_name):
    """Fit four regressors on the training split and score them on the test split.

    Args:
        X_train: Training feature matrix handed to each model's ``fit``.
        y_train: Training target values.
        X_test: Test feature matrix handed to each model's ``predict``.
        y_test: True target values for the test split.
        target_name: Label used only in the progress messages.

    Returns:
        A list with one ``[name, r2, rmse, mse, mae]`` row per model, in the
        order Gradient Boosting, Random Forest, XGBoost, Ridge.
    """
    models = {
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42),
        'Ridge': Ridge(alpha=1.0)
    }

    results = []
    for name, model in models.items():
        print(f"  Training {name} for {target_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # MSE is computed once and RMSE derived from it, instead of running
        # the metric over the predictions twice.
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        results.append([name, r2, rmse, mse, mae])

    return results

# ------------------ MAIN SCRIPT FOR SCENARIO 3 (WITH NEW FEATURES) ------------------
print("--- Starting Analysis for Scenario 3: All Variables are Variable ---")
print("--- Training models on the corrected and updated data file ---")

# Step 1: Load the data
# Only the read itself is guarded, so a failure in the later parsing steps is
# not mistaken for (or masked by) the missing-file handler.
try:
    storm_df = pd.read_csv("scenario3_variable_50000.csv")
except FileNotFoundError:
    print("Error: 'scenario3_variable_50000.csv' not found. Please ensure the file is in the same directory.")
    # SystemExit is raised directly: the exit() helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist, and a
    # non-zero status marks the run as failed.
    raise SystemExit(1)

storm_df['timestamp_utc'] = pd.to_datetime(storm_df['timestamp_utc'])
print(f"Successfully loaded {len(storm_df)} records from 'scenario3_variable_50000.csv'.")

# Step 2: Nowcasting-style Data Preparation (First 10 minutes)
# Keep only records whose age is at most 10 minutes (the column is in hours).
initial_data_df = storm_df[storm_df['time_since_formation_hours'] <= 10/60]
print(f"Nowcasting dataset has {len(initial_data_df)} records (first 10 minutes of each storm).")

# Step 3: Advanced Feature Engineering
print("Performing advanced feature engineering on initial 10-minute data...")


def _initial_phase_features(group):
    """Summarise one storm's early records as a Series of engineered features.

    ``group`` holds the rows of a single ``cell_id``, already sorted by
    ``time_since_formation_hours`` so that ``iloc[0]`` / ``iloc[-1]`` are the
    earliest and latest observations.
    """
    elapsed = group['time_since_formation_hours']
    t_first, t_last = elapsed.iloc[0], elapsed.iloc[-1]
    has_time_span = len(group) > 1 and t_last != t_first

    def slope(column):
        # End-to-end rate of change; 0 when there is no usable time span
        # (single record, or identical first/last timestamps).
        if not has_time_span:
            return 0
        values = group[column]
        return (values.iloc[-1] - values.iloc[0]) / (t_last - t_first)

    def second_difference(column):
        # diff().diff() measures how the step-to-step change itself changes.
        # A single diff().sum() telescopes to (last - first), which is a
        # first-order quantity that merely duplicates the slope feature.
        return group[column].diff().diff().sum()

    return pd.Series({
        # Simple slopes
        'intensity_slope_10min': slope('intensity_dbz'),
        'size_slope_10min': slope('size_pixels'),
        'rainfall_slope_10min': slope('rainfall_mm_per_hr'),

        # Second-order change (change in slope).
        # NOTE(review): 'intensity_change_rate' is presumably already a
        # first derivative, so one diff is second-order here - confirm.
        'intensity_accel_10min': group['intensity_change_rate'].diff().sum(),
        'size_accel_10min': second_difference('size_pixels'),
        'rainfall_accel_10min': second_difference('rainfall_mm_per_hr'),

        # Statistical measures
        'intensity_std_10min': group['intensity_dbz'].std(),
        'size_std_10min': group['size_pixels'].std(),
        'rainfall_std_10min': group['rainfall_mm_per_hr'].std(),

        # Cumulative metrics: sum of samples times 5/60.
        # NOTE(review): assumes records are 5 minutes apart - confirm.
        'cumulative_intensity_10min': group['intensity_dbz'].sum() * (5/60),
        'cumulative_rainfall_10min': group['rainfall_mm_per_hr'].sum() * (5/60)
    })


# Sort so each storm's rows are in time order before taking first/last values
initial_data_df = initial_data_df.sort_values(by=['cell_id', 'time_since_formation_hours'])
advanced_features_df = initial_data_df.groupby('cell_id').apply(_initial_phase_features).reset_index()

# Now, perform the rest of the feature engineering using simple aggregations
engineered_features_stats = initial_data_df.groupby('cell_id').agg(
    initial_intensity_dbz=('intensity_dbz', 'first'),
    initial_size_pixels=('size_pixels', 'first'),
    initial_rainfall_mm_per_hr=('rainfall_mm_per_hr', 'first'),
    max_intensity_10min=('intensity_dbz', 'max'),
    max_size_10min=('size_pixels', 'max'),
    max_rainfall_10min=('rainfall_mm_per_hr', 'max'),
    mean_intensity_10min=('intensity_dbz', 'mean'),
    mean_size_10min=('size_pixels', 'mean'),
    mean_rainfall_10min=('rainfall_mm_per_hr', 'mean'),
).reset_index()

# Merge the stats and slopes
engineered_features = pd.merge(engineered_features_stats, advanced_features_df, on='cell_id', how='left')
print("Feature engineering complete.")

# Step 4: Build one row of targets per storm from the full (unfiltered) data
storms = storm_df.groupby('cell_id')
targets_df = storms.agg(
    lifetime_hours=('lifetime_hours', 'first'),
    peak_rainfall_mmhr=('rainfall_mm_per_hr', 'max'),
    # Each rate sample is weighted by 5/60 before summing.
    # NOTE(review): presumably a 5-minute sampling interval in hours - confirm.
    total_rainfall_mm=('rainfall_mm_per_hr', lambda rates: (rates * (5/60)).sum()),
).reset_index()

# Step 5: Join features to targets on the storm id and discard incomplete rows
dataset = engineered_features.merge(targets_df, on='cell_id', how='inner')
dataset.dropna(inplace=True)
print(f"Final dataset for training/testing has {len(dataset)} unique storms.")

# Step 6: Split the data
cell_ids = dataset['cell_id'].unique()
# A fractional test size yields the same 30000/20000 partition when 50000
# storms are present, but no longer raises if dropna() removed some storms
# or a different-sized file is loaded.
train_ids, test_ids = train_test_split(cell_ids, test_size=0.4, random_state=42, shuffle=True)

train_df = dataset[dataset['cell_id'].isin(train_ids)]
test_df = dataset[dataset['cell_id'].isin(test_ids)]

# The identifier and the three targets must not be fed to the models as inputs
non_feature_columns = ['cell_id', 'lifetime_hours', 'peak_rainfall_mmhr', 'total_rainfall_mm']

X_train = train_df.drop(non_feature_columns, axis=1)
y_train_lifetime = train_df['lifetime_hours']
y_train_peak_rainfall = train_df['peak_rainfall_mmhr']
y_train_total_rainfall = train_df['total_rainfall_mm']

X_test = test_df.drop(non_feature_columns, axis=1)
y_test_lifetime = test_df['lifetime_hours']
y_test_peak_rainfall = test_df['peak_rainfall_mmhr']
y_test_total_rainfall = test_df['total_rainfall_mm']

# Step 7: Standardise features; the scaler is fitted on the training split
# only and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features have been scaled using StandardScaler.")

# Step 8: Train and evaluate the model suite once per target
print("\n--- Model Training & Evaluation for Scenario 3 ---")
target_splits = {
    'Lifetime Hours': (y_train_lifetime, y_test_lifetime),
    'Peak Rainfall': (y_train_peak_rainfall, y_test_peak_rainfall),
    'Total Rainfall': (y_train_total_rainfall, y_test_total_rainfall),
}
all_results = {
    label: train_and_evaluate_models(X_train_scaled, y_tr, X_test_scaled, y_te, label)
    for label, (y_tr, y_te) in target_splits.items()
}

# Step 9: Print one metrics table per target under a shared banner
banner = "=" * 80
print("\n" + banner)
print("Performance Metrics for Scenario 3 (Nowcasting from 10 min data) with ADVANCED FEATURES")
print(banner)
headers = ["Model", "R-squared", "RMSE", "MSE", "MAE"]
for target, results in all_results.items():
    print(f"\nTarget: {target}")
    print(tabulate(results, headers=headers, tablefmt="grid", floatfmt=".4f"))

--- Starting Analysis for Scenario 3: All Variables are Variable ---
--- Training models on the corrected and updated data file ---
Successfully loaded 433021 records from 'scenario3_variable_50000.csv'.
Nowcasting dataset has 150000 records (first 10 minutes of each storm).
Performing advanced feature engineering on initial 10-minute data...
Feature engineering complete.
Final dataset for training/testing has 50000 unique storms.
Features have been scaled using StandardScaler.

--- Model Training & Evaluation for Scenario 3 ---
  Training Gradient Boosting for Lifetime Hours...
  Training Random Forest for Lifetime Hours...
  Training XGBoost for Lifetime Hours...
  Training Ridge for Lifetime Hours...
  Training Gradient Boosting for Peak Rainfall...
  Training Random Forest for Peak Rainfall...
  Training XGBoost for Peak Rainfall...
  Training Ridge for Peak Rainfall...
  Training Gradient Boosting for Total Rainfall...
  Training Random Forest for Total Rainfall...
  Training XGBo

# Version 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import warnings

warnings.filterwarnings('ignore')

# ------------------ HELPER FUNCTIONS ------------------
def train_and_evaluate_models(X_train, y_train, X_test, y_test, target_name):
    """Fit four regressors on the training split and score them on the test split.

    Args:
        X_train: Training feature matrix handed to each model's ``fit``.
        y_train: Training target values.
        X_test: Test feature matrix handed to each model's ``predict``.
        y_test: True target values for the test split.
        target_name: Label used only in the progress messages.

    Returns:
        A list with one ``[name, r2, rmse, mse, mae]`` row per model, in the
        order Gradient Boosting, Random Forest, XGBoost, Ridge.
    """
    models = {
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42),
        'Ridge': Ridge(alpha=1.0)
    }

    results = []
    for name, model in models.items():
        print(f"  Training {name} for {target_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # MSE is computed once and RMSE derived from it, instead of running
        # the metric over the predictions twice.
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        results.append([name, r2, rmse, mse, mae])

    return results

# ------------------ MAIN SCRIPT FOR SCENARIO 3 (WITH NEW FEATURES) ------------------
print("--- Starting Analysis for Scenario 3: All Variables are Variable ---")
print("--- Training models on the corrected and updated data file ---")

# Step 1: Load the data
# Only the read itself is guarded, so a failure in the later parsing steps is
# not mistaken for (or masked by) the missing-file handler.
try:
    storm_df = pd.read_csv("scenario3_variable_50000.csv")
except FileNotFoundError:
    print("Error: 'scenario3_variable_50000.csv' not found. Please ensure the file is in the same directory.")
    # SystemExit is raised directly: the exit() helper is injected by the
    # `site` module for interactive use and is not guaranteed to exist, and a
    # non-zero status marks the run as failed.
    raise SystemExit(1)

storm_df['timestamp_utc'] = pd.to_datetime(storm_df['timestamp_utc'])
print(f"Successfully loaded {len(storm_df)} records from 'scenario3_variable_50000.csv'.")

# Step 2: Nowcasting-style Data Preparation (First 10 minutes)
# Keep only records whose age is at most 10 minutes (the column is in hours).
initial_data_df = storm_df[storm_df['time_since_formation_hours'] <= 10/60]
print(f"Nowcasting dataset has {len(initial_data_df)} records (first 10 minutes of each storm).")

# Step 3: Advanced Feature Engineering
print("Performing advanced feature engineering on initial 10-minute data...")


def _initial_phase_features(group):
    """Summarise one storm's early records as a Series of engineered features.

    ``group`` holds the rows of a single ``cell_id``, already sorted by
    ``time_since_formation_hours`` so that ``iloc[0]`` / ``iloc[-1]`` are the
    earliest and latest observations.
    """
    elapsed = group['time_since_formation_hours']
    t_first, t_last = elapsed.iloc[0], elapsed.iloc[-1]
    has_time_span = len(group) > 1 and t_last != t_first

    def slope(column):
        # End-to-end rate of change; 0 when there is no usable time span
        # (single record, or identical first/last timestamps).
        if not has_time_span:
            return 0
        values = group[column]
        return (values.iloc[-1] - values.iloc[0]) / (t_last - t_first)

    def second_difference(column):
        # diff().diff() measures how the step-to-step change itself changes.
        # A single diff().sum() telescopes to (last - first), which is a
        # first-order quantity that merely duplicates the slope feature.
        return group[column].diff().diff().sum()

    return pd.Series({
        # Simple slopes
        'intensity_slope_10min': slope('intensity_dbz'),
        'size_slope_10min': slope('size_pixels'),
        'rainfall_slope_10min': slope('rainfall_mm_per_hr'),

        # Second-order change (change in slope).
        # NOTE(review): 'intensity_change_rate' is presumably already a
        # first derivative, so one diff is second-order here - confirm.
        'intensity_accel_10min': group['intensity_change_rate'].diff().sum(),
        'size_accel_10min': second_difference('size_pixels'),
        'rainfall_accel_10min': second_difference('rainfall_mm_per_hr'),

        # Statistical measures
        'intensity_std_10min': group['intensity_dbz'].std(),
        'size_std_10min': group['size_pixels'].std(),
        'rainfall_std_10min': group['rainfall_mm_per_hr'].std(),

        # Cumulative metrics: sum of samples times 5/60.
        # NOTE(review): assumes records are 5 minutes apart - confirm.
        'cumulative_intensity_10min': group['intensity_dbz'].sum() * (5/60),
        'cumulative_rainfall_10min': group['rainfall_mm_per_hr'].sum() * (5/60)
    })


# Sort so each storm's rows are in time order before taking first/last values
initial_data_df = initial_data_df.sort_values(by=['cell_id', 'time_since_formation_hours'])
advanced_features_df = initial_data_df.groupby('cell_id').apply(_initial_phase_features).reset_index()

# Now, perform the rest of the feature engineering using simple aggregations
engineered_features_stats = initial_data_df.groupby('cell_id').agg(
    initial_intensity_dbz=('intensity_dbz', 'first'),
    initial_size_pixels=('size_pixels', 'first'),
    initial_rainfall_mm_per_hr=('rainfall_mm_per_hr', 'first'),
    max_intensity_10min=('intensity_dbz', 'max'),
    max_size_10min=('size_pixels', 'max'),
    max_rainfall_10min=('rainfall_mm_per_hr', 'max'),
    mean_intensity_10min=('intensity_dbz', 'mean'),
    mean_size_10min=('size_pixels', 'mean'),
    mean_rainfall_10min=('rainfall_mm_per_hr', 'mean'),
).reset_index()

# Merge the stats and slopes
engineered_features = pd.merge(engineered_features_stats, advanced_features_df, on='cell_id', how='left')
print("Feature engineering complete.")

# Step 4: Build one row of targets per storm from the full (unfiltered) data
storms = storm_df.groupby('cell_id')
targets_df = storms.agg(
    lifetime_hours=('lifetime_hours', 'first'),
    peak_rainfall_mmhr=('rainfall_mm_per_hr', 'max'),
    # Each rate sample is weighted by 5/60 before summing.
    # NOTE(review): presumably a 5-minute sampling interval in hours - confirm.
    total_rainfall_mm=('rainfall_mm_per_hr', lambda rates: (rates * (5/60)).sum()),
).reset_index()

# Step 5: Join features to targets on the storm id and discard incomplete rows
dataset = engineered_features.merge(targets_df, on='cell_id', how='inner')
dataset.dropna(inplace=True)
print(f"Final dataset for training/testing has {len(dataset)} unique storms.")

# Step 6: Split the data
cell_ids = dataset['cell_id'].unique()
# A fractional test size yields the same 30000/20000 partition when 50000
# storms are present, but no longer raises if dropna() removed some storms
# or a different-sized file is loaded.
train_ids, test_ids = train_test_split(cell_ids, test_size=0.4, random_state=42, shuffle=True)

train_df = dataset[dataset['cell_id'].isin(train_ids)]
test_df = dataset[dataset['cell_id'].isin(test_ids)]

# The identifier and the three targets must not be fed to the models as inputs
non_feature_columns = ['cell_id', 'lifetime_hours', 'peak_rainfall_mmhr', 'total_rainfall_mm']

X_train = train_df.drop(non_feature_columns, axis=1)
y_train_lifetime = train_df['lifetime_hours']
y_train_peak_rainfall = train_df['peak_rainfall_mmhr']
y_train_total_rainfall = train_df['total_rainfall_mm']

X_test = test_df.drop(non_feature_columns, axis=1)
y_test_lifetime = test_df['lifetime_hours']
y_test_peak_rainfall = test_df['peak_rainfall_mmhr']
y_test_total_rainfall = test_df['total_rainfall_mm']

# Step 7: Standardise features; the scaler is fitted on the training split
# only and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features have been scaled using StandardScaler.")

# Step 8: Train and evaluate the model suite once per target
print("\n--- Model Training & Evaluation for Scenario 3 ---")
target_splits = {
    'Lifetime Hours': (y_train_lifetime, y_test_lifetime),
    'Peak Rainfall': (y_train_peak_rainfall, y_test_peak_rainfall),
    'Total Rainfall': (y_train_total_rainfall, y_test_total_rainfall),
}
all_results = {
    label: train_and_evaluate_models(X_train_scaled, y_tr, X_test_scaled, y_te, label)
    for label, (y_tr, y_te) in target_splits.items()
}

# Step 9: Print one metrics table per target under a shared banner
banner = "=" * 80
print("\n" + banner)
print("Performance Metrics for Scenario 3 (Nowcasting from 10 min data) with ADVANCED FEATURES")
print(banner)
headers = ["Model", "R-squared", "RMSE", "MSE", "MAE"]
for target, results in all_results.items():
    print(f"\nTarget: {target}")
    print(tabulate(results, headers=headers, tablefmt="grid", floatfmt=".4f"))

--- Starting Analysis for Scenario 3: All Variables are Variable ---
--- Training models on the corrected and updated data file ---
Successfully loaded 433021 records from 'scenario3_variable_50000.csv'.
Nowcasting dataset has 150000 records (first 10 minutes of each storm).
Performing advanced feature engineering on initial 10-minute data...
Feature engineering complete.
Final dataset for training/testing has 50000 unique storms.
Features have been scaled using StandardScaler.

--- Model Training & Evaluation for Scenario 3 ---
  Training Gradient Boosting for Lifetime Hours...
  Training Random Forest for Lifetime Hours...
  Training XGBoost for Lifetime Hours...
  Training Ridge for Lifetime Hours...
  Training Gradient Boosting for Peak Rainfall...
  Training Random Forest for Peak Rainfall...
  Training XGBoost for Peak Rainfall...
  Training Ridge for Peak Rainfall...
  Training Gradient Boosting for Total Rainfall...
  Training Random Forest for Total Rainfall...
  Training XGBo