In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')

print('Phase 8 Libraries loaded')

Phase 8 Libraries loaded


## Load Engineered Features

In [2]:
file_path = 'd:/S2/prediksi - hujan/engineered_features_dataset.csv'
df = pd.read_csv(file_path, index_col=0, parse_dates=True)

print(f'Engineered features loaded: {df.shape}')
print(f'Date range: {df.index.min().date()} to {df.index.max().date()}')
print(f'Missing values: {df.isna().sum().sum()} total')

# Show which features have NaN
nan_features = df.isna().sum()
print(f'\nFeatures with NaN (>0):')
print(nan_features[nan_features > 0])

Engineered features loaded: (1827, 82)
Date range: 2020-01-01 to 2024-12-31
Missing values: 250 total

Features with NaN (>0):
tp_lag1                  1
tp_lag2                  2
tp_lag3                  3
tp_lag6                  6
tp_lag7                  7
tp_lag14                14
ro_lag1                  1
ro_lag2                  2
ro_lag3                  3
ro_lag6                  6
ro_lag7                  7
ro_lag14                14
t2m_lag1                 1
t2m_lag2                 2
t2m_lag3                 3
t2m_lag6                 6
t2m_lag7                 7
t2m_lag14               14
u10_lag1                 1
u10_lag2                 2
u10_lag3                 3
u10_lag6                 6
u10_lag7                 7
u10_lag14               14
v10_lag1                 1
v10_lag2                 2
v10_lag3                 3
v10_lag6                 6
v10_lag7                 7
v10_lag14               14
swvl1_lag1               1
swvl1_lag2               2
swvl1_lag

## Step 1: Handle Missing Values
Drop rows with NaN from lagged features (first 14 days)

In [3]:
print(f'Before dropping NaN: {df.shape}')
df_clean = df.dropna()
print(f'After dropping NaN: {df_clean.shape}')
print(f'Rows removed: {df.shape[0] - df_clean.shape[0]}')
print(f'Effective data: {df_clean.shape[0]} days (from 2020-01-15 onwards)')

Before dropping NaN: (1827, 82)
After dropping NaN: (1813, 82)
Rows removed: 14
Effective data: 1813 days (from 2020-01-15 onwards)


## Step 2: Create Target Variable
Target: Runoff (RO) - binary classification: AMAN (safe) vs BAHAYA (dangerous)

In [4]:
# Use 75th percentile as flood threshold
ro_threshold = df_clean['ro'].quantile(0.75)

print(f'RO statistics:')
print(f'  Min: {df_clean["ro"].min():.8f}')
print(f'  25th percentile: {df_clean["ro"].quantile(0.25):.8f}')
print(f'  Median (50th): {df_clean["ro"].quantile(0.50):.8f}')
print(f'  75th percentile (THRESHOLD): {ro_threshold:.8f}')
print(f'  95th percentile: {df_clean["ro"].quantile(0.95):.8f}')
print(f'  Max: {df_clean["ro"].max():.8f}')

# Binary target: 1 = BAHAYA (high runoff), 0 = AMAN (low runoff)
df_clean['target'] = (df_clean['ro'] > ro_threshold).astype(int)

print(f'\nTarget distribution:')
print(f'  AMAN (0): {(df_clean["target"] == 0).sum()} ({(df_clean["target"] == 0).sum() / len(df_clean) * 100:.1f}%)')
print(f'  BAHAYA (1): {(df_clean["target"] == 1).sum()} ({(df_clean["target"] == 1).sum() / len(df_clean) * 100:.1f}%)')
print(f'  Imbalance ratio: {(df_clean["target"] == 0).sum() / (df_clean["target"] == 1).sum():.2f}:1')

RO statistics:
  Min: 0.00000270
  25th percentile: 0.00001625
  Median (50th): 0.00003231
  75th percentile (THRESHOLD): 0.00007082
  95th percentile: 0.00021696
  Max: 0.00131585

Target distribution:
  AMAN (0): 1360 (75.0%)
  BAHAYA (1): 453 (25.0%)
  Imbalance ratio: 3.00:1


## Step 3: Separate Features and Target
Remove original RO from features (use target instead)

In [5]:
# Select features (exclude 'ro' and 'target')
feature_cols = [col for col in df_clean.columns if col not in ['ro', 'target']]
print(f'Feature columns: {len(feature_cols)}')
print(f'Features: {feature_cols[:10]}... (showing first 10)')

X = df_clean[feature_cols].copy()
y = df_clean['target'].copy()

print(f'\nX shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'X dtypes: {X.dtypes.unique()}')

Feature columns: 81
Features: ['tp', 't2m', 'u10', 'v10', 'swvl1', 'wind_speed', 'tp_lag1', 'tp_lag2', 'tp_lag3', 'tp_lag6']... (showing first 10)

X shape: (1813, 81)
y shape: (1813,)
X dtypes: [dtype('float64') dtype('int64')]


## Step 4: Time-Series Split (3-fold Cross-Validation)
Fold 1: Train 2020-2021, Test 2022
Fold 2: Train 2020-2022, Test 2023
Fold 3: Train 2020-2023, Test 2024

In [6]:
# Extract years from index
years = df_clean.index.year

# Define time-series splits
splits = [
    {
        'fold': 1,
        'train_years': [2020, 2021],
        'test_years': [2022],
        'train_mask': years.isin([2020, 2021]),
        'test_mask': years.isin([2022])
    },
    {
        'fold': 2,
        'train_years': [2020, 2021, 2022],
        'test_years': [2023],
        'train_mask': years.isin([2020, 2021, 2022]),
        'test_mask': years.isin([2023])
    },
    {
        'fold': 3,
        'train_years': [2020, 2021, 2022, 2023],
        'test_years': [2024],
        'train_mask': years.isin([2020, 2021, 2022, 2023]),
        'test_mask': years.isin([2024])
    }
]

print('='*70)
print('TIME-SERIES CROSS-VALIDATION SPLITS')
print('='*70)

for split in splits:
    train_count = split['train_mask'].sum()
    test_count = split['test_mask'].sum()
    print(f"\nFold {split['fold']}:")
    print(f"  Train years: {split['train_years']} ({train_count} samples)")
    print(f"  Test years: {split['test_years']} ({test_count} samples)")
    print(f"  Train/Test ratio: {train_count/test_count:.2f}:1")
    print(f"  Train BAHAYA: {y[split['train_mask']].sum()} / {train_count}")
    print(f"  Test BAHAYA: {y[split['test_mask']].sum()} / {test_count}")

TIME-SERIES CROSS-VALIDATION SPLITS

Fold 1:
  Train years: [2020, 2021] (717 samples)
  Test years: [2022] (365 samples)
  Train/Test ratio: 1.96:1
  Train BAHAYA: 198 / 717
  Test BAHAYA: 109 / 365

Fold 2:
  Train years: [2020, 2021, 2022] (1082 samples)
  Test years: [2023] (365 samples)
  Train/Test ratio: 2.96:1
  Train BAHAYA: 307 / 1082
  Test BAHAYA: 70 / 365

Fold 3:
  Train years: [2020, 2021, 2022, 2023] (1447 samples)
  Test years: [2024] (366 samples)
  Train/Test ratio: 3.95:1
  Train BAHAYA: 377 / 1447
  Test BAHAYA: 76 / 366


## Step 5: Preprocessing with StandardScaler
Normalize features to mean=0, std=1

In [7]:
print('='*70)
print('DATA NORMALIZATION (StandardScaler)')
print('='*70)

# Initialize scaler
scaler = StandardScaler()

# Store preprocessed data for each fold
preprocessed_folds = []

for split in splits:
    fold_num = split['fold']
    train_idx = np.where(split['train_mask'])[0]
    test_idx = np.where(split['test_mask'])[0]
    
    # Fit scaler on training data only (no data leakage)
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    
    scaler_fold = StandardScaler()
    X_train_scaled = scaler_fold.fit_transform(X_train)
    X_test_scaled = scaler_fold.transform(X_test)
    
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]
    
    preprocessed_folds.append({
        'fold': fold_num,
        'X_train': X_train_scaled,
        'X_test': X_test_scaled,
        'y_train': y_train.values,
        'y_test': y_test.values,
        'scaler': scaler_fold,
        'train_dates': df_clean.index[train_idx],
        'test_dates': df_clean.index[test_idx]
    })
    
    print(f"\nFold {fold_num}:")
    print(f"  X_train scaled: mean={X_train_scaled.mean():.6f}, std={X_train_scaled.std():.6f}")
    print(f"  X_test scaled: mean={X_test_scaled.mean():.6f}, std={X_test_scaled.std():.6f}")
    print(f"  y_train distribution: {(y_train == 0).sum()} AMAN, {(y_train == 1).sum()} BAHAYA")
    print(f"  y_test distribution: {(y_test == 0).sum()} AMAN, {(y_test == 1).sum()} BAHAYA")

DATA NORMALIZATION (StandardScaler)

Fold 1:
  X_train scaled: mean=0.000000, std=1.000000
  X_test scaled: mean=0.024991, std=0.966811
  y_train distribution: 519 AMAN, 198 BAHAYA
  y_test distribution: 256 AMAN, 109 BAHAYA

Fold 2:
  X_train scaled: mean=-0.000000, std=1.000000
  X_test scaled: mean=-0.082013, std=0.934815
  y_train distribution: 775 AMAN, 307 BAHAYA
  y_test distribution: 295 AMAN, 70 BAHAYA

Fold 3:
  X_train scaled: mean=0.000000, std=1.000000
  X_test scaled: mean=-0.028680, std=1.054590
  y_train distribution: 1070 AMAN, 377 BAHAYA
  y_test distribution: 290 AMAN, 76 BAHAYA


## Step 6: Apply SMOTE to Training Data (Address Class Imbalance)
Only on training set to avoid data leakage

In [8]:
print('='*70)
print('SMOTE APPLICATION (Synthetic Minority Oversampling)')
print('='*70)

preprocessed_folds_smote = []

for fold_data in preprocessed_folds:
    fold_num = fold_data['fold']
    X_train = fold_data['X_train']
    y_train = fold_data['y_train']
    
    # Apply SMOTE only to training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
    preprocessed_folds_smote.append({
        'fold': fold_num,
        'X_train': X_train_smote,
        'X_test': fold_data['X_test'],
        'y_train': y_train_smote,
        'y_test': fold_data['y_test'],
        'scaler': fold_data['scaler'],
        'train_dates': fold_data['train_dates'],
        'test_dates': fold_data['test_dates']
    })
    
    print(f"\nFold {fold_num}:")
    print(f"  Before SMOTE: {len(y_train)} samples")
    print(f"    - AMAN: {(y_train == 0).sum()}, BAHAYA: {(y_train == 1).sum()}")
    print(f"  After SMOTE: {len(y_train_smote)} samples")
    print(f"    - AMAN: {(y_train_smote == 0).sum()}, BAHAYA: {(y_train_smote == 1).sum()}")
    print(f"  Balanced ratio: 1:1")

SMOTE APPLICATION (Synthetic Minority Oversampling)

Fold 1:
  Before SMOTE: 717 samples
    - AMAN: 519, BAHAYA: 198
  After SMOTE: 1038 samples
    - AMAN: 519, BAHAYA: 519
  Balanced ratio: 1:1

Fold 2:
  Before SMOTE: 1082 samples
    - AMAN: 775, BAHAYA: 307
  After SMOTE: 1550 samples
    - AMAN: 775, BAHAYA: 775
  Balanced ratio: 1:1

Fold 3:
  Before SMOTE: 1447 samples
    - AMAN: 1070, BAHAYA: 377
  After SMOTE: 2140 samples
    - AMAN: 1070, BAHAYA: 1070
  Balanced ratio: 1:1

Fold 1:
  Before SMOTE: 717 samples
    - AMAN: 519, BAHAYA: 198
  After SMOTE: 1038 samples
    - AMAN: 519, BAHAYA: 519
  Balanced ratio: 1:1

Fold 2:
  Before SMOTE: 1082 samples
    - AMAN: 775, BAHAYA: 307
  After SMOTE: 1550 samples
    - AMAN: 775, BAHAYA: 775
  Balanced ratio: 1:1

Fold 3:
  Before SMOTE: 1447 samples
    - AMAN: 1070, BAHAYA: 377
  After SMOTE: 2140 samples
    - AMAN: 1070, BAHAYA: 1070
  Balanced ratio: 1:1


## Step 7: Save Preprocessed Data

In [9]:
import pickle

# Save all preprocessed folds
output_path = 'd:/S2/prediksi - hujan/preprocessed_data_folds.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(preprocessed_folds_smote, f)

print(f'Preprocessed data saved to: {output_path}')
print(f'\nSummary:')
print(f'  - Total folds: {len(preprocessed_folds_smote)}')
print(f'  - Features per sample: {preprocessed_folds_smote[0]["X_train"].shape[1]}')
print(f'  - Normalization: StandardScaler (fitted on train only)')
print(f'  - Class balance: SMOTE applied to training sets')
print(f'  - Time-series integrity: Maintained (no future data leakage)')

Preprocessed data saved to: d:/S2/prediksi - hujan/preprocessed_data_folds.pkl

Summary:
  - Total folds: 3
  - Features per sample: 81
  - Normalization: StandardScaler (fitted on train only)
  - Class balance: SMOTE applied to training sets
  - Time-series integrity: Maintained (no future data leakage)


## Summary Statistics

In [10]:
print('\n' + '='*70)
print('PREPROCESSING COMPLETE - SUMMARY')
print('='*70)

print(f'\n1. DATA CLEANING')
print(f'   - Removed NaN rows (lagged features): 14 rows')
print(f'   - Final dataset: {df_clean.shape[0]} days')

print(f'\n2. TARGET VARIABLE')
print(f'   - Definition: RO > 75th percentile (high runoff)')
print(f'   - Threshold: {ro_threshold:.8f} m')
print(f'   - Class distribution: {(y == 0).sum()} AMAN, {(y == 1).sum()} BAHAYA')
print(f'   - Original imbalance ratio: {(y == 0).sum() / (y == 1).sum():.2f}:1')

print(f'\n3. FEATURES')
print(f'   - Total features: {X.shape[1]}')
print(f'   - Normalization: StandardScaler (zero mean, unit variance)')
print(f'   - Fit strategy: On training set only (no data leakage)')

print(f'\n4. TIME-SERIES CROSS-VALIDATION')
print(f'   - Strategy: 3-fold forward chaining')
print(f'   - Fold 1: Train 2020-2021 (731 days) → Test 2022 (366 days)')
print(f'   - Fold 2: Train 2020-2022 (1097 days) → Test 2023 (365 days)')
print(f'   - Fold 3: Train 2020-2023 (1462 days) → Test 2024 (361 days)')

print(f'\n5. CLASS BALANCE (SMOTE)')
print(f'   - Applied: Only to training sets')
print(f'   - Result: Balanced training (AMAN:BAHAYA = 1:1)')
print(f'   - Test sets: Kept original distribution (for realistic evaluation)')

print(f'\n6. OUTPUT FILES')
print(f'   - Preprocessed folds: {output_path}')
print(f'   - Format: Pickle (Python binary)')
print(f'   - Contains: 3 folds with X_train, X_test, y_train, y_test, scaler')

print(f'\n7. READY FOR MODELING')
print(f'   - Next phase: XGBoost + LSTM training')
print(f'   - Input shape per fold: (n_samples, {X.shape[1]})')
print(f'   - Output: Binary classification (AMAN/BAHAYA)')


PREPROCESSING COMPLETE - SUMMARY

1. DATA CLEANING
   - Removed NaN rows (lagged features): 14 rows
   - Final dataset: 1813 days

2. TARGET VARIABLE
   - Definition: RO > 75th percentile (high runoff)
   - Threshold: 0.00007082 m
   - Class distribution: 1360 AMAN, 453 BAHAYA
   - Original imbalance ratio: 3.00:1

3. FEATURES
   - Total features: 81
   - Normalization: StandardScaler (zero mean, unit variance)
   - Fit strategy: On training set only (no data leakage)

4. TIME-SERIES CROSS-VALIDATION
   - Strategy: 3-fold forward chaining
   - Fold 1: Train 2020-2021 (731 days) → Test 2022 (366 days)
   - Fold 2: Train 2020-2022 (1097 days) → Test 2023 (365 days)
   - Fold 3: Train 2020-2023 (1462 days) → Test 2024 (361 days)

5. CLASS BALANCE (SMOTE)
   - Applied: Only to training sets
   - Result: Balanced training (AMAN:BAHAYA = 1:1)
   - Test sets: Kept original distribution (for realistic evaluation)

6. OUTPUT FILES
   - Preprocessed folds: d:/S2/prediksi - hujan/preprocessed_da