In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every library warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# --- Notebook configuration -------------------------------------------
INPUT_FILE = 'era5_daily_features.csv'  # daily feature table read by the load cell
WINDOW_SIZE = 7                         # look-back length, in days
np.random.seed(42)                      # fixed seed for NumPy's global RNG

# Confirm setup and echo the window length in a single call.
print(
    "✓ Imports loaded",
    f"Configuration: Window size = {WINDOW_SIZE} days",
    sep="\n",
)


✓ Imports loaded
Configuration: Window size = 7 days


In [11]:
# Read the daily feature table; the 'date' column is parsed into datetimes.
df = pd.read_csv(INPUT_FILE, parse_dates=['date'])
print(f"Loaded {len(df):,} rows")

first_date, last_date = df['date'].min(), df['date'].max()
print(f"Date range: {first_date} to {last_date}")

column_names = list(df.columns)
print(f"\nColumns ({len(column_names)}): {column_names}")

# A "location" is one distinct (latitude, longitude) pair.
location_groups = df.groupby(['latitude', 'longitude'])
print(f"\nNumber of locations: {location_groups.ngroups}")

# Next-day temperature targets expected to ship with the feature table.
target_cols = ['t2m_min_next', 't2m_max_next', 't2m_mean_next']
has_targets = set(target_cols).issubset(df.columns)

if not has_targets:
    print("\n⚠️  WARNING: Missing target columns!")
    print("Expected:", target_cols)
else:
    print("\n✓ Target columns found:", target_cols)

# Preview the first rows as the cell's rich output.
df.head()

Loaded 70,200 rows
Date range: 2025-01-01 00:00:00 to 2025-01-30 00:00:00

Columns (27): ['date', 'latitude', 'longitude', 't2m_min', 't2m_max', 't2m_mean', 'd2m_min', 'd2m_max', 'd2m_mean', 'msl_min', 'msl_max', 'msl_mean', 'u10_min', 'u10_max', 'u10_mean', 'v10_min', 'v10_max', 'v10_mean', 'tcc_min', 'tcc_max', 'tcc_mean', 'skt_min', 'skt_max', 'skt_mean', 't2m_min_next', 't2m_max_next', 't2m_mean_next']

Number of locations: 2340

✓ Target columns found: ['t2m_min_next', 't2m_max_next', 't2m_mean_next']


Unnamed: 0,date,latitude,longitude,t2m_min,t2m_max,t2m_mean,d2m_min,d2m_max,d2m_mean,msl_min,...,v10_mean,tcc_min,tcc_max,tcc_mean,skt_min,skt_max,skt_mean,t2m_min_next,t2m_max_next,t2m_mean_next
0,2025-01-01,20.0,-9.8,8.079742,19.427155,13.512838,-2.99888,-0.61387,-2.06268,1017.8256,...,-3.016637,0.0,0.984451,0.293353,4.30374,26.4745,13.539474,8.284332,20.2294,13.733927
1,2025-01-02,20.0,-9.8,8.284332,20.2294,13.733927,-7.56992,-2.55112,-4.270289,1018.2197,...,-2.549742,0.0,0.49585,0.05305,4.47048,25.16592,13.271984,8.742096,20.117584,13.730153
2,2025-01-03,20.0,-9.8,8.742096,20.117584,13.730153,-7.83945,0.5934,-2.82104,1017.91875,...,-2.010354,0.0,0.424484,0.053706,5.53707,25.11807,13.454448,8.493073,19.262848,13.515086
3,2025-01-04,20.0,-9.8,8.493073,19.262848,13.515086,-4.86875,-0.84434,-2.462602,1017.338,...,-1.404602,0.545135,1.0,0.881154,5.31387,25.0065,13.461613,9.574616,21.778473,14.880564
4,2025-01-05,20.0,-9.8,9.574616,21.778473,14.880564,-3.36777,0.4994,-1.195697,1014.9186,...,-2.352264,0.0,1.0,0.470805,5.87686,29.31753,15.110692,8.145905,22.907135,14.977478


In [13]:
# Everything that is neither a target nor an identifier (date / coordinates)
# is treated as a model feature; the original column order is kept.
exclude_cols = target_cols + ['date', 'latitude', 'longitude']
is_feature = ~df.columns.isin(exclude_cols)
feature_cols = df.columns[is_feature].tolist()

print(f"=== Feature Columns ({len(feature_cols)}) ===")
for position, name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {name}")

print(f"\n=== Target Columns ({len(target_cols)}) ===")
for position, name in enumerate(target_cols, start=1):
    print(f"{position}. {name}")

=== Feature Columns (21) ===
 1. t2m_min
 2. t2m_max
 3. t2m_mean
 4. d2m_min
 5. d2m_max
 6. d2m_mean
 7. msl_min
 8. msl_max
 9. msl_mean
10. u10_min
11. u10_max
12. u10_mean
13. v10_min
14. v10_max
15. v10_mean
16. tcc_min
17. tcc_max
18. tcc_mean
19. skt_min
20. skt_max
21. skt_mean

=== Target Columns (3) ===
1. t2m_min_next
2. t2m_max_next
3. t2m_mean_next


In [17]:
# ============================================
# COMPREHENSIVE REGION & DATA ANALYSIS
# ============================================
banner = "=" * 70
print("\n" + banner)
print("REGION & DATA COVERAGE ANALYSIS")
print(banner)

location_keys = ['latitude', 'longitude']

# 1. COUNT UNIQUE REGIONS
# One row per (latitude, longitude) pair with the number of rows it owns.
regions = df.groupby(location_keys).size().reset_index(name='row_count')
n_regions = len(regions)
rows_per_region = regions['row_count']

print("\n--- Region Statistics ---")
print(f"Total unique regions (lat/lon combinations): {n_regions}")
print(f"Average rows per region: {rows_per_region.mean():.1f}")
print(f"Min rows per region: {rows_per_region.min()}")
print(f"Max rows per region: {rows_per_region.max()}")
print(f"Median rows per region: {rows_per_region.median():.1f}")

# Ten regions with the most rows.
print("\n--- Region Distribution ---")
print(regions.sort_values('row_count', ascending=False).head(10))

# 2. TEMPORAL COVERAGE PER REGION
# First/last date and number of dated rows for every region.
print("\n--- Temporal Coverage Per Region ---")
temporal_stats = (
    df.groupby(location_keys)
      .agg(
          first_date=('date', 'min'),
          last_date=('date', 'max'),
          n_days=('date', 'count'),
      )
      .reset_index()
)

# Inclusive calendar span between first and last date, and the share of that
# span that actually has a row.
span = pd.to_datetime(temporal_stats['last_date']) - pd.to_datetime(temporal_stats['first_date'])
temporal_stats['duration_days'] = span.dt.days + 1
temporal_stats['coverage_pct'] = temporal_stats['n_days'] / temporal_stats['duration_days'] * 100

print(temporal_stats.head(10))

day_counts = temporal_stats['n_days']
durations = temporal_stats['duration_days']

print("\nOverall temporal statistics:")
print(f"  Mean coverage: {temporal_stats['coverage_pct'].mean():.1f}%")
print(f"  Min days per region: {day_counts.min()}")
print(f"  Max days per region: {day_counts.max()}")
print(f"  Total duration range: {durations.min()} - {durations.max()} days")




REGION & DATA COVERAGE ANALYSIS

--- Region Statistics ---
Total unique regions (lat/lon combinations): 2340
Average rows per region: 30.0
Min rows per region: 30
Max rows per region: 30
Median rows per region: 30.0

--- Region Distribution ---
      latitude  longitude  row_count
2339      36.0      -1.05         30
0         20.0      -9.80         30
1         20.0      -9.55         30
2         20.0      -9.30         30
2323      36.0      -5.05         30
2322      36.0      -5.30         30
2321      36.0      -5.55         30
2320      36.0      -5.80         30
2319      36.0      -6.05         30
2318      36.0      -6.30         30

--- Temporal Coverage Per Region ---
   latitude  longitude first_date  last_date  n_days  duration_days  \
0      20.0      -9.80 2025-01-01 2025-01-30      30             30   
1      20.0      -9.55 2025-01-01 2025-01-30      30             30   
2      20.0      -9.30 2025-01-01 2025-01-30      30             30   
3      20.0      -9.05 20

In [18]:
# ============================================
# DATA LEAKAGE PREDICTION & ANALYSIS
# ============================================
banner = "=" * 70
print("\n" + banner)
print("DATA LEAKAGE RISK ASSESSMENT")
print(banner)

# 3. THEORETICAL vs ACTUAL SAMPLES
# Upper bound used here: a region with k rows contributes k - WINDOW_SIZE
# samples, floored at zero. X_agg comes from a cell outside this view.
print("\n--- Sample Generation Analysis ---")
windows_per_region = regions['row_count'].sub(WINDOW_SIZE).clip(lower=0)
theoretical_samples = int(windows_per_region.sum())
actual_samples = len(X_agg)

print(f"Theoretical max samples (rows - window_size): {theoretical_samples:,}")
print(f"Actual samples created: {actual_samples:,}")

missing_samples = theoretical_samples - actual_samples
missing_pct = (1 - actual_samples/theoretical_samples)*100
print(f"Loss due to targets/NaN: {missing_samples:,} ({missing_pct:.2f}%)")

# 4. FEATURE-TARGET TEMPORAL RELATIONSHIP
print("\n--- Feature-Target Temporal Relationship ---")

# Spot-check up to 100 randomly chosen aggregated samples.
sample_size = min(100, len(X_agg))
sample_indices = np.random.choice(len(X_agg), sample_size, replace=False)

temporal_leaks = 0
for idx in sample_indices:
    pred_date = X_agg['date'].iloc[idx]
    lat = X_agg['latitude'].iloc[idx]
    lon = X_agg['longitude'].iloc[idx]

    # Chronological history of the sample's location in the raw table.
    same_location = (df['latitude'] == lat) & (df['longitude'] == lon)
    region_data = df.loc[same_location].sort_values('date')

    # Rows of that history carrying the sample's date; samples whose date is
    # absent from the raw table are skipped without being counted.
    date_hits = region_data.index[(region_data['date'] == pred_date).to_numpy()]
    if len(date_hits) == 0:
        continue

    # What this actually tests: a sample is flagged when fewer than
    # WINDOW_SIZE rows precede its date, i.e. a full look-back window could
    # not have come from earlier rows only.
    pred_row_idx = region_data.index.get_loc(date_hits[0])
    if pred_row_idx < WINDOW_SIZE:
        temporal_leaks += 1

leak_pct = (temporal_leaks / len(sample_indices)) * 100
print(f"Samples checked: {len(sample_indices)}")
print(f"Temporal leaks found: {temporal_leaks} ({leak_pct:.2f}%)")

if temporal_leaks == 0:
    print("✓ No temporal leakage detected")
else:
    print("⚠️  WARNING: Features using data from prediction date or future!")


# 5. TARGET INDEPENDENCE ANALYSIS
print("\n--- Target Independence Analysis ---")

# Aggregated-feature suffix -> label shown in the report. The dict order
# (last, mean, trend) is also the print order.
suffix_labels = {
    'last': 'last_value_t-1',
    'mean': 'window_mean',
    'trend': 'trend',
}

independence_results = {}

for target_col in target_cols:
    base_col = target_col.replace('_next', '')

    # Correlate the target with each aggregated feature of its base variable,
    # skipping features that are absent from X_agg.
    correlations = {
        label: X_agg[f"{base_col}_{suffix}"].corr(y_agg[target_col])
        for suffix, label in suffix_labels.items()
        if f"{base_col}_{suffix}" in X_agg.columns
    }
    independence_results[target_col] = correlations

    print(f"\n{target_col}:")
    for feat_name, corr in correlations.items():
        # |corr| above 0.95 is reported as suspiciously close to the target.
        is_high = abs(corr) > 0.95
        status = "⚠️  HIGH" if is_high else "✓ Normal"
        print(f"  Correlation with {feat_name:20s}: {corr:6.4f}  {status}")

        if is_high:
            print("    → Potential leakage: target too similar to feature")


# 6. VALUE MATCHING ANALYSIS
print("\n--- Value Matching Analysis (Leakage Detection) ---")

for target_col in target_cols:
    base_col = target_col.replace('_next', '')
    last_name = f"{base_col}_last"

    # Only targets whose "<base>_last" feature exists can be compared.
    if last_name not in X_agg.columns:
        continue

    last_feature = X_agg[last_name].values
    target_values = y_agg[target_col].values
    abs_diff = np.abs(last_feature - target_values)

    # Three tolerance levels: identical, within 0.01, within 1.0.
    exact_matches = (last_feature == target_values).sum()
    close_matches = (abs_diff < 0.01).sum()
    reasonable_matches = (abs_diff < 1.0).sum()
    total = len(target_values)

    exact_frac = exact_matches / total
    close_frac = close_matches / total
    reasonable_frac = reasonable_matches / total

    print(f"\n{target_col} vs {base_col}_last:")
    print(f"  Exact matches (0K diff):      {exact_matches:5d} / {total:5d} ({exact_frac*100:5.2f}%)")
    print(f"  Very close (< 0.01K diff):    {close_matches:5d} / {total:5d} ({close_frac*100:5.2f}%)")
    print(f"  Reasonable (< 1K diff):       {reasonable_matches:5d} / {total:5d} ({reasonable_frac*100:5.2f}%)")

    # Verdicts ordered from most to least alarming; the first one that
    # applies is reported.
    verdicts = [
        (exact_frac > 0.5, "  🚨 CRITICAL: >50% exact matches - DEFINITE LEAKAGE!"),
        (exact_frac > 0.1, "  ⚠️  WARNING: >10% exact matches - Possible leakage"),
        (close_frac > 0.5, "  ⚠️  SUSPICIOUS: >50% very close matches"),
        (reasonable_frac > 0.7, "  ✓ Normal: High persistence expected in weather data"),
    ]
    fallback = "  ✓ Good: Target shows independence from recent values"
    print(next((message for applies, message in verdicts if applies), fallback))


# 7. CROSS-VALIDATION LEAKAGE CHECK
print("\n--- Cross-Validation Leakage Risk ---")

# First/last date per region, plus the number of days between them.
date_ranges_per_region = df.groupby(['latitude', 'longitude'])['date'].agg(['min', 'max'])
region_span = pd.to_datetime(date_ranges_per_region['max']) - pd.to_datetime(date_ranges_per_region['min'])
date_ranges_per_region['span'] = region_span.dt.days

print("\nRegions with overlapping time periods:")
min_date = df['date'].min()
max_date = df['date'].max()
total_span_days = (max_date - min_date).days
print(f"  Global date range: {min_date} to {max_date}")
print(f"  Total span: {total_span_days} days")

# Count the regions whose own span equals the global span.
regions_full_span = (date_ranges_per_region['span'] == total_span_days).sum()
print(f"  Regions covering full span: {regions_full_span} / {n_regions}")

if regions_full_span != n_regions:
    print("  ✓ Regions have different temporal coverage")
    print("  → Lower risk of spatial leakage")
else:
    print("  ⚠️  All regions have same temporal coverage")
    print("  → Risk: Regional leakage if not using location-aware splits")


# 8. FINAL SUMMARY
# Condense the checks above into a score out of 5: one point per criterion
# that passes. Relies on leak_pct, independence_results, actual_samples and
# theoretical_samples computed earlier in this cell.
print("\n" + "="*70)
print("LEAKAGE ASSESSMENT SUMMARY")
print("="*70)

leakage_score = 0
max_score = 5

# Criterion 1: Temporal integrity
if leak_pct == 0:
    print("✓ Temporal integrity: PASS (no future data in features)")
    leakage_score += 1
else:
    print(f"✗ Temporal integrity: FAIL ({leak_pct:.1f}% temporal leaks)")

# Criterion 2: Target independence (correlation)
# Judge by magnitude, like the per-target report (abs(corr) > 0.95): a
# correlation near -1 is as suspicious as one near +1. NaN correlations are
# ignored so the maximum is well defined. When nothing could be measured the
# maximum is NaN, which fails the "< 0.95" test instead of raising on an
# empty max().
abs_corrs = [
    abs(corr)
    for corrs in independence_results.values()
    for corr in corrs.values()
    if pd.notna(corr)
]
max_corr = max(abs_corrs, default=float('nan'))
if max_corr < 0.95:
    print(f"✓ Correlation check: PASS (max correlation = {max_corr:.4f})")
    leakage_score += 1
else:
    print(f"✗ Correlation check: FAIL (max correlation = {max_corr:.4f})")

# Criterion 3: Exact value matches
# Share of samples whose "<base>_last" feature equals the target exactly,
# averaged over the targets that have such a feature.
avg_exact_match = np.mean([
    (X_agg[f"{col.replace('_next', '')}_last"] == y_agg[col]).sum() / len(y_agg)
    for col in target_cols if f"{col.replace('_next', '')}_last" in X_agg.columns
])
if avg_exact_match < 0.1:
    print(f"✓ Value matching: PASS (avg exact match = {avg_exact_match*100:.2f}%)")
    leakage_score += 1
else:
    print(f"✗ Value matching: FAIL (avg exact match = {avg_exact_match*100:.2f}%)")

# Criterion 4: Sample count integrity
# Passes when fewer than 15% of the theoretically possible samples are missing.
sample_loss_pct = (1 - actual_samples/theoretical_samples)*100
if sample_loss_pct < 15:
    print(f"✓ Sample integrity: PASS ({sample_loss_pct:.1f}% loss is acceptable)")
    leakage_score += 1
else:
    print(f"⚠️  Sample integrity: WARNING ({sample_loss_pct:.1f}% loss)")

# Criterion 5: No NaN contamination
# NOTE(review): feature_only_cols is defined outside the cells visible here —
# confirm it still exists after Restart & Run All.
if X_agg[feature_only_cols].isnull().sum().sum() == 0 and y_agg.isnull().sum().sum() == 0:
    print("✓ Data quality: PASS (no NaN values)")
    leakage_score += 1
else:
    print("✗ Data quality: FAIL (NaN values present)")

print(f"\n{'='*70}")
print(f"OVERALL LEAKAGE SCORE: {leakage_score}/{max_score}")
print(f"{'='*70}")

# Map the score to a verdict; every criterion carries the same weight.
if leakage_score == max_score:
    print("🎉 EXCELLENT: No data leakage detected. Safe to train!")
elif leakage_score >= 3:
    print("✓ GOOD: Minor issues but generally safe to proceed")
elif leakage_score >= 2:
    print("⚠️  WARNING: Some leakage concerns. Review carefully before training")
else:
    print("🚨 CRITICAL: Significant leakage detected. DO NOT TRAIN until fixed!")

print(f"{'='*70}\n")


DATA LEAKAGE RISK ASSESSMENT

--- Sample Generation Analysis ---
Theoretical max samples (rows - window_size): 53,820
Actual samples created: 53,820
Loss due to targets/NaN: 0 (0.00%)

--- Feature-Target Temporal Relationship ---
Samples checked: 100
Temporal leaks found: 0 (0.00%)
✓ No temporal leakage detected

--- Target Independence Analysis ---

t2m_min_next:
  Correlation with last_value_t-1      : 0.8038  ✓ Normal
  Correlation with window_mean         : 0.7319  ✓ Normal
  Correlation with trend               : 0.3570  ✓ Normal

t2m_max_next:
  Correlation with last_value_t-1      : 0.7723  ✓ Normal
  Correlation with window_mean         : 0.5810  ✓ Normal
  Correlation with trend               : 0.5145  ✓ Normal

t2m_mean_next:
  Correlation with last_value_t-1      : 0.8232  ✓ Normal
  Correlation with window_mean         : 0.6659  ✓ Normal
  Correlation with trend               : 0.4899  ✓ Normal

--- Value Matching Analysis (Leakage Detection) ---

t2m_min_next vs t2m_min_l

In [19]:
# Write the aggregated features and targets to separate CSVs (no index column).
X_agg.to_csv('features_aggregated.csv', index=False)
y_agg.to_csv('targets_aggregated.csv', index=False)

feature_rows, feature_width = X_agg.shape
target_rows, target_width = y_agg.shape
print(f"✓ Saved features_aggregated.csv ({feature_rows} rows, {feature_width} columns)")
print(f"✓ Saved targets_aggregated.csv ({target_rows} rows, {target_width} columns)")

# Optional single-file export with features and targets side by side.
# concat(axis=1) aligns on the index; assumes X_agg and y_agg share it — TODO confirm.
combined = pd.concat([X_agg, y_agg], axis=1)
combined.to_csv('data_combined_aggregated.csv', index=False)
combined_rows, combined_width = combined.shape
print(f"✓ Saved data_combined_aggregated.csv ({combined_rows} rows, {combined_width} columns)")

✓ Saved features_aggregated.csv (53820 rows, 171 columns)
✓ Saved targets_aggregated.csv (53820 rows, 3 columns)
✓ Saved data_combined_aggregated.csv (53820 rows, 174 columns)


# 📊 Feature Engineering: Understanding Window-Based Features

## 🎯 What Features Are Created?

For a **7-day sliding window**, each original feature (e.g., `t2m_min`) is transformed into **8 statistical features**:

| Feature Name | Description | Time Reference | Purpose |
|--------------|-------------|----------------|---------|
| `t2m_min_last` | Most recent value | Day t-1 (yesterday) | Captures immediate persistence |
| `t2m_min_first` | Oldest value in window | Day t-7 (7 days ago) | Baseline reference point |
| `t2m_min_mean` | Average over window | Days t-7 to t-1 | Overall trend level |
| `t2m_min_std` | Standard deviation | Days t-7 to t-1 | Variability/stability |
| `t2m_min_min` | Minimum in window | Days t-7 to t-1 | Extreme cold point |
| `t2m_min_max` | Maximum in window | Days t-7 to t-1 | Extreme warm point |
| `t2m_min_trend` | Change (last - first) | t-1 minus t-7 | Warming/cooling direction |
| `t2m_min_recent_3d` | Recent average | Days t-3, t-2, t-1 | Short-term trend |

---

## 📅 Temporal Structure

### Timeline Visualization:

```
Past ←────────────────────────────→ Prediction day
┌────┬────┬────┬────┬────┬────┬────┬────────────┐
│t-7 │t-6 │t-5 │t-4 │t-3 │t-2 │t-1 │   DAY t    │
└────┴────┴────┴────┴────┴────┴────┴────────────┘
  ↑                  └─────┬─────┘        ↑
  │                        │              │
_first                _recent_3d       TARGET
                                   (predict this)

_last = day t-1, the final column of the window
```

### Feature Coverage:

- **`_first`** = Day t-7 (start of window)
- **`_recent_3d`** = Average of Days t-3, t-2, t-1
- **`_last`** = Day t-1 (most recent observation)
- **Target** = Day t (what we're predicting)

---

## 🔍 Lag Interpretation

Our aggregated features already **implicitly contain lag information**:

| Aggregated Feature | Equivalent Lag Concept |
|--------------------|------------------------|
| `_last` | Lag 1 (yesterday) |
| `_recent_3d` | Average of Lags 1, 2, 3 |
| `_first` | Lag 7 (week ago) |
| `_mean` | Average of Lags 1-7 |
| `_trend` | Lag 7 → Lag 1 change |

### Why This Is Better Than Explicit Lags:

**Option A: Aggregated Features (Current Approach)** ✅
```python
# 8 features per variable
t2m_min_mean, t2m_min_std, t2m_min_min, t2m_min_max,
t2m_min_last, t2m_min_first, t2m_min_trend, t2m_min_recent_3d
```

**Option B: Explicit Lags** ❌ (Not recommended for Random Forest)
```python
# 7 features per variable
t2m_min_lag1, t2m_min_lag2, t2m_min_lag3, t2m_min_lag4,
t2m_min_lag5, t2m_min_lag6, t2m_min_lag7
```

### Advantages of Aggregated Features:

| Aspect | Aggregated | Explicit Lags |
|--------|-----------|---------------|
| **Captures trends** | ✅ Yes (_trend, _mean) | ❌ No |
| **Captures variability** | ✅ Yes (_std) | ❌ No |
| **Handles noise** | ✅ Better (averaging) | ❌ Sensitive |
| **Feature count** | ✅ Fixed (8 per var, whatever the window size) | ⚠️ Grows with window (7 per var here) |
| **Works with RF** | ✅ Excellent | ⚠️ Okay |
| **Works with LSTM** | ⚠️ Okay | ✅ Excellent |

---

## ⚠️ Data Leakage Considerations

### ✅ Safe: Time-Based Splits

```python
# Correct approach (what we use)
Train: [════════════════════════] 80% (oldest data)
Test:                           [═══] 20% (newest data)
       
Timeline (illustrative years): 2020 ──────────────→ 2023 ────→ 2024
```

**No leakage** because:
- Training data is from the past
- Test data is from the future
- No information flows backward in time

### ⚠️ Risk: Regional Leakage with Random Splits

```python
# WRONG: Random split across all regions
Region A: Train [═══] Test [═══] Train [═══] Test
Region B: Test [═══] Train [═══] Test [═══]
          
Same dates mixed in train and test → LEAKAGE!
```

**Problem**: If Region A's training rows cover the same dates as Region B's test rows, the model has already seen the weather of those dates during training. Scores on the test set then overstate how well it will predict genuinely unseen days.

### ✅ Solution: Location-Aware or Time-Based Splitting

**Option 1: Time-based split (Current approach)**
```python
# All regions follow same temporal split
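split_date = '2023-09-01'  # illustrative; choose a date inside your data's range (this dataset spans 2025-01-01 to 2025-01-30)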
train = data[data['date'] < split_date]
test = data[data['date'] >= split_date]
```

**Option 2: Location-based split (Alternative)**
```python
# Train on some locations, test on others
n_train = int(0.8 * len(regions))     # 80% of locations (1,872 of the 2,340 here)
train_locations = regions[:n_train]
test_locations = regions[n_train:]    # remaining 20% of locations
```

---

## 📈 Feature Importance Example

Typical importance ranking for temperature prediction:

```
1. t2m_mean_last          ███████████████████████ 18.5%
2. t2m_mean_recent_3d     ████████████████████ 15.2%
3. t2m_mean_mean          ███████████████ 12.8%
4. t2m_max_last           ████████████ 10.3%
5. t2m_mean_trend         ██████████ 8.7%
6. msl_mean_mean          ████████ 6.4%
7. t2m_min_last           ███████ 5.9%
8. d2m_mean_recent_3d     ██████ 4.8%
...
```

**Key insights**:
- Recent values (`_last`, `_recent_3d`) are most important
- Mean temperature features dominate
- Pressure (`msl`) provides additional signal
- Dewpoint temperature (`d2m`, a proxy for humidity) helps with precision

---

## 🎓 Summary

### What You Have:
- ✅ **Window size = 7 days** of historical weather
- ✅ **8 statistical features** per original variable
- ✅ **Implicit lag structure** (lag 1, lags 1-3, lag 7)
- ✅ **Time-based train/test split** (no temporal leakage)

### What You DON'T Need:
- ❌ Explicit lag features (redundant with current approach)
- ❌ More complex lag structures (RF handles this)
- ❌ Additional recent_Xd features (recent_3d is sufficient)

### When to Add Explicit Lags:
Only if:
1. Using LSTM/RNN models (need sequential input)
2. Model performance is poor (R² < 0.5)
3. Domain knowledge suggests specific periodicities

---

## 💡 Best Practice

**For Random Forest temperature prediction:**

```python
# ✅ RECOMMENDED: Current aggregated approach
window_size = 7
features_per_variable = 8  # mean, std, min, max, last, first, trend, recent_3d

# ❌ NOT NEEDED: Explicit lags
lag_features = [lag1, lag2, lag3, ...]  # Redundant for RF
```

**Your current feature engineering is optimal for the task!** 🎯