In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from datetime import datetime, timedelta
import random
import json

# --- Core ML Pipeline Class (OOP) ---

class WeatherForecastSystem:
    """
    Encapsulates the entire ML weather forecasting pipeline.
    It handles data loading, feature engineering, model training, and forecasting.
    """
    
    # Define targets and excluded columns as class-level constants
    TARGETS = [
        'T2M_MAX', 'T2M_MIN', 'T2M', 'RH2M', 
        'WS10M', 'WD10M', 'PRECTOTCORR', 'PS' # PS is included for feature engineering, but not predicted
    ]
    
    def __init__(self, city_name="Phnom Penh"):
        """
        Initializes the system, loading data from 'PP_test5nasa.csv'.
        """
        self.city_name = city_name
        self.raw_data = None
        self.engineered_data = None
        self.models = {}
        self.training_summary = {}
        self.feature_cols = []
        self.last_valid_day = None

        data_file = 'PP_test5nasa.csv'
        try:
            # Load the specified training data file, skipping header rows
            self.raw_data = pd.read_csv(data_file, skiprows=16) 
            print(f"SUCCESS: Loaded data from '{data_file}'.")
        except FileNotFoundError:
            # Raise an error if the required CSV file is missing
            raise FileNotFoundError(f"ERROR: Training data file '{data_file}' not found. Please ensure it is available in the current directory.")

    def _clean_data(self):
        """
        Initial data cleaning and transformation (renaming, setting index,
        handling missing values, and generating wind direction features).
        """
        df = self.raw_data.copy()
        
        # 1. Create date column and set index (keeping variable names from original code)
        df = df.rename(columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day'})
        df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
        df = df.set_index('date').sort_index()
        
        # 2. Handle missing values (-999 in NASA POWER data)
        df = df.replace(-999, np.nan)
        # Drop rows where any of the target columns have missing data
        df = df.dropna(subset=self.TARGETS, how='any')  
        
        # 3. Feature calculation (Wind direction components)
        df['wind_dir_rad'] = np.deg2rad(df['WD10M'])
        df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
        df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])
        
        self.raw_data = df
        print(f"Loaded and cleaned {len(df)} valid days up to {df.index[-1].date()}")
        return df

    def _engineer_features(self, df):
        """
        Creates time-series (lag) and rolling statistical features.
        """
        df = df.copy()
        targets_to_engineer = [t for t in self.TARGETS if t != 'PS'] 
        
        # Lags: Past 1-7 days for all targets
        for col in targets_to_engineer:
            for lag in range(1, 8):
                df[f'{col}_lag{lag}'] = df[col].shift(lag)
        
        # Rolling stats for key variables
        df['temp_max_roll_mean_3'] = df['T2M_MAX'].rolling(3).mean()
        df['temp_max_roll_mean_7'] = df['T2M_MAX'].rolling(7).mean()
        df['precip_roll_sum_7'] = df['PRECTOTCORR'].rolling(7).sum()
        df['humidity_roll_mean_7'] = df['RH2M'].rolling(7).mean()
        df['pressure_roll_mean_7'] = df['PS'].rolling(7).mean()
        
        # Date features (seasonality)
        df['month'] = df.index.month
        df['day'] = df.index.day
        df['dayofweek'] = df.index.dayofweek
        df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)
        
        # Drop rows with NaN from shifting/rolling operations
        return df.dropna()

    def train_model(self):
        """
        Trains the XGBoost model for each target variable required for forecasting.
        The performance metric (MAE) is stored in self.training_summary.
        """
        data = self._clean_data()
        self.engineered_data = self._engineer_features(data)
        
        # Targets that are actually being predicted (excluding pressure, which is an input feature)
        targets_for_training = [t for t in self.TARGETS if t not in ['PS']]

        # Define features: lags, rolling stats, date features, pressure, and wind direction components
        exclude_cols = targets_for_training + ['WD10M', 'wind_dir_rad', 'WS10M']
        self.feature_cols = [col for col in self.engineered_data.columns 
                             if col not in exclude_cols and ('_lag' in col or 'roll' in col or col in ['month', 'day', 'dayofweek', 'is_weekend', 'PS', 'wind_dir_sin', 'wind_dir_cos'])]
        
        # Filter for unique feature columns
        self.feature_cols = list(set(self.feature_cols)) 

        X = self.engineered_data[self.feature_cols]
        self.last_valid_day = self.engineered_data.iloc[-1]
        
        print(f"\nTraining XGBoost models on {len(X)} samples...")
        
        for target in targets_for_training:
            print(f"  → Training model for {target}...")
            
            # XGBoost Model configuration (as in original code)
            model = xgb.XGBRegressor(
                n_estimators=800, learning_rate=0.03, max_depth=6, subsample=0.8,
                colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0,
                random_state=42, n_jobs=-1, tree_method='hist' 
            )
            
            model.fit(X, self.engineered_data[target])
            self.models[target] = model
            
            # Quick accuracy check on full training data
            y_pred = model.predict(X)
            mae = mean_absolute_error(self.engineered_data[target], y_pred)
            # Store training summary information
            unit = "°C" if 'T2M' in target else ""
            self.training_summary[target] = f"MAE: {mae:.3f}{unit}"
            print(f"     Accuracy on training set: {self.training_summary[target]}")

        print("Training complete! Ready for forecasting.")

    @staticmethod
    def _degrees_to_direction(deg):
        """
        Utility function to convert degrees to cardinal direction (N, NE, E, etc.).
        """
        deg = deg % 360
        if deg < 22.5 or deg >= 337.5:   return "N"
        elif deg < 67.5:                 return "NE"
        elif deg < 112.5:                return "E"
        elif deg < 157.5:                return "SE"
        elif deg < 202.5:                return "S"
        elif deg < 247.5:                return "SW"
        elif deg < 292.5:                return "W"
        else:                            return "NW"

    def forecast_next_n_days(self, days=7):
        """
        Recursively forecasts the next N days by using the previous day's
        prediction as input features for the current day's prediction.
        """
        if not self.models or self.last_valid_day is None:
            raise RuntimeError("Model must be trained before forecasting.")

        current_row = self.last_valid_day.copy()
        forecasts = []
        targets_for_forecasting = [t for t in self.TARGETS if t not in ['PS']]

        print(f"\nGenerating recursive forecast for the next {days} days...")

        for i in range(1, days + 1):
            # 1. Prepare input for prediction
            X = current_row[self.feature_cols].values.reshape(1, -1)
            pred = {}
            
            # 2. Predict all targets for the next day
            for target in targets_for_forecasting:
                pred[target] = self.models[target].predict(X)[0]
            
            # Apply constraints (e.g., precipitation must be non-negative)
            pred['PRECTOTCORR'] = max(0.0, pred['PRECTOTCORR'])
            
            forecast_date = current_row.name + timedelta(days=1)
            
            # Determine weather description
            precip = pred['PRECTOTCORR']
            temp_mean = pred['T2M']
            if precip < 0.5:
                desc = "Sunny" if temp_mean >= 33 else "Mostly Sunny" if temp_mean >= 30 else "Partly Cloudy"
            elif precip < 3:
                desc = "Light Rain"
            elif precip < 10:
                desc = "Rainy"
            else:
                desc = "Heavy Rain"

            # Store results in the visualization-friendly structure
            forecasts.append({
                "city": self.city_name,
                "date": forecast_date.strftime("%Y-%m-%d"),
                "max_temp": round(pred['T2M_MAX'], 1),
                "min_temp": round(pred['T2M_MIN'], 1),
                "mean_temp": round(pred['T2M'], 1),
                "rain_mm": round(pred['PRECTOTCORR'], 1),
                "wind_speed_kmh": round(pred['WS10M'] * 3.6, 1),
                "wind_dir_deg": round(pred['WD10M'], 0),
                "wind_direction": self._degrees_to_direction(pred['WD10M']),
                "description": desc
            })

            # 3. Update 'current_row' (feature vector) for the next iteration (day i+1)
            new_row = current_row.copy()
            new_row.name = forecast_date
            
            # Update date features
            new_row['month'] = forecast_date.month
            new_row['day'] = forecast_date.day
            new_row['dayofweek'] = forecast_date.dayofweek
            new_row['is_weekend'] = 1 if forecast_date.weekday() >= 5 else 0

            # Update lag features: Shift all existing lags and set lag 1 to the prediction
            for t in targets_for_forecasting:
                for lag in range(7, 1, -1):
                    new_row[f'{t}_lag{lag}'] = current_row[f'{t}_lag{lag-1}']
                new_row[f'{t}_lag1'] = pred[t]
            
            # Update rolling features (approximation)
            new_row['temp_max_roll_mean_3'] = np.mean([
                pred['T2M_MAX'], new_row['T2M_MAX_lag2'], new_row['T2M_MAX_lag3']
            ])
            new_row['temp_max_roll_mean_7'] = np.mean([
                pred['T2M_MAX']] + [new_row[f'T2M_MAX_lag{i}'] for i in range(2, 8)]
            )
            new_row['precip_roll_sum_7'] = sum([
                pred['PRECTOTCORR']] + [new_row[f'PRECTOTCORR_lag{i}'] for i in range(2, 8)]
            )
            # Pressure rolling mean approximation (using existing PS value as a proxy)
            new_row['pressure_roll_mean_7'] = new_row['PS']
            
            # Update wind sin/cos based on predicted wind direction
            rad = np.deg2rad(pred['WD10M'])
            new_row['wind_dir_sin'] = np.sin(rad)
            new_row['wind_dir_cos'] = np.cos(rad)

            # Set the new target values for the next iteration's feature calculation
            for t in targets_for_forecasting:
                 new_row[t] = pred[t] 

            current_row = new_row

        return forecasts

    def run_full_pipeline(self, days=7):
        """
        Runs the full sequence: Train models and generate a forecast.
        Returns the forecast data in a visualization-friendly list format.
        """
        self.train_model()
        forecast_data = self.forecast_next_n_days(days)
        return forecast_data

    def get_training_summary(self):
        """
        Provides accessible information about the trained models and their performance.
        """
        return self.training_summary

# --- Demonstration and Output Formatting ---

def ml_forecast_data(city_name="Phnom Penh"):
    """
    This function mimics the user's requested callable format,
    running the OOP system and returning the structured output.
    """
    try:
        # 1. Initialize and run the full pipeline
        system = WeatherForecastSystem(city_name=city_name) 
        forecast_results = system.run_full_pipeline(days=7)
        
        # 2. Get the training metadata
        training_info = system.get_training_summary()
        
        print("\n--- Training Performance Summary ---")
        for target, metrics in training_info.items():
            print(f"  {target}: {metrics}")
        print("------------------------------------")

        return forecast_results

    except FileNotFoundError as e:
        # Handle the specific case where the CSV file is missing
        print(e)
        return []
    except Exception as e:
        print(f"An unexpected error occurred during ML pipeline execution: {e}")
        return []

if __name__ == '__main__':
    # Execute the function to generate and display the results
    forecast_data = ml_forecast_data("Phnom Penh")

    if forecast_data:
        # Print the final result in a clean, tabular format
        print("\n7-DAY WEATHER FORECAST RESULT (Callable Output):")
        
        # Display the results
        print("-" * 100)
        print(f"{'Date':<12} {'Max Temp':>12} {'Min Temp':>12} {'Rain':>9} {'Wind':>12} {'Dir':>10} {'Weather':<20}")
        print("-" * 100)
        
        for item in forecast_data:
            print(f"{item['date']:<12} "
                  f"{item['max_temp']:>11.1f}° "
                  f"{item['min_temp']:>11.1f}° "
                  f"{item['rain_mm']:>8.1f} "
                  f"{item['wind_speed_kmh']:>11.1f} "
                  f"{item['wind_direction']:>10}   "
                  f"{item['description']:<20}")
        
        print("-" * 100)
        print("This output format is ready for visualization parts of your application.")

SUCCESS: Loaded data from 'PP_test5nasa.csv'.
Loaded and cleaned 327 valid days up to 2025-11-23

Training XGBoost models on 320 samples...
  → Training model for T2M_MAX...
     Accuracy on training set: MAE: 0.005°C
  → Training model for T2M_MIN...
     Accuracy on training set: MAE: 0.005°C
  → Training model for T2M...
     Accuracy on training set: MAE: 0.005°C
  → Training model for RH2M...
     Accuracy on training set: MAE: 0.006
  → Training model for WS10M...
     Accuracy on training set: MAE: 0.004
  → Training model for WD10M...
     Accuracy on training set: MAE: 0.014
  → Training model for PRECTOTCORR...
     Accuracy on training set: MAE: 0.007
Training complete! Ready for forecasting.

Generating recursive forecast for the next 7 days...

--- Training Performance Summary ---
  T2M_MAX: MAE: 0.005°C
  T2M_MIN: MAE: 0.005°C
  T2M: MAE: 0.005°C
  RH2M: MAE: 0.006
  WS10M: MAE: 0.004
  WD10M: MAE: 0.014
  PRECTOTCORR: MAE: 0.007
------------------------------------

7-DA