### MACHINE LEARING USING XGBOOST 
#### TRAIN AND TEST


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import timedelta
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

class WeatherForecaster:
    def __init__(self):
        self.models = {}
        self.feature_cols = []
        self.last_row = None
        self.daily_df = None

#data cleaning
    def load_data(self, filepath):
        print(f"Loading data from {filepath}...")
        self.daily_df = pd.read_excel(filepath, skiprows=3) #skip the first 3 role since it just some info and latitute + longtitute
        
        # Fixing Excel serial dates since the given data date is not in the right format
        if pd.api.types.is_numeric_dtype(self.daily_df['time']):
            self.daily_df['time'] = pd.to_datetime(self.daily_df['time'], unit='D', origin='1899-12-30')
        self.daily_df = self.daily_df.set_index('time').sort_index()
        print(f"Loaded {len(self.daily_df)} days up to {self.daily_df.index[-1].date()}")

#FUNCTION TO HELP TO MAKE SURE IT PREDICT MORE ACCURATE
    def _engineer_features(self, df):
        targets = [
            'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'temperature_2m_mean (°C)',
            'precipitation_sum (mm)', 'wind_speed_10m_max (km/h)', 'wind_gusts_10m_max (km/h)'
        ]

        # Lags: Past 1-7 days
        for col in targets:
            for lag in range(1, 8):
                df[f'{col}_lag{lag}'] = df[col].shift(lag)

        # Rolling stats
        df['temp_max_roll_mean_3'] = df['temperature_2m_max (°C)'].rolling(3).mean()
        df['temp_max_roll_mean_7'] = df['temperature_2m_max (°C)'].rolling(7).mean()
        df['precip_roll_sum_7'] = df['precipitation_sum (mm)'].rolling(7).sum()

        # Date features
        df['month'] = df.index.month
        df['day'] = df.index.day
        df['dayofweek'] = df.index.dayofweek
        df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)

        # Wind direction as circular (sin/cos)
        df['wind_dir_rad'] = np.deg2rad(df['wind_direction_10m_dominant (°)'])
        df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
        df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])

        return df.dropna()

#TRAINING THE MODEL
    def train(self):
        print("Engineering features...")
        df = self._engineer_features(self.daily_df.copy())

        targets = [
            'temperature_2m_max (°C)', 'temperature_2m_min (°C)', 'temperature_2m_mean (°C)',
            'precipitation_sum (mm)', 'wind_speed_10m_max (km/h)', 'wind_gusts_10m_max (km/h)'
        ]

        self.feature_cols = [c for c in df.columns if c not in targets + ['wind_direction_10m_dominant (°)', 'wind_dir_rad']]

        print("Training models...")
        for target in targets:
            model = xgb.XGBRegressor(
                n_estimators=800, learning_rate=0.03, max_depth=8,
                subsample=0.8, colsample_bytree=0.8,
                reg_alpha=0.1, reg_lambda=1.0,
                random_state=42, n_jobs=-1, tree_method='hist'
            )
            model.fit(df[self.feature_cols], df[target])
            self.models[target] = model
            mae = mean_absolute_error(df[target], model.predict(df[self.feature_cols]))
            print(f"{target}: MAE = {mae:.2f}")

        self.last_row = df.iloc[-1]
        print("Training complete!")

#Predict next 7 days (recursive forecasting)
    def forecast(self, days=7):
        if not self.models:
            raise ValueError("Train the model first with .train()")

        targets = list(self.models.keys())
        
        # Force start from TOMORROW, regardless of last data date
        today = pd.Timestamp.now().normalize()  # Today at 00:00
        start_date = today + pd.Timedelta(days=1)   # Tomorrow
        dates = pd.date_range(start_date, periods=days, freq='D')

        # Use the last known row as base (from training data)
        current = self.last_row.copy()

        forecasts = []

        for i in range(days):
            # Prepare input features
            X_input = current[self.feature_cols].values.reshape(1, -1)
            pred = {}

            # Predict all targets
            for target in targets:
                pred[target] = self.models[target].predict(X_input)[0]

            # Ensure no negative precipitation
            pred['precipitation_sum (mm)'] = max(0, pred['precipitation_sum (mm)'])

            # Carry forward wind direction (or predict if you trained a model for it)
            pred['wind_direction_10m_dominant (°)'] = current['wind_direction_10m_dominant (°)']

            forecasts.append(pred)

            # Create next row for recursive prediction
            new_row = current.copy()

            # Update actual target values with predictions
            for target in targets:
                new_row[target] = pred[target]

            # Set correct date for next iteration
            next_date = dates[i]  # Use actual forecast date
            new_row.name = next_date

            # Update date-based features
            new_row['month'] = next_date.month
            new_row['day'] = next_date.day
            new_row['dayofweek'] = next_date.dayofweek
            new_row['is_weekend'] = 1 if next_date.weekday() >= 5 else 0

            # Update lag features (shift everything down)
            for target in targets:
                for lag in range(7, 1, -1):
                    new_row[f'{target}_lag{lag}'] = current[f'{target}_lag{lag-1}']
                new_row[f'{target}_lag1'] = pred[target]

            # Update rolling features manually
            new_row['temp_max_roll_mean_3'] = np.mean([
                pred['temperature_2m_max (°C)'],
                current['temperature_2m_max (°C)_lag1'],
                current[f'temperature_2m_max (°C)_lag2']
            ])

            new_row['temp_max_roll_mean_7'] = np.mean([
                pred['temperature_2m_max (°C)'],
                current['temperature_2m_max (°C)_lag1'],
                current['temperature_2m_max (°C)_lag2'],
                current['temperature_2m_max (°C)_lag3'],
                current['temperature_2m_max (°C)_lag4'],
                current['temperature_2m_max (°C)_lag5'],
                current['temperature_2m_max (°C)_lag6'],
            ])

            new_row['precip_roll_sum_7'] = (
                pred['precipitation_sum (mm)'] +
                sum(current[f'precipitation_sum (mm)_lag{i}'] for i in range(1, 7))
            )

            # Update wind direction circular features
            rad = np.deg2rad(pred['wind_direction_10m_dominant (°)'])
            new_row['wind_dir_sin'] = np.sin(rad)
            new_row['wind_dir_cos'] = np.cos(rad)

            current = new_row

        # Create final DataFrame
        forecast_df = pd.DataFrame(forecasts, index=dates)
        forecast_df['wind_direction_10m_dominant (°)'] %= 360
        forecast_df = forecast_df.round(1)

        # Add weather description
        desc = []
        for _, row in forecast_df.iterrows():
            p = row['precipitation_sum (mm)']
            t = row['temperature_2m_max (°C)']
            if p < 0.5:
                if t >= 34: desc.append("Very Hot & Sunny")
                elif t >= 32: desc.append("Sunny")
                elif t >= 29: desc.append("Mostly Sunny")
                else: desc.append("Partly Cloudy")
            elif p < 2.0: desc.append("Light Rain")
            elif p < 7.0: desc.append("Rainy")
            elif p < 15.0: desc.append("Heavy Rain")
            else: desc.append("Very Heavy Rain")
        forecast_df['Weather'] = desc

        return forecast_df

    def print_forecast(self, forecast_df):
        """Print the forecast in a clean table"""
        print("\n7-DAY WEATHER FORECAST FOR PHNOM PENH")
        print("="*100)
        print(f"{'Date':<12} {'Max Temp':>10} {'Min Temp':>10} {'Mean Temp':>12} {'Rain':>8} {'Max Wind':>12} {'Max Gust':>12} {'WindDir':>10} {'Weather':<15}")
        print("-"*100)
        for date, row in forecast_df.iterrows():
            print(f"{date.strftime('%Y-%m-%d'):<12} {row['temperature_2m_max (°C)']:>8.1f}°C {row['temperature_2m_min (°C)']:>8.1f}°C {row['temperature_2m_mean (°C)']:>10.1f}°C {row['precipitation_sum (mm)']:>6.1f}mm {row['wind_speed_10m_max (km/h)']:>8.1f}km/h {row['wind_gusts_10m_max (km/h)']:>8.1f}km/h {row['wind_direction_10m_dominant (°)']:>8.0f}° {row['Weather']:<15}")
        print("="*100)


#to use other file just change the file path (but make sure the file have the same format)
if __name__ == "__main__":
    forecaster = WeatherForecaster()
    forecaster.load_data('test3.xlsx')
    forecaster.train()
    forecast = forecaster.forecast(days=7)
    forecaster.print_forecast(forecast)

Loading data from test3.xlsx...
Loaded 326 days up to 2025-11-22
Engineering features...
Training models...
temperature_2m_max (°C): MAE = 0.00
temperature_2m_min (°C): MAE = 0.00
temperature_2m_mean (°C): MAE = 0.00
precipitation_sum (mm): MAE = 0.00
wind_speed_10m_max (km/h): MAE = 0.00
wind_gusts_10m_max (km/h): MAE = 0.01
Training complete!

7-DAY WEATHER FORECAST FOR PHNOM PENH
Date           Max Temp   Min Temp    Mean Temp     Rain     Max Wind     Max Gust    WindDir Weather        
----------------------------------------------------------------------------------------------------
2025-11-27       28.7°C     22.3°C       25.0°C    0.3mm     19.1km/h     43.2km/h       10° Partly Cloudy  
2025-11-28       29.7°C     22.7°C       25.2°C    1.3mm     20.4km/h     47.4km/h       10° Light Rain     
2025-11-29       29.4°C     22.8°C       25.0°C    0.9mm     21.3km/h     48.2km/h       10° Light Rain     
2025-11-30       29.3°C     22.5°C       24.8°C    1.2mm     22.3km/h     48