### **MACHINE LEARNING**
 Training and testing historical data to make prediction
 
 Core logic Mainly use XGBoost( eXtreme Gradient Boosting) framework


In [48]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import timedelta
from sklearn.metrics import mean_absolute_error


#Load and Prepare Data
print("Loading PP_Historical_Weather.csv...")
df = pd.read_csv('PP_Historical_Weather.csv', skiprows=16)

# Create date column from YEAR, MO, DY
df = df.rename(columns={'YEAR': 'year', 'MO': 'month', 'DY': 'day'})
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
df = df.set_index('date').sort_index()

#Handle missing values (-999)
df = df.replace(-999, np.nan)
df = df.dropna()  # Drop rows with NaN (last few days are missing)

#Convert wind speed from m/s to km/h (consistent with previous datasets)
df['wind_speed_10m_kmh'] = df['WS10M'] * 3.6

# Targets (weather variables to predict)
targets = [
    'T2M_MAX',      # Max temp (°C)
    'T2M_MIN',      # Min temp (°C)
    'T2M',          # Mean temp (°C)
    'PRECTOTCORR',  # Precip (mm/day)
    'wind_speed_10m_kmh'  # Wind speed (km/h)
]

# Additional features: humidity, pressure, wind direction
df['wind_dir_rad'] = np.deg2rad(df['WD10M'])
df['wind_dir_sin'] = np.sin(df['wind_dir_rad'])
df['wind_dir_cos'] = np.cos(df['wind_dir_rad'])

print(f"Loaded {len(df)} valid days up to {df.index[-1].date()}")

# #load data and clean data 
# def loaddata(filename):
#     print(f'loading data from {filename}')
#     # Try reading with skiprows=17, if columns missing, try without skipping
#     try:
#         df = pd.read_csv(filename, skiprows=16)
#         df.columns = [col.upper() for col in df.columns]  # Standardize column names to uppercase
#         if not all(col in df.columns for col in ['YEAR', 'MO', 'DY']):
#             # Try reading without skipping rows
#             df = pd.read_csv(filename)
#             df.columns = [col.upper() for col in df.columns]
#     except Exception as e:
#         print("Error reading CSV:", e)
#         df = pd.read_csv(filename)
#         df.columns = [col.upper() for col in df.columns]

#     # Check if year, month, day columns exist
#     if not all(col in df.columns for col in ['year', 'month', 'day']):
#         raise KeyError("CSV file must contain 'year', 'month', 'day' columns for date construction.")

#     # Create date column from year, month, day
#     df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
#     df = df.set_index('date').sort_index()

#     # Handle missing values (-999)
#     df = df.replace(-999, np.nan)
#     df = df.dropna()  # Drop rows with NaN (last few day are missing)
#     return df


targets = [
        'T2M_MAX', #max temp 2m in °C
        'T2M_MIN', #min temp 2m in °C
        'T2M', #mean temp
        'RH2M',  #Relative Humidity at 2 Meters (%)
        'WS10M',    #Wind Speed at 10 Meters (m/s)
        'WD10M',   #Wind Direction at 10 Meters (Degrees)
        'PRECTOTCORR' #Precipitation (mm/day)
        ]

def engineer_features(df):
    df = df.copy()
    
    # Lags: Past 1-7 days for all targets
    for col in targets:
        for lag in range(1, 8):
            df[f'{col}_lag{lag}'] = df[col].shift(lag)
    
    # Rolling stats
    df['temp_max_roll_mean_3'] = df['T2M_MAX'].rolling(3).mean()
    df['temp_max_roll_mean_7'] = df['T2M_MAX'].rolling(7).mean()
    df['precip_roll_sum_7'] = df['PRECTOTCORR'].rolling(7).sum()
    df['humidity_roll_mean_7'] = df['RH2M'].rolling(7).mean()  # Add humidity trend
    df['pressure_roll_mean_7'] = df['PS'].rolling(7).mean()    # Pressure trend
    
    # Date features (seasonality)
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['dayofweek'] = df.index.dayofweek
    df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)
    
    # Drop NaN from shifts/rolling
    return df.dropna()

data = engineer_features(df)
    

Loading PP_Historical_Weather.csv...
Loaded 5800 valid days up to 2025-11-17


##### TrainXGBOOST MODEL

In [49]:
feature_cols = [col for col in data.columns 
                if col not in targets + ['WD10M', 'wind_dir_rad', 'WS10M']]  # Exclude raw wind dir, speed

models = {}
print("Training XGBoost models...")
for target in targets:
    print(f"  → Training {target}...")
    model = xgb.XGBRegressor(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1,
        tree_method='hist'
    )
    model.fit(data[feature_cols], data[target])
    models[target] = model
    
    # Quick accuracy check on full data
    mae = mean_absolute_error(data[target], model.predict(data[feature_cols]))
    print(f"     MAE: {mae:.2f}")

last_row = data.iloc[-1]  # Last valid day for forecasting start
print("Training complete!")

Training XGBoost models...
  → Training T2M_MAX...
     MAE: 0.01
  → Training T2M_MIN...
     MAE: 0.02
  → Training T2M...
     MAE: 0.01
  → Training RH2M...
     MAE: 0.07
  → Training WS10M...
     MAE: 0.00
  → Training WD10M...
     MAE: 0.08
  → Training PRECTOTCORR...
     MAE: 0.05
Training complete!


Test forcast

In [None]:
def forecast_next_7_days(last_row, models, feature_cols, days=7):
    current = last_row.copy()
    dates = pd.date_range(start=current.name + timedelta(days=1), periods=days)
    forecasts = []

    for _ in range(days):
        X = current[feature_cols].values.reshape(1, -1)
        pred = {}
        for target in targets:
            pred[target] = models[target].predict(X)[0]
        pred['PRECTOTCORR'] = max(0, pred['PRECTOTCORR'])  # No negative precip
        
        # Keep wind direction from recent (or predict if model added)
        pred['WD10M'] = current['WD10M']
        
        forecasts.append(pred)
        
        # Update current row for next iteration
        new_row = current.copy()
        for t in targets:
            new_row[t] = pred[t]
        
        next_date = current.name + timedelta(days=1)
        new_row.name = next_date
        
        # Update date features
        new_row['month'] = next_date.month
        new_row['day'] = next_date.day
        new_row['dayofweek'] = next_date.dayofweek
        new_row['is_weekend'] = 1 if next_date.weekday() >= 5 else 0
        
        # Update lags (shift forward)
        for t in targets:
            for lag in range(7, 1, -1):
                new_row[f'{t}_lag{lag}'] = current[f'{t}_lag{lag-1}']
            new_row[f'{t}_lag1'] = pred[t]
        
        # Update rolling features (approximate with recent lags)
        new_row['temp_max_roll_mean_3'] = np.mean([
            pred['T2M_MAX'], current['T2M_MAX_lag1'], current['T2M_MAX_lag2']
        ])
        new_row['temp_max_roll_mean_7'] = np.mean([
            pred['T2M_MAX']] + [current[f'T2M_MAX_lag{i}'] for i in range(1, 7)]
        )
        new_row['precip_roll_sum_7'] = sum([
            pred['PRECTOTCORR']] + [current[f'PRECTOTCORR_lag{i}'] for i in range(1, 7)]
        )
        new_row['humidity_roll_mean_7'] = np.mean([
            current['RH2M']] + [current[f'RH2M_lag{i}'] for i in range(1, 7)]  # Approximate
        )
        new_row['pressure_roll_mean_7'] = np.mean([
            current['PS']] + [current[f'PS_lag{i}'] for i in range(1, 7)]  # Add lags for these if needed
        )

        # Update wind sin/cos
        rad = np.deg2rad(pred['WD10M'])
        new_row['wind_dir_sin'] = np.sin(rad)
        new_row['wind_dir_cos'] = np.cos(rad)
        
        current = new_row

    forecast_df = pd.DataFrame(forecasts, index=dates).round(1)
    
    # Add weather description
    desc = []
    for _, row in forecast_df.iterrows():
        precip = row['PRECTOTCORR']
        temp_max = row['T2M_MAX']
        if precip < 0.5:
            desc.append("Sunny" if temp_max >= 33 else "Mostly Sunny" if temp_max >= 30 else "Partly Cloudy")
        elif precip < 3:
            desc.append("Light Rain")
        elif precip < 10:
            desc.append("Rainy")
        else:
            desc.append("Heavy Rain")
    forecast_df['Weather'] = desc
    
    return forecast_df