In [8]:
import numpy as np
import pandas as pd
import pywt
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 1. Utility Functions (Unchanged from Original)
def cyclical_encode(series, period):
    return (np.sin(2 * np.pi * series / period),
            np.cos(2 * np.pi * series / period))

def wavelet_denoise(series, wavelet='db4', level=3):
    coeffs = pywt.wavedec(series, wavelet, mode='per', level=level)
    sigma = np.median(np.abs(coeffs[-level])) / 0.6745
    uthresh = sigma * np.sqrt(2 * np.log(len(series)))
    coeffs[1:] = [pywt.threshold(c, uthresh, mode='soft') for c in coeffs[1:]]
    return pywt.waverec(coeffs, wavelet, mode='per')[:len(series)]

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path, parse_dates=['Date'], dayfirst=True)
    return data.sort_values('Date').reset_index(drop=True)

# 2. Feature Engineering (Unchanged from Original)
def create_features(data):
    # Temporal features
    data['Month_sin'], data['Month_cos'] = cyclical_encode(data['Date'].dt.month, 12)
    data['Day_sin'], data['Day_cos'] = cyclical_encode(data['Date'].dt.day, 31)
    data['Weekday_sin'], data['Weekday_cos'] = cyclical_encode(data['Date'].dt.dayofweek, 7)
    
    # Denoising
    data['Rainfall_log1p'] = np.log1p(data['Rainfall'].values)
    data['Rainfall_denoised'] = wavelet_denoise(data['Rainfall_log1p'].values)
    
    temp_cols = ['MinTemp', 'MaxTemp', '9amTemp', '3pmTemp']
    for col in temp_cols:
        data[f'{col}_denoised'] = wavelet_denoise(data[col].values)
    
    # Lag features
    lag_window = 7
    for col in temp_cols:
        denoised_col = f'{col}_denoised'
        for lag in range(1, lag_window + 1):
            data[f'{denoised_col}_lag{lag}'] = data[denoised_col].shift(lag)
    
    for lag in range(1, lag_window + 1):
        data[f'Rainfall_Rainfall_lag{lag}'] = data['Rainfall_denoised'].shift(lag)
    
    return data.dropna().reset_index(drop=True), temp_cols + ['Rainfall']

# 3. Modified Training and Forecasting with XGBoost
def train_and_forecast(data, target_col, temporal_features, target_cols):
    # Feature setup
    if target_col == 'Rainfall':
        denoised_col = 'Rainfall_denoised'
        features = [f'Rainfall_Rainfall_lag{i}' for i in range(1, 8)]
    else:
        denoised_col = f'{target_col}_denoised'
        features = [f'{denoised_col}_lag{i}' for i in range(1, 8)]
    
    features += temporal_features
    
    # Prepare data
    X = data[features]
    y = data[denoised_col]
    
    # Time-series split
    split = int(0.8 * len(X))
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]
    
    # Create and train XGBoost model
    model = xgb.XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=20,
        objective='reg:squarederror'
    )
    
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    # Generate predictions
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    # Convert from log-scale for Rainfall
    if target_col == 'Rainfall':
        y_train_actual = np.expm1(y_train)
        y_test_actual = np.expm1(y_test)
        train_pred = np.expm1(train_pred)
        test_pred = np.expm1(test_pred)
    else:
        y_train_actual = y_train.values
        y_test_actual = y_test.values
        train_pred = train_pred
        test_pred = test_pred
    
    # Generate 365-day forecast
    forecast_steps = 365
    forecast_dates = pd.date_range(
        start=data['Date'].iloc[-1] + pd.Timedelta(days=1),
        periods=forecast_steps
    )
    
    current_features = X.iloc[-1].copy()
    forecast_values = []
    
    for _ in range(forecast_steps):
        # Make prediction
        pred = model.predict(current_features.values.reshape(1, -1))[0]
        forecast_values.append(pred)
        
        # Update lag features
        if target_col == 'Rainfall':
            for lag in range(6, 0, -1):
                current_features[f'Rainfall_Rainfall_lag{lag+1}'] = current_features[f'Rainfall_Rainfall_lag{lag}']
            current_features['Rainfall_Rainfall_lag1'] = pred
        else:
            for lag in range(6, 0, -1):
                current_features[f'{denoised_col}_lag{lag+1}'] = current_features[f'{denoised_col}_lag{lag}']
            current_features[f'{denoised_col}_lag1'] = pred
        
        # Update cyclical features
        current_date = forecast_dates[len(forecast_values)-1]
        current_features['Month_sin'], current_features['Month_cos'] = cyclical_encode(current_date.month, 12)
        current_features['Day_sin'], current_features['Day_cos'] = cyclical_encode(current_date.day, 31)
        current_features['Weekday_sin'], current_features['Weekday_cos'] = cyclical_encode(current_date.weekday(), 7)
    
    # Convert forecast to original scale
    forecast_actual = np.expm1(forecast_values) if target_col == 'Rainfall' else forecast_values
    
    # Calculate metrics
    metrics = {
        'Train MAE': mean_absolute_error(y_train_actual, train_pred),
        'Test MAE': mean_absolute_error(y_test_actual, test_pred),
        'Train RMSE': np.sqrt(mean_squared_error(y_train_actual, train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test_actual, test_pred)),
        'Train R²': r2_score(y_train_actual, train_pred),
        'Test R²': r2_score(y_test_actual, test_pred)
    }
    
    # =======================================
    # Plotting (Unchanged from Original)
    # =======================================
    fig = make_subplots(rows=2, cols=1,
                        subplot_titles=(f'Training: {target_col}',
                                        f'Test: {target_col}'))
    
    # Training plot
    date_train = data['Date'].iloc[:split]
    fig.add_trace(go.Scatter(x=date_train, y=y_train_actual, name='Actual (Train)'),
                  row=1, col=1)
    fig.add_trace(go.Scatter(x=date_train, y=train_pred, name='Predicted (Train)'),
                  row=1, col=1)
    
    # Test plot
    date_test = data['Date'].iloc[split:]
    fig.add_trace(go.Scatter(x=date_test, y=y_test_actual, name='Actual (Test)'),
                  row=2, col=1)
    fig.add_trace(go.Scatter(x=date_test, y=test_pred, name='Predicted (Test)'),
                  row=2, col=1)
    
    fig.update_layout(height=600, title_text=f'{target_col} Forecast Performance')
    fig.show()
    
    # =======================================
    # Forecast Plot (Unchanged from Original)
    # =======================================
    forecast_fig = go.Figure()
    forecast_fig.add_trace(go.Scatter(
        x=data['Date'], y=data[target_col], name='Historical'
    ))
    forecast_fig.add_trace(go.Scatter(
        x=forecast_dates, y=forecast_actual, name='Forecast'
    ))
    forecast_fig.update_layout(title=f'{target_col} 1-Year Forecast')
    forecast_fig.show()
    
    # Save results
    train_df = pd.DataFrame({'Date': date_train, 'Actual': y_train_actual, 'Predicted': train_pred})
    test_df = pd.DataFrame({'Date': date_test, 'Actual': y_test_actual, 'Predicted': test_pred})
    forecast_df = pd.DataFrame({'Date': forecast_dates, 'Predicted': forecast_actual})
    combined_df = pd.concat([train_df, test_df, forecast_df])
    combined_df.to_csv(f'xgboost_forecast_{target_col}.csv', index=False)
    
    return model, metrics

# 4. Main Execution (Unchanged from Original)
if __name__ == "__main__":
    file_path = "data/Cleaned_TemperatureRainFall.csv"
    data = load_and_preprocess_data(file_path)
    data, target_cols = create_features(data)
    
    results = {}
    for target in target_cols:
        print(f"\n=== Training {target} ===")
        model, metrics = train_and_forecast(data, target, 
                                           temporal_features=['Month_sin', 'Month_cos',
                                                             'Day_sin', 'Day_cos',
                                                             'Weekday_sin', 'Weekday_cos'],
                                           target_cols=target_cols)
        results[target] = metrics
        print(pd.Series(metrics))
    
    pd.DataFrame(results).T.to_csv('xgboost_forecast_metrics.csv')
    print("\nAll forecasts saved!")


=== Training MinTemp ===


Train MAE     0.039612
Test MAE      0.237053
Train RMSE    0.051263
Test RMSE     0.316129
Train R²      0.999911
Test R²       0.996133
dtype: float64

=== Training MaxTemp ===


Train MAE     0.052610
Test MAE      0.255213
Train RMSE    0.068417
Test RMSE     0.343700
Train R²      0.999909
Test R²       0.997849
dtype: float64

=== Training 9amTemp ===


Train MAE     0.018532
Test MAE      0.219419
Train RMSE    0.024297
Test RMSE     0.293766
Train R²      0.999983
Test R²       0.997379
dtype: float64

=== Training 3pmTemp ===


Train MAE     0.051241
Test MAE      0.244844
Train RMSE    0.066872
Test RMSE     0.326990
Train R²      0.999904
Test R²       0.997815
dtype: float64

=== Training Rainfall ===


Train MAE     0.013680
Test MAE      0.078145
Train RMSE    0.019372
Test RMSE     0.226523
Train R²      0.999556
Test R²       0.949599
dtype: float64

All forecasts saved!


In [19]:
! pip install dash

Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Using cached flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting importlib-metadata (from dash)
  Downloading importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting itsdangerous>=2.1.2 (from Flask<3.1,>=1.0.4->dash)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting zipp>=

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.2 which is incompatible.
