In [2]:
pip install xgboost requests pandas numpy datetime scikit-learn

Collecting xgboost
  Using cached xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting datetime
  Using cached DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-7.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DateTime-5.5-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDo

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

############################################################
# Revised code to avoid issues with older XGBoost versions:
#
# Changes made:
#  - Removed `early_stopping_rounds` and `eval_set` from model.fit()
#    as some older versions of XGBoost do not support these arguments.
#  - Removed passing eval_metric and verbose to fit. Instead, we rely on
#    default parameters and do our own evaluation after training.
#  - Removed deprecated `fillna(method='...')`, using `ffill()` and `bfill()`.
#
# This code:
# 1. Generates synthetic (dummy) data for a variety of factors.
# 2. Creates lag features.
# 3. Trains an XGBoost model using time series split.
# 4. Selects the best model based on RMSE.
# 5. Forecasts for a future 7-day window (with dummy stable conditions).
#
# Note: This is a conceptual demonstration, not a production solution.
############################################################

import pandas as pd
import numpy as np
import datetime
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

########################
# Configuration Section
########################

END_DATE = datetime.date.today()
START_DATE = END_DATE - datetime.timedelta(days=365)
date_range = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
N = len(date_range)

np.random.seed(42)

########################
# Generate Dummy Data
########################

def generate_dummy_df(dates, col_name, mean, std):
    return pd.DataFrame({
        'date': dates,
        col_name: np.random.normal(mean, std, size=len(dates))
    })

# Geological/Environmental:
eq_df = pd.DataFrame({
    'date': date_range,
    'eq_count': np.random.poisson(lam=2, size=N),
    'eq_mean_magnitude': np.random.normal(3, 0.5, size=N)
})
soil_moisture_df = generate_dummy_df(date_range, 'soil_moisture', mean=0.25, std=0.05)
frost_df = generate_dummy_df(date_range, 'frost_depth', mean=0.1, std=0.05)
stream_df = generate_dummy_df(date_range, 'stream_flow', mean=500, std=50)

# Infrastructure:
rail_df = generate_dummy_df(date_range, 'rail_capacity', mean=1000, std=100)
port_df = generate_dummy_df(date_range, 'port_congestion_index', mean=50, std=10)
grid_df = generate_dummy_df(date_range, 'grid_stability', mean=0.95, std=0.02)
maintenance_df = pd.DataFrame({
    'date': date_range,
    'planned_outages_count': np.random.poisson(lam=1, size=N)
})

# Market Indicators:
futures_df = generate_dummy_df(date_range, 'futures_volume', mean=10000, std=2000)
currency_df = generate_dummy_df(date_range, 'currency_strength_index', mean=1.0, std=0.05)
storage_df = generate_dummy_df(date_range, 'storage_level', mean=5000, std=500)
refinery_df = generate_dummy_df(date_range, 'refinery_utilization', mean=0.85, std=0.05)

# Target Fuel Price:
fuel_prices_df = generate_dummy_df(date_range, 'fuel_price', mean=2.5, std=0.3)

########################
# Merge All Data
########################

dfs = [
    eq_df, soil_moisture_df, frost_df, stream_df, rail_df, port_df, grid_df,
    maintenance_df, futures_df, currency_df, storage_df, refinery_df
]

master_df = pd.DataFrame({'date': date_range})
for d in dfs:
    master_df = pd.merge(master_df, d, on='date', how='left')

master_df = pd.merge(master_df, fuel_prices_df, on='date', how='left')

# Fill missing values
master_df = master_df.ffill().bfill()

# Ensure we have target
master_df = master_df.dropna(subset=['fuel_price'])

########################
# Feature Engineering
########################

def create_lag_features(df, cols, lags=[1,7,30]):
    for c in cols:
        for l in lags:
            df[f'{c}_lag{l}'] = df[c].shift(l)
    return df

feature_cols = [c for c in master_df.columns if c not in ['date','fuel_price']]
master_df = create_lag_features(master_df, feature_cols)
master_df = master_df.dropna()

X = master_df.drop(columns=['date','fuel_price'])
y = master_df['fuel_price']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

########################
# Modeling & Training
########################

tscv = TimeSeriesSplit(n_splits=5)
best_rmse = float('inf')
best_model = None

# Since older versions of XGBoost might not accept early_stopping_rounds
# or eval_set, we train simply and evaluate ourselves.
for train_idx, test_idx in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
    model.fit(X_train, y_train)  # No eval_set or early_stopping_rounds
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model

print("Best RMSE:", best_rmse)

########################
# Forecasting
########################

future_dates = pd.date_range(start=END_DATE + datetime.timedelta(days=1), periods=7, freq='D')
future_df = pd.DataFrame({'date': future_dates})

# For demonstration, we assume no new data changes and reuse last known state
last_known = master_df.iloc[-1:].copy()

predictions = []
current_state = last_known.copy()

for future_date in future_dates:
    current_row = current_state.drop(columns=['date','fuel_price']).copy()
    current_row_df = pd.DataFrame(current_row.values, columns=current_row.columns)
    current_row_scaled = scaler.transform(current_row_df)

    pred_price = best_model.predict(current_row_scaled)[0]
    predictions.append({'date': future_date, 'predicted_fuel_price': pred_price})

    new_day = current_state.iloc[-1:].copy()
    new_day['date'] = future_date
    new_day['fuel_price'] = pred_price
    # Not realistically updating lag features, but in a real pipeline you would.
    current_state = pd.concat([current_state, new_day], ignore_index=True)

pred_df = pd.DataFrame(predictions)
print(pred_df)


Best RMSE: 0.3067435316325371
        date  predicted_fuel_price
0 2024-12-20              2.566265
1 2024-12-21              2.566265
2 2024-12-22              2.566265
3 2024-12-23              2.566265
4 2024-12-24              2.566265
5 2024-12-25              2.566265
6 2024-12-26              2.566265
