In [6]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.base import clone

# ---------- Paths ----------
# Input models: one pickled estimator per forecast target. The dict keys double
# as the target column names passed to rolling_backtest by the driver loop.
model_paths = {
    "usage_cpu": "../backend/models/usage_cpu_best_model.pkl",
    "usage_storage": "../backend/models/usage_storage_best_model.pkl",
    "users_active": "../backend/models/users_active_best_model.pkl",
}

# Output locations for the models returned by rolling_backtest; same keys as
# model_paths so the driver loop can look up the destination by target name.
model_paths_save = {
    "usage_cpu": "../backend/models/backtested_models/usage_cpu_best_model.pkl",
    "usage_storage": "../backend/models/backtested_models/usage_storage_best_model.pkl",
    "users_active": "../backend/models/backtested_models/users_active_best_model.pkl",
}

# All paths are relative to the notebook's working directory.
data_path = "../Data/models/enhanced_features.csv"
# Used as a plain string prefix when building metric file names, so the
# trailing slash is required.
results_dir = "./results/"

# ---------- Load data ----------
df = pd.read_csv(data_path)
# Parse dates once here; the backtest calls .strftime on the date values and
# the forecast adds a time delta to the latest date.
df["date"] = pd.to_datetime(df["date"])

# ---------- Load models ----------
# joblib.load unpickles arbitrary Python objects: only point these paths at
# model files you trust.
models = {key: joblib.load(path) for key, path in model_paths.items()}

# ---------- Metrics functions ----------
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10%).

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / max(|y_true|, 1e-8)``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Clamp the *magnitude* of the actual value. Clamping the signed value
    # would replace every negative actual with 1e-8 and inflate the error by
    # many orders of magnitude. The 1e-8 floor still guards against division
    # by zero when an actual is exactly 0.
    denominator = np.maximum(np.abs(y_true), 1e-8)
    return np.mean(np.abs(y_true - y_pred) / denominator)

def bias(y_true, y_pred):
    """Return the mean signed error ``mean(y_pred - y_true)``.

    A positive result means the predictions are, on average, above the
    actual values.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.
    """
    # Coerce to arrays so plain Python lists work (list - list raises
    # TypeError) and so the subtraction is positional rather than aligned on
    # pandas index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(y_pred - y_true)

# ---------- Rolling backtest & fine-tune ----------
def rolling_backtest(df, target, model, window_size=30):
    """Run a one-step-ahead, expanding-window backtest and return the last refit model.

    Rows are ordered by ``date``. For every row after the first
    ``window_size`` rows, a clone of ``model`` is fit on all earlier rows and
    used to predict that single row.

    Args:
        df: DataFrame with a ``date`` column, the ``target`` column and the
            feature columns. Every column other than ``date``, ``unique_id``,
            ``usage_cpu``, ``usage_storage`` and ``users_active`` is used as a
            feature.
        target: Name of the column to predict.
        model: Estimator accepted by ``sklearn.base.clone`` that provides
            ``fit`` and ``predict``. The object passed in is not modified.
        window_size: Number of leading rows reserved for the first fit.

    Returns:
        A ``(metrics, fitted_model)`` tuple. ``metrics`` has one row per
        predicted date with columns ``date``, ``mae``, ``rmse``, ``mape`` and
        ``bias``. ``fitted_model`` is the clone as left by the final ``fit``
        call, i.e. trained on every row except the last one.

    Raises:
        ValueError: If ``df`` has no more than ``window_size`` rows, because
            no fold could be evaluated and the returned model would be
            unfitted.
    """
    df = df.sort_values("date").copy()

    n_folds = len(df) - window_size
    if n_folds <= 0:
        raise ValueError(
            f"rolling_backtest needs more than window_size={window_size} rows, got {len(df)}"
        )

    non_feature_cols = ["date", "usage_cpu", "usage_storage", "users_active", "unique_id"]
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    # Clone first, then tweak the clone: calling set_params on `model` itself
    # would permanently change the caller's estimator.
    temp_model = clone(model)

    # Disable early stopping for small rolling windows
    if hasattr(temp_model, "early_stopping_rounds"):
        temp_model.set_params(early_stopping_rounds=None)

    metrics_list = []
    for start in range(n_folds):
        split = start + window_size
        # The training slice always starts at row 0, so the window expands by
        # one row per fold.
        train = df.iloc[:split]
        test = df.iloc[split : split + 1]

        X_train, y_train = train[feature_cols], train[target]
        X_test, y_test = test[feature_cols], test[target]

        # NOTE(review): for most scikit-learn style estimators fit() retrains
        # from scratch rather than continuing from the previous fold; confirm
        # for the pickled model type if incremental fine-tuning is expected.
        temp_model.fit(X_train, y_train)
        y_pred = temp_model.predict(X_test)

        # Each fold scores a single row, so mae and rmse are equal per fold;
        # aggregate the returned frame to get meaningful summary statistics.
        metrics_list.append({
            "date": test["date"].iloc[0].strftime("%Y-%m-%d"),
            "mae": mean_absolute_error(y_test, y_pred),
            "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
            "mape": mean_absolute_percentage_error(y_test, y_pred),
            "bias": bias(y_test, y_pred),
        })

    return pd.DataFrame(metrics_list), temp_model

# ---------- Run backtest for all targets ----------
import os

all_metrics = {}

# Create the metrics folder up front so a fresh checkout does not fail on the
# first to_csv call.
os.makedirs(results_dir or ".", exist_ok=True)

for target, model in models.items():
    print(f"Backtesting {target}...")
    metrics_df, fine_tuned_model = rolling_backtest(df, target, model)

    # Save fine-tuned model (creating its destination folder if needed)
    model_save_path = model_paths_save[target]
    os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)
    joblib.dump(fine_tuned_model, model_save_path)

    # Save metrics
    metrics_csv_path = f"{results_dir}{target}_backtest_metrics.csv"
    metrics_df.to_csv(metrics_csv_path, index=False)

    # Keep the per-target metrics in memory for later inspection.
    all_metrics[target] = metrics_df
    print(f"Saved metrics for {target} to {metrics_csv_path} and fine-tuned model updated.")

# ---------- Example: Forward Forecast (April) ----------
from datetime import timedelta

def forecast_future(df, target, model, days=30):
    """Recursively forecast ``target`` for ``days`` days after the last date in ``df``.

    Each step copies the most recent feature row, refreshes the date-derived
    and target-derived features that exist in ``df``, predicts one value and
    feeds that prediction back into the history used for the next step's lag
    and rolling features.

    Args:
        df: Historical feature table with a ``date`` column and the
            ``target`` column. Every column other than ``date``,
            ``unique_id``, ``usage_cpu``, ``usage_storage`` and
            ``users_active`` is passed to the model as a feature.
        target: Name of the column being forecast. Lag features are expected
            under ``{target}_lag_{1,7,14}`` and rolling features under
            ``{target}_roll_mean_{7,14}`` / ``{target}_roll_std_{7,14}``;
            any of these that are absent from ``df`` are skipped.
        model: Fitted estimator exposing ``predict``.
        days: Number of daily steps to forecast.

    Returns:
        DataFrame with columns ``date`` (``YYYY-MM-DD`` string) and
        ``predicted`` (float), one row per forecast day.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("forecast_future needs at least one historical row")

    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    last_date = df["date"].max()

    known_cols = set(df.columns)
    non_feature_cols = ["date", "usage_cpu", "usage_storage", "users_active", "unique_id"]
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    lags = (1, 7, 14)
    windows = (7, 14)
    max_lookback = max(lags + windows)

    # Track the target history in a plain list instead of growing a DataFrame
    # with concat on every step.
    history = df[target].tolist()
    row = df.iloc[-1:].copy()
    forecasts = []

    for step in range(1, days + 1):
        next_date = last_date + pd.Timedelta(days=step)
        row["date"] = next_date

        # Refresh date-derived features so they describe the forecast day
        # rather than the last historical day. Only columns already present
        # in the data are touched.
        # NOTE(review): conventions assumed here are Monday=0 for dayofweek
        # and Saturday/Sunday as weekend; confirm they match how the training
        # features were built.
        calendar = {
            "month": next_date.month,
            "dayofweek": next_date.dayofweek,
            "dayofmonth": next_date.day,
            "quarter": (next_date.month - 1) // 3 + 1,
            "is_weekend": 1 if next_date.weekday() >= 5 else 0,
        }
        for col, value in calendar.items():
            if col in known_cols:
                row[col] = value

        # Update lags and rolling stats from the observed + predicted history.
        recent = pd.Series(history[-max_lookback:], dtype="float64")
        for lag in lags:
            col = f"{target}_lag_{lag}"
            if col in known_cols:
                # Fall back to the mean when the history is shorter than the lag.
                row[col] = recent.iloc[-lag] if len(recent) >= lag else recent.mean()
        for win in windows:
            roll_vals = recent.iloc[-win:]
            mean_col = f"{target}_roll_mean_{win}"
            std_col = f"{target}_roll_std_{win}"
            if mean_col in known_cols:
                row[mean_col] = roll_vals.mean()
            if std_col in known_cols:
                row[std_col] = roll_vals.std(ddof=0)

        pred = float(model.predict(row[feature_cols])[0])
        row[target] = pred
        history.append(pred)

        forecasts.append({"date": next_date.strftime("%Y-%m-%d"), "predicted": pred})

    return pd.DataFrame(forecasts)



AttributeError: module 'scipy.sparse' has no attribute 'linalg'

In [12]:
# Forecast 30 days of usage_cpu past the last date in df and write the result
# (columns: date, predicted) to CSV.
# NOTE: this uses whatever estimator is currently stored in
# models["usage_cpu"], not the refit model that the backtest loop saved to
# model_paths_save.
cpu_forecast = forecast_future(df, "usage_cpu", models["usage_cpu"], days=30)
cpu_forecast.to_csv("./results/cpu_april_forecast.csv", index=False)

In [4]:

# Rebinds the notebook-level `model_paths` and `models` so that they contain
# only the backtested usage_storage model.
# NOTE: this cell has no imports of its own; it relies on `joblib`, `pd`, `np`
# and `timedelta` having been imported by another cell first.
model_paths = {
    'usage_storage': '../backend/models/backtested_models/usage_storage_best_model.pkl',
}

# joblib.load unpickles arbitrary Python objects: only load trusted files.
models = {name: joblib.load(path) for name, path in model_paths.items()}

# ------------------------------
# Forecast Function for April (Friend-like)
# ------------------------------
def forecast_april_simulated(model, target_col, df, horizon=30, variability_factor=5, random_state=None):
    """
    Recursively forecast `target_col` for `horizon` days with noisy lag features.

    Each step copies the most recent feature row, refreshes the date-derived
    features, rebuilds the lag / rolling features from the running history
    with a random relative perturbation, predicts one value and appends that
    prediction to the history used by the next step.

    Args:
        model: Fitted estimator exposing `predict`.
        target_col: Name of the column being forecast. Lag features are
            expected under `{target_col}_lag_{1,7,14}` and rolling features
            under `{target_col}_roll_mean_{7,14}` / `{target_col}_roll_std_{7,14}`;
            columns absent from `df` are skipped.
        df: Historical feature table with a `date` column and `target_col`.
            Every column other than `date`, `unique_id`, `usage_cpu`,
            `usage_storage` and `users_active` is passed to the model.
        horizon: Number of daily steps to forecast.
        variability_factor: Standard deviation of the simulated relative
            change, in percent (5 means 5%). It used to be applied as a raw
            fraction, i.e. a 500% standard deviation for the default, which
            contradicted the documented "small daily fluctuation" intent.
        random_state: Optional seed for reproducible noise. When None the
            global `np.random` state is used.

    Returns:
        List of dicts with keys 'date' ('YYYY-MM-DD' string) and 'predicted'
        (float), one per forecast day.

    Raises:
        ValueError: If `df` has no rows.
    """
    if len(df) == 0:
        raise ValueError('forecast_april_simulated needs at least one historical row')

    hist_df = df.copy()
    hist_df['date'] = pd.to_datetime(hist_df['date'])
    # The last row is used as the template for every step, so make sure it is
    # the most recent one. A stable sort leaves already-ordered data untouched.
    hist_df = hist_df.sort_values('date', kind='stable')
    last_date = hist_df['date'].max()

    # Fix the feature list once, from the original columns. Recomputing it
    # inside the loop would pick up any column added while forecasting and
    # hand the model a different feature matrix from the second step on.
    known_cols = set(hist_df.columns)
    non_feature_cols = ['date', 'usage_cpu', 'usage_storage', 'users_active', 'unique_id']
    feature_cols = [col for col in hist_df.columns if col not in non_feature_cols]

    sigma = variability_factor / 100.0
    rng = np.random if random_state is None else np.random.default_rng(random_state)

    lags = (1, 7, 14)
    windows = (7, 14)
    max_lookback = max(lags + windows)

    history = hist_df[target_col].tolist()
    row = hist_df.iloc[-1:].copy()
    forecasts = []

    for step in range(1, horizon + 1):
        next_date = last_date + pd.Timedelta(days=step)
        row['date'] = next_date

        # Date-derived features for the forecast day (only where the column
        # exists in the input data).
        calendar = {
            'month': next_date.month,
            'dayofweek': next_date.dayofweek,
            'dayofmonth': next_date.day,
            'quarter': (next_date.month - 1) // 3 + 1,
            'is_weekend': 1 if next_date.weekday() >= 5 else 0,
        }
        for col, value in calendar.items():
            if col in known_cols:
                row[col] = value

        recent = pd.Series(history[-max_lookback:], dtype='float64')

        # Propagate lag features with a small relative daily change.
        for lag in lags:
            if len(recent) >= lag:
                value = recent.iloc[-lag] * (1 + rng.normal(0, sigma))
            else:
                # Not enough history for this lag: use the mean instead.
                value = recent.mean()
            col = f"{target_col}_lag_{lag}"
            if col in known_cols:
                row[col] = value

        # Propagate rolling features similarly.
        for win in windows:
            if len(recent) >= win:
                adjusted_window = recent.iloc[-win:] * (1 + rng.normal(0, sigma, size=win))
                roll_mean = adjusted_window.mean()
                roll_std = adjusted_window.std(ddof=0)
            else:
                roll_mean = recent.mean()
                roll_std = recent.std(ddof=0)
            mean_col = f"{target_col}_roll_mean_{win}"
            std_col = f"{target_col}_roll_std_{win}"
            if mean_col in known_cols:
                row[mean_col] = roll_mean
            if std_col in known_cols:
                row[std_col] = roll_std

        # Predict
        pred = float(model.predict(row[feature_cols])[0])
        row[target_col] = pred
        history.append(pred)

        forecasts.append({'date': next_date.strftime('%Y-%m-%d'), 'predicted': pred})

    return forecasts

# ------------------------------
# Example Usage
# ------------------------------
# Reload the feature table from disk. forecast_april_simulated parses the
# 'date' column itself, so no conversion is needed here.
encoded_insights = pd.read_csv('../Data/models/enhanced_features.csv')

# NOTE(review): despite the "_cpu" suffix this variable holds the
# usage_storage forecast (both the model and the target passed below are
# 'usage_storage'). Consider renaming it once no other cell depends on it.
april_forecast_cpu = forecast_april_simulated(models['usage_storage'], 'usage_storage', encoded_insights, horizon=30)

# Each entry is a dict with 'date' and 'predicted' keys.
for f in april_forecast_cpu:
    print(f)


AttributeError: module 'scipy.sparse' has no attribute 'linalg'

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.base import clone

# ---------- Paths ----------
# Input model: a single pickled estimator, keyed by the target column name
# that the driver loop passes to rolling_backtest.
model_paths = {
    "usage_storage": "../backend/models/usage_storage_best_enhanced_fixed_model.pkl"
}

# Output location for the model returned by rolling_backtest; same key as
# model_paths so the driver loop can look up the destination by target name.
model_paths_save = {
    "usage_storage": "../backend/models/backtested_models/usage_storage_best_e_model.pkl",
}

# All paths are relative to the notebook's working directory.
data_path = "../Data/models/enhanced_features.csv"
# Used as a plain string prefix when building metric file names, so the
# trailing slash is required.
results_dir = "./results/"

# ---------- Load data ----------
df = pd.read_csv(data_path)
# Parse dates once here; the backtest calls .strftime on the date values and
# the forecast adds a time delta to the latest date.
df["date"] = pd.to_datetime(df["date"])

# ---------- Load models ----------
# joblib.load unpickles arbitrary Python objects: only point these paths at
# model files you trust.
models = {key: joblib.load(path) for key, path in model_paths.items()}

# ---------- Metrics functions ----------
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10%).

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / max(|y_true|, 1e-8)``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Clamp the *magnitude* of the actual value. Clamping the signed value
    # would replace every negative actual with 1e-8 and inflate the error by
    # many orders of magnitude. The 1e-8 floor still guards against division
    # by zero when an actual is exactly 0.
    denominator = np.maximum(np.abs(y_true), 1e-8)
    return np.mean(np.abs(y_true - y_pred) / denominator)

def bias(y_true, y_pred):
    """Return the mean signed error ``mean(y_pred - y_true)``.

    A positive result means the predictions are, on average, above the
    actual values.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.
    """
    # Coerce to arrays so plain Python lists work (list - list raises
    # TypeError) and so the subtraction is positional rather than aligned on
    # pandas index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(y_pred - y_true)

# ---------- Rolling backtest & fine-tune ----------
def rolling_backtest(df, target, model, window_size=30):
    """Run a one-step-ahead, expanding-window backtest and return the last refit model.

    Rows are ordered by ``date``. For every row after the first
    ``window_size`` rows, a clone of ``model`` is fit on all earlier rows and
    used to predict that single row.

    Args:
        df: DataFrame with a ``date`` column, the ``target`` column and the
            feature columns. Every column other than ``date``, ``unique_id``,
            ``usage_cpu``, ``usage_storage`` and ``users_active`` is used as a
            feature.
        target: Name of the column to predict.
        model: Estimator accepted by ``sklearn.base.clone`` that provides
            ``fit`` and ``predict``. The object passed in is not modified.
        window_size: Number of leading rows reserved for the first fit.

    Returns:
        A ``(metrics, fitted_model)`` tuple. ``metrics`` has one row per
        predicted date with columns ``date``, ``mae``, ``rmse``, ``mape`` and
        ``bias``. ``fitted_model`` is the clone as left by the final ``fit``
        call, i.e. trained on every row except the last one.

    Raises:
        ValueError: If ``df`` has no more than ``window_size`` rows, because
            no fold could be evaluated and the returned model would be
            unfitted.
    """
    df = df.sort_values("date").copy()

    n_folds = len(df) - window_size
    if n_folds <= 0:
        raise ValueError(
            f"rolling_backtest needs more than window_size={window_size} rows, got {len(df)}"
        )

    non_feature_cols = ["date", "usage_cpu", "usage_storage", "users_active", "unique_id"]
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    # Clone first, then tweak the clone: calling set_params on `model` itself
    # would permanently change the caller's estimator.
    temp_model = clone(model)

    # Disable early stopping for small rolling windows
    if hasattr(temp_model, "early_stopping_rounds"):
        temp_model.set_params(early_stopping_rounds=None)

    metrics_list = []
    for start in range(n_folds):
        split = start + window_size
        # The training slice always starts at row 0, so the window expands by
        # one row per fold.
        train = df.iloc[:split]
        test = df.iloc[split : split + 1]

        X_train, y_train = train[feature_cols], train[target]
        X_test, y_test = test[feature_cols], test[target]

        # NOTE(review): for most scikit-learn style estimators fit() retrains
        # from scratch rather than continuing from the previous fold; confirm
        # for the pickled model type if incremental fine-tuning is expected.
        temp_model.fit(X_train, y_train)
        y_pred = temp_model.predict(X_test)

        # Each fold scores a single row, so mae and rmse are equal per fold;
        # aggregate the returned frame to get meaningful summary statistics.
        metrics_list.append({
            "date": test["date"].iloc[0].strftime("%Y-%m-%d"),
            "mae": mean_absolute_error(y_test, y_pred),
            "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
            "mape": mean_absolute_percentage_error(y_test, y_pred),
            "bias": bias(y_test, y_pred),
        })

    return pd.DataFrame(metrics_list), temp_model

# ---------- Run backtest for all targets ----------
import os

all_metrics = {}

# Create the metrics folder up front so a fresh checkout does not fail on the
# first to_csv call.
os.makedirs(results_dir or ".", exist_ok=True)

for target, model in models.items():
    print(f"Backtesting {target}...")
    metrics_df, fine_tuned_model = rolling_backtest(df, target, model)

    # Save fine-tuned model (creating its destination folder if needed)
    model_save_path = model_paths_save[target]
    os.makedirs(os.path.dirname(model_save_path) or ".", exist_ok=True)
    joblib.dump(fine_tuned_model, model_save_path)

    # Save metrics
    metrics_csv_path = f"{results_dir}{target}_backtest_metrics.csv"
    metrics_df.to_csv(metrics_csv_path, index=False)

    # Keep the per-target metrics in memory for later inspection.
    all_metrics[target] = metrics_df
    print(f"Saved metrics for {target} to {metrics_csv_path} and fine-tuned model updated.")

# ---------- Example: Forward Forecast (April) ----------
from datetime import timedelta

def forecast_future(df, target, model, days=30):
    """Recursively forecast ``target`` for ``days`` days after the last date in ``df``.

    Each step copies the most recent feature row, refreshes the date-derived
    and target-derived features that exist in ``df``, predicts one value and
    feeds that prediction back into the history used for the next step's lag
    and rolling features.

    Args:
        df: Historical feature table with a ``date`` column and the
            ``target`` column. Every column other than ``date``,
            ``unique_id``, ``usage_cpu``, ``usage_storage`` and
            ``users_active`` is passed to the model as a feature.
        target: Name of the column being forecast. Lag features are expected
            under ``{target}_lag_{1,7,14}`` and rolling features under
            ``{target}_roll_mean_{7,14}`` / ``{target}_roll_std_{7,14}``;
            any of these that are absent from ``df`` are skipped.
        model: Fitted estimator exposing ``predict``.
        days: Number of daily steps to forecast.

    Returns:
        DataFrame with columns ``date`` (``YYYY-MM-DD`` string) and
        ``predicted`` (float), one row per forecast day.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("forecast_future needs at least one historical row")

    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    last_date = df["date"].max()

    known_cols = set(df.columns)
    non_feature_cols = ["date", "usage_cpu", "usage_storage", "users_active", "unique_id"]
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    lags = (1, 7, 14)
    windows = (7, 14)
    max_lookback = max(lags + windows)

    # Track the target history in a plain list instead of growing a DataFrame
    # with concat on every step.
    history = df[target].tolist()
    row = df.iloc[-1:].copy()
    forecasts = []

    for step in range(1, days + 1):
        next_date = last_date + pd.Timedelta(days=step)
        row["date"] = next_date

        # Refresh date-derived features so they describe the forecast day
        # rather than the last historical day. Only columns already present
        # in the data are touched.
        # NOTE(review): conventions assumed here are Monday=0 for dayofweek
        # and Saturday/Sunday as weekend; confirm they match how the training
        # features were built.
        calendar = {
            "month": next_date.month,
            "dayofweek": next_date.dayofweek,
            "dayofmonth": next_date.day,
            "quarter": (next_date.month - 1) // 3 + 1,
            "is_weekend": 1 if next_date.weekday() >= 5 else 0,
        }
        for col, value in calendar.items():
            if col in known_cols:
                row[col] = value

        # Update lags and rolling stats from the observed + predicted history.
        recent = pd.Series(history[-max_lookback:], dtype="float64")
        for lag in lags:
            col = f"{target}_lag_{lag}"
            if col in known_cols:
                # Fall back to the mean when the history is shorter than the lag.
                row[col] = recent.iloc[-lag] if len(recent) >= lag else recent.mean()
        for win in windows:
            roll_vals = recent.iloc[-win:]
            mean_col = f"{target}_roll_mean_{win}"
            std_col = f"{target}_roll_std_{win}"
            if mean_col in known_cols:
                row[mean_col] = roll_vals.mean()
            if std_col in known_cols:
                row[std_col] = roll_vals.std(ddof=0)

        pred = float(model.predict(row[feature_cols])[0])
        row[target] = pred
        history.append(pred)

        forecasts.append({"date": next_date.strftime("%Y-%m-%d"), "predicted": pred})

    return pd.DataFrame(forecasts)



Backtesting usage_storage...
Saved metrics for usage_storage to ./results/usage_storage_backtest_metrics.csv and fine-tuned model updated.
