In [None]:
import pandas as pd
import pyodbc
import numpy as np
import joblib
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pathlib import Path
warnings.filterwarnings("ignore")

# ---------------------------
# Load Data from SQL Server
# ---------------------------
def load_growth_data():
    df=pd.read_csv("C:/Users/vpotla/Desktop/Testing-SQL-Server/data.csv")
    # Create Date column
    df["Date"] = pd.to_datetime(df["yr"].astype(str) + "-" + df["mn"].astype(str) + "-01")
    df = df.sort_values("Date")
    
    # Aggregate (server+db usage % over time)
    group = df.groupby("Date")["per"].mean().reset_index()
    group = group.set_index("Date").asfreq("MS")  # monthly series
    group["per"] = group["per"].fillna(method="ffill")
    
    return group


# ---------------------------
# Model Selection (ARIMA)
# ---------------------------
def select_best_arima(series, p_range=(0, 8), d_range=(0, 3), q_range=(0, 8)):
    best_aic = float("inf")
    best_order = None
    best_model = None
    
    for p in range(*p_range):
        for d in range(*d_range):
            for q in range(*q_range):
                try:
                    model = ARIMA(series, order=(p, d, q)).fit()
                    if model.aic < best_aic:
                        best_aic = model.aic
                        best_order = (p, d, q)
                        best_model = model
                except:
                    continue
    
    return best_model, best_order, best_aic


# ---------------------------
# Save Only Model Params + Metadata
# ---------------------------
def save_model_metadata(model, order, aic, mse, mae, filename="best_model_meta.pkl"):
    metadata = {
        "order": order,
        "aic": aic,
        "mse": mse,
        "mae": mae,
        "params": model.params.to_dict()  # save only coefficients
    }
    joblib.dump(metadata, filename)
    print(f"Model metadata saved as {filename}")


# ---------------------------
# Main Execution
# ---------------------------
if __name__ == "__main__":
    # Load data
    df = load_growth_data()
    series = df["per"]

    # Split Train/Test (last 12 months as test)
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]
    
    # Run ARIMA selection
    best_model, best_order, best_aic = select_best_arima(
        train, p_range=(0, 13), d_range=(0, 4), q_range=(0, 13)
    )
    print(f"Best ARIMA order={best_order}, AIC={best_aic}")
    
    # Forecast on test set
    forecast = best_model.forecast(steps=len(test))
    mse = mean_squared_error(test, forecast)
    mae = mean_absolute_error(test, forecast)
    print(f"Test MSE={mse}, MAE={mae}")
    
    # Save only metadata (not full training data)
    save_model_metadata(best_model, best_order, best_aic, mse, mae, "best_arima_meta2.pkl")


In [20]:
import pandas as pd
import pyodbc
import numpy as np
import joblib
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pathlib import Path
warnings.filterwarnings("ignore")

# ---------------------------
# Load Data from SQL Server
# ---------------------------
def load_growth_data():
    df=pd.read_csv("C:/Users/vpotla/Desktop/Testing-SQL-Server/data.csv")
    # Create Date column
    df["Date"] = pd.to_datetime(df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-01")
    df = df.sort_values("Date")
    
    # Aggregate (server+db usage % over time)
    group = df.groupby("Date")["Growth%"].mean().reset_index()
    group = group.set_index("Date").asfreq("MS")  # monthly series
    group["Growth%"] = group["Growth%"].fillna(method="ffill")
    
    return group


# ---------------------------
# Model Selection (ARIMA)
# ---------------------------
def select_best_arima(series, p_range=(0, 8), d_range=(0, 3), q_range=(0, 8)):
    best_bic = float("inf")
    best_order = None
    best_model = None
    
    for p in range(*p_range):
        for d in range(*d_range):
            for q in range(*q_range):
                try:
                    model = ARIMA(series, order=(p, d, q)).fit()
                    if model.bic < best_bic:
                        best_bic = model.bic
                        best_order = (p, d, q)
                        best_model = model
                except:
                    continue
    
    return best_model, best_order, best_bic


# ---------------------------
# Save Only Model Params + Metadata
# ---------------------------
def save_model_metadata(model, order, bic, mse, mae, filename="best_model_meta.pkl"):
    metadata = {
        "order": order,
        "bic": bic,
        "mse": mse,
        "mae": mae,
        "params": model.params.to_dict()  # save only coefficients
    }
    joblib.dump(metadata, filename)
    print(f"Model metadata saved as {filename}")


# ---------------------------
# Main Execution
# ---------------------------
if __name__ == "__main__":
    # Load data
    df = load_growth_data()
    series = df["Growth%"]

    # Split Train/Test (last 12 months as test)
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]
    
    # Run ARIMA selection
    best_model, best_order, best_bic = select_best_arima(
        train, p_range=(0, 13), d_range=(0, 4), q_range=(0, 13)
    )
    print(f"Best ARIMA order={best_order}, BIC={best_bic}")
    
    # Forecast on test set
    forecast = best_model.forecast(steps=len(test))
    mse = mean_squared_error(test, forecast)
    mae = mean_absolute_error(test, forecast)
    print(f"Test MSE={mse}, MAE={mae}")
    
    # Save only metadata (not full training data)
    save_model_metadata(best_model, best_order, best_bic, mse, mae, "best_arima_meta_3.pkl")


Best ARIMA order=(8, 1, 0), BIC=-44.4686963471009
Test MSE=0.009488723926647888, MAE=0.07939722283935159
Model metadata saved as best_arima_meta_3.pkl


In [16]:
import pandas as pd
import numpy as np
import joblib
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error

warnings.filterwarnings("ignore")

# ---------------------------
# Load CSV Data
# ---------------------------
def load_growth_data(file_path):
    df = pd.read_csv(file_path)
    # Create a Date column
    df["Date"] = pd.to_datetime(df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-01")
    df = df.sort_values("Date")
    return df

# ---------------------------
# Select Best ARIMA per series
# ---------------------------
def select_best_arima(ts, p_range=(0,11), d_range=(0,4), q_range=(0,11)):
    best_aic = float("inf")
    best_order = None
    best_model = None
    
    for p in range(*p_range):
        for d in range(*d_range):
            for q in range(*q_range):
                try:
                    model = ARIMA(ts, order=(p,d,q)).fit()
                    if model.aic < best_aic:
                        best_aic = model.aic
                        best_order = (p,d,q)
                        best_model = model
                except:
                    continue
    return best_model, best_order, best_aic

# ---------------------------
# Forecast Growth% for a series
# ---------------------------
def forecast_growth(ts, months_ahead=6):
    ts = ts.set_index("Date")["Growth%"].asfreq("MS").fillna(method="ffill")
    
    if len(ts) < 12:  # Not enough data
        return None, None, None, None, None, None
    
    # Train/Test split
    train_size = int(len(ts) * 0.8)
    train, test = ts.iloc[:train_size], ts.iloc[train_size:]
    
    model, order, aic = select_best_arima(train)
    forecast = model.forecast(steps=months_ahead)
    
    if len(test) > 0:
        pred_test = model.forecast(steps=len(test))
        mse = mean_squared_error(test, pred_test)
        mae = mean_absolute_error(test, pred_test)
    else:
        mse = mae = None
    
    return forecast, model, order, aic, mse, mae

# ---------------------------
# Save Metadata
# ---------------------------
def save_model_metadata(model, server, database, order, aic, mse, mae):
    metadata = {
        "server": server,
        "database": database,
        "order": order,
        "aic": aic,
        "mse": mse,
        "mae": mae,
        "params": model.params.to_dict()
    }
    filename = f"{server}_{database}_arima_meta.pkl"
    joblib.dump(metadata, filename)
    print(f"Saved metadata: {filename}")

# ---------------------------
# Main Execution
# ---------------------------
def run_forecast(file_path, selected_server="All", selected_db="All", months_ahead=6):
    df = load_growth_data(file_path)
    results = []

    if selected_server.lower() == "all" and selected_db.lower() == "all":
        # Loop through all servers
        for server in df["ServerName"].unique():
            df_server = df[df["ServerName"]==server]
            # Aggregate all databases per server
            df_agg = df_server.groupby("Date", as_index=False).agg({"Growth%":"mean","Size_Used":"sum"})
            df_agg["ServerName"] = server
            df_agg["DatabaseName"] = "All Databases"

            forecast, model, order, aic, mse, mae = forecast_growth(df_agg, months_ahead)
            if forecast is not None:
                save_model_metadata(model, server, "All Databases", order, aic, mse, mae)
                results.append((server, "All Databases", forecast))

    elif selected_server.lower() != "all" and selected_db.lower() == "all":
        # One server, all its databases
        df_server = df[df["ServerName"]==selected_server]
        for db in df_server["DatabaseName"].unique():
            df_db = df_server[df_server["DatabaseName"]==db]
            forecast, model, order, aic, mse, mae = forecast_growth(df_db, months_ahead)
            if forecast is not None:
                save_model_metadata(model, selected_server, db, order, aic, mse, mae)
                results.append((selected_server, db, forecast))

    else:
        # One server, one database
        df_db = df[(df["ServerName"]==selected_server) & (df["DatabaseName"]==selected_db)]
        forecast, model, order, aic, mse, mae = forecast_growth(df_db, months_ahead)
        if forecast is not None:
            save_model_metadata(model, selected_server, selected_db, order, aic, mse, mae)
            results.append((selected_server, selected_db, forecast))
    
    return results

# ---------------------------
# Example Usage
# ---------------------------
if __name__ == "__main__":
    file_path = "C:/Users/vpotla/Desktop/Testing-SQL-Server/data.csv"
    forecasts = run_forecast(file_path, selected_server="Server1", selected_db="DB4", months_ahead=6)
    for server, db, forecast in forecasts:
        print(f"\nServer={server}, Database={db}")
        print(f"Forecast:\n{forecast}")
    forecasts = run_forecast(file_path, selected_server="Server1", selected_db="DB1", months_ahead=6)
    for server, db, forecast in forecasts:
        print(f"\nServer={server}, Database={db}")
        print(f"Forecast:\n{forecast}")
    forecasts = run_forecast(file_path, selected_server="Server1", selected_db="DB2", months_ahead=6)
    for server, db, forecast in forecasts:
        print(f"\nServer={server}, Database={db}")
        print(f"Forecast:\n{forecast}")
    forecasts = run_forecast(file_path, selected_server="Server1", selected_db="DB3", months_ahead=6)
    forecasts=run_forecast(file_path, months_ahead=6)
    for server, db, forecast in forecasts:
        print(f"\nServer={server}, Database={db}")
        print(f"Forecast:\n{forecast}")


Saved metadata: Server1_DB4_arima_meta.pkl

Server=Server1, Database=DB4
Forecast:
2024-09-01    0.444775
2024-10-01    0.477573
2024-11-01    0.225528
2024-12-01    0.685664
2025-01-01    1.002587
2025-02-01    0.536390
Freq: MS, Name: predicted_mean, dtype: float64
Saved metadata: Server1_DB1_arima_meta.pkl

Server=Server1, Database=DB1
Forecast:
2024-09-01    0.201671
2024-10-01    0.241726
2024-11-01    0.261076
2024-12-01    0.287871
2025-01-01    0.356677
2025-02-01    0.306891
Freq: MS, Name: predicted_mean, dtype: float64
Saved metadata: Server1_DB2_arima_meta.pkl

Server=Server1, Database=DB2
Forecast:
2024-09-01    0.549140
2024-10-01    0.502093
2024-11-01    0.199159
2024-12-01    0.457496
2025-01-01    0.424796
2025-02-01    0.213489
Freq: MS, Name: predicted_mean, dtype: float64
Saved metadata: Server1_DB3_arima_meta.pkl
Saved metadata: Server1_All Databases_arima_meta.pkl

Server=Server1, Database=All Databases
Forecast:
2024-08-01    0.587667
2024-09-01    0.457128
2024

In [15]:
import pandas as pd
import numpy as np
import joblib
import warnings
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pathlib import Path

warnings.filterwarnings("ignore")

# ---------------------------
# Load CSV Data
# ---------------------------
def load_growth_data(file_path):
    df = pd.read_csv(file_path)
    df["Date"] = pd.to_datetime(df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-01")
    df = df.sort_values("Date")
    return df

# ---------------------------
# Select Best ARIMA per series
# ---------------------------
def select_best_arima(ts, p_range=(0,4), d_range=(0,2), q_range=(0,4)):
    best_aic = float("inf")
    best_order = None
    best_model = None

    for p in range(*p_range):
        for d in range(*d_range):
            for q in range(*q_range):
                try:
                    model = ARIMA(ts, order=(p,d,q)).fit()
                    if model.aic < best_aic:
                        best_aic = model.aic
                        best_order = (p,d,q)
                        best_model = model
                except:
                    continue
    return best_model, best_order, best_aic

# ---------------------------
# Forecast Function
# ---------------------------
def forecast_growth(df, months_ahead=6):
    """
    df: DataFrame containing 'Date', 'Growth%', 'Size_Used', 'ServerName', 'DatabaseName'
    Returns: forecast_df (with historical+forecast) and metadata dict
    """
    forecast_list = []
    metadata_list = []

    # Aggregate combinations
    servers = df['ServerName'].unique()
    for server in servers:
        df_server = df[df['ServerName']==server]

        # Decide databases to process
        databases = df_server['DatabaseName'].unique()
        for db in databases:
            df_db = df_server[df_server['DatabaseName']==db].sort_values("Date").copy()

            # Skip if not enough data
            if len(df_db) < 12:
                print(f"Skipping {server}-{db}: not enough data")
                continue

            # Prepare time series
            ts = df_db.set_index("Date")["Growth%"].asfreq("MS").fillna(method="ffill")

            # Train/Test split
            train_size = int(len(ts)*0.8)
            train, test = ts.iloc[:train_size], ts.iloc[train_size:]

            # Select ARIMA
            model, order, aic = select_best_arima(train)

            # Forecast
            forecast = model.forecast(steps=months_ahead)

            # Calculate metrics on test
            if len(test) > 0:
                pred_test = model.forecast(steps=len(test))
                mse = mean_squared_error(test, pred_test)
                mae = mean_absolute_error(test, pred_test)
            else:
                mse = mae = None

            # Save metadata
            metadata = {
                "server": server,
                "database": db,
                "order": order,
                "aic": aic,
                "mse": mse,
                "mae": mae,
                "params": model.params.to_dict()
            }
            filename = Path(f"{server}_{db}_arima_meta.pkl")
            joblib.dump(metadata, filename)
            metadata_list.append(metadata)

            # Build forecast DataFrame
            future_idx = pd.date_range(ts.index[-1]+pd.offsets.MonthBegin(1), periods=months_ahead, freq="MS")
            forecast_df = pd.DataFrame({
                "Date": future_idx,
                "Growth%": forecast.values,
                "ServerName": server,
                "DatabaseName": db,
                "Type": "Forecast"
            })
            forecast_df["Year"] = forecast_df["Date"].dt.year
            forecast_df["Month"] = forecast_df["Date"].dt.month

            # Compute Size_Used
            last_used = df_db["Size_Used"].iloc[-1]
            used_forecast, current_used = [], last_used
            for perc in forecast_df["Growth%"]:
                growth_value = (perc/100.0)*current_used
                current_used += growth_value
                used_forecast.append(current_used)
            forecast_df["Size_Used"] = used_forecast

            # Combine with historical
            combined_df = pd.concat([df_db.assign(Type="Historical"), forecast_df])
            forecast_list.append(combined_df)

        # Optional: Aggregate all databases for this server
        df_server_agg = df_server.groupby("Date", as_index=False).agg({"Growth%":"mean","Size_Used":"sum"})
        df_server_agg["ServerName"] = server
        df_server_agg["DatabaseName"] = "All Databases"
        if len(df_server_agg) >= 12:
            ts_agg = df_server_agg.set_index("Date")["Growth%"].asfreq("MS").fillna(method="ffill")
            train_size = int(len(ts_agg)*0.8)
            train_agg = ts_agg.iloc[:train_size]
            model_agg, order_agg, aic_agg = select_best_arima(train_agg)
            forecast_agg = model_agg.forecast(steps=months_ahead)
            # Metrics
            test_agg = ts_agg.iloc[train_size:]
            if len(test_agg) > 0:
                pred_test = model_agg.forecast(steps=len(test_agg))
                mse = mean_squared_error(test_agg, pred_test)
                mae = mean_absolute_error(test_agg, pred_test)
            else:
                mse = mae = None
            # Save metadata
            metadata = {
                "server": server,
                "database": "All Databases",
                "order": order_agg,
                "aic": aic_agg,
                "mse": mse,
                "mae": mae,
                "params": model_agg.params.to_dict()
            }
            joblib.dump(metadata, Path(f"{server}_All_Databases_arima_meta.pkl"))
            metadata_list.append(metadata)
            # Forecast DF
            future_idx = pd.date_range(ts_agg.index[-1]+pd.offsets.MonthBegin(1), periods=months_ahead, freq="MS")
            forecast_df = pd.DataFrame({
                "Date": future_idx,
                "Growth%": forecast_agg.values,
                "ServerName": server,
                "DatabaseName": "All Databases",
                "Type": "Forecast"
            })
            forecast_df["Year"] = forecast_df["Date"].dt.year
            forecast_df["Month"] = forecast_df["Date"].dt.month
            last_used = df_server_agg["Size_Used"].iloc[-1]
            used_forecast, current_used = [], last_used
            for perc in forecast_df["Growth%"]:
                growth_value = (perc/100.0)*current_used
                current_used += growth_value
                used_forecast.append(current_used)
            forecast_df["Size_Used"] = used_forecast
            combined_df = pd.concat([df_server_agg.assign(Type="Historical"), forecast_df])
            forecast_list.append(combined_df)

    # Optional: Aggregate all servers + all databases
    df_total_agg = df.groupby("Date", as_index=False).agg({"Growth%":"mean","Size_Used":"sum"})
    df_total_agg["ServerName"] = "All Servers"
    df_total_agg["DatabaseName"] = "All Databases"
    if len(df_total_agg) >= 12:
        ts_total = df_total_agg.set_index("Date")["Growth%"].asfreq("MS").fillna(method="ffill")
        train_size = int(len(ts_total)*0.8)
        train_total = ts_total.iloc[:train_size]
        model_total, order_total, aic_total = select_best_arima(train_total)
        forecast_total = model_total.forecast(steps=months_ahead)
        # Metrics
        test_total = ts_total.iloc[train_size:]
        if len(test_total) > 0:
            pred_test = model_total.forecast(steps=len(test_total))
            mse = mean_squared_error(test_total, pred_test)
            mae = mean_absolute_error(test_total, pred_test)
        else:
            mse = mae = None
        # Save metadata
        metadata = {
            "server": "All Servers",
            "database": "All Databases",
            "order": order_total,
            "aic": aic_total,
            "mse": mse,
            "mae": mae,
            "params": model_total.params.to_dict()
        }
        joblib.dump(metadata, Path(f"All_Servers_All_Databases_arima_meta.pkl"))
        metadata_list.append(metadata)
        # Forecast DF
        future_idx = pd.date_range(ts_total.index[-1]+pd.offsets.MonthBegin(1), periods=months_ahead, freq="MS")
        forecast_df = pd.DataFrame({
            "Date": future_idx,
            "Growth%": forecast_total.values,
            "ServerName": "All Servers",
            "DatabaseName": "All Databases",
            "Type": "Forecast"
        })
        forecast_df["Year"] = forecast_df["Date"].dt.year
        forecast_df["Month"] = forecast_df["Date"].dt.month
        last_used = df_total_agg["Size_Used"].iloc[-1]
        used_forecast, current_used = [], last_used
        for perc in forecast_df["Growth%"]:
            growth_value = (perc/100.0)*current_used
            current_used += growth_value
            used_forecast.append(current_used)
        forecast_df["Size_Used"] = used_forecast
        combined_df = pd.concat([df_total_agg.assign(Type="Historical"), forecast_df])
        forecast_list.append(combined_df)

    return forecast_list, metadata_list

# ---------------------------
# Main Execution Example
# ---------------------------
if __name__ == "__main__":
    file_path = "C:/Users/vpotla/Desktop/Testing-SQL-Server/data.csv"
    df = load_growth_data(file_path)
    forecasts, metadata = forecast_growth(df, months_ahead=6)
    print("Forecasts and metadata generated for all selections.")


Forecasts and metadata generated for all selections.
