In [None]:
# train.ipynb
# -------------------
# Training notebook for predictive autoscaler

# 1️⃣ Imports
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from feature_pipeline import create_lags  # your custom feature function

# 2️⃣ Load metrics
# Read the raw metrics export; the 'timestamp' column is parsed into
# datetimes so it can be handled as a time axis rather than plain strings.
# NOTE(review): the later lag creation and the unshuffled train/test split
# presumably rely on rows being in chronological order -- confirm that
# metrics.csv is written sorted by timestamp.
data = pd.read_csv(
    'metrics.csv',
    parse_dates=['timestamp'],
)
print("✅ Metrics loaded:")
display(data.head(n=5))  # preview the first five rows in the notebook

# 3️⃣ Create lag features
# Metric columns handed to create_lags() from feature_pipeline.
# NOTE(review): create_lags is project-local; the number of lags and how the
# first rows (which have no history) are treated are decided there -- verify.
columns_to_lag = [
    'cpu_usage',
    'memory_usage',
    'requests_per_second',
]
data = create_lags(data, columns=columns_to_lag)
print("✅ Lag features created:")
display(data.head(n=5))  # preview the frame returned by create_lags

# 4️⃣ Split features and target
# Target: the replica count. Features: every remaining column except the
# timestamp itself.
y = data['replicas']
X = data.drop(columns=['timestamp', 'replicas'])

# Time-series hold-out: shuffling is disabled so the test set is the trailing
# 20% of rows rather than a random sample of them.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)
print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# 5️⃣ Train model
# Hyperparameters are collected in one place; random_state is fixed so that
# repeated runs on the same data are reproducible.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'random_state': 42,
}
model = XGBRegressor(**xgb_params)

# Fit on the training portion only; the held-out rows are not seen here.
model.fit(X_train, y_train)
print("✅ Model trained")

# 6️⃣ Evaluate model
# Score the held-out rows and report the mean squared error between the
# predicted and the recorded replica counts.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error on test set: {mse:.4f}")

# 7️⃣ Save model
# Persist the fitted model with joblib. The target directory is created
# first (including missing parents) because joblib.dump does not create it
# and would otherwise fail after training has already completed.
# exist_ok=True makes re-running this cell safe.
model_path = '../ml_predictor/model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)
print(f"✅ Model saved as '{model_path}'")