# Import library

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
import matplotlib.pyplot as plt
import joblib
import os
import pickle

# Preprocessing data

## Read data to dataframe

In [None]:
# Path of the Excel workbook that holds the input metrics table.
output_file_path_final = 'service_metrics.xlsx'
# Load the workbook into a DataFrame.
df = pd.read_excel(output_file_path_final)

In [None]:
# Show the first rows as a quick check of the loaded columns and values.
print(df.head())

## Label and create lookback

In [None]:
# Service names in label order: the first name gets label 1, the next 2, ...
_service_names = [
    "emailservice",
    "checkoutservice",
    "recommendationservice",
    "frontend",
    "paymentservice",
    "productcatalogservice",
    "cartservice",
    "redis-cart",
    "currencyservice",
    "shippingservice",
    "adservice",
]

# Mapping Service Label -> numeric label (1-based)
service_mapping = {
    name: number for number, name in enumerate(_service_names, start=1)
}

# Attach the numeric label; names missing from the mapping become NaN.
df['Service Label Number'] = df['Service Label'].map(service_mapping)

def prepare_dataframe(df, n_steps):
    """Add lagged CPU columns per service and drop incomplete rows.

    Rows are grouped by 'Service Label Number'. Each group is sorted by
    'Timestamp', re-indexed on 'Timestamp', and given the columns
    'Close(t-1)' ... 'Close(t-n_steps)' holding the previous values of
    'Total CPU (m)' within that same group.

    Args:
        df: DataFrame with at least the columns 'Service Label Number',
            'Timestamp' and 'Total CPU (m)'.
        n_steps: Number of lag columns to create.

    Returns:
        The per-service frames concatenated in label order and indexed by
        'Timestamp'. Rows containing NaN in any column are removed. When
        ``df`` yields no groups (it is empty, or every label is NaN) an
        empty frame with the same columns is returned instead of raising.
    """
    def _add_lags(frame):
        # Order by time so that shift(i) means "i readings earlier".
        frame = frame.sort_values('Timestamp').set_index('Timestamp')

        # Create lookback cols
        for i in range(1, n_steps + 1):
            frame[f'Close(t-{i})'] = frame['Total CPU (m)'].shift(i)

        # Remove the rows left incomplete by the shift. dropna() checks every
        # column, so rows with NaN in other columns are removed as well.
        return frame.dropna()

    # Lags are built per service so one service's history never fills
    # another service's lag columns.
    dataframes = [
        _add_lags(group) for _, group in df.groupby('Service Label Number')
    ]

    if not dataframes:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame that still carries the expected index and lag columns.
        return _add_lags(df.iloc[0:0])

    return pd.concat(dataframes)

# Set lookback value: the number of lag columns prepare_dataframe creates.
lookback = 5
shifted_df = prepare_dataframe(df, lookback)

## Save data to new_file

In [None]:
# Lag offsets 1..lookback. Deriving the column lists from `lookback` keeps
# them in step with the 'Close(t-i)' columns created by prepare_dataframe
# instead of hard-coding exactly five lags.
lag_steps = range(1, lookback + 1)

# List required cols
required_columns = (
    ['Timestamp', 'Service Label Number', 'Total CPU (m)']
    + [f'Close(t-{i})' for i in lag_steps]
    + ['Pod Count']
)

# Get cols from DataFrame ('Timestamp' is the index of shifted_df, so it is
# restored as a regular column first).
final_df = shifted_df.reset_index()[required_columns]

# Change name of cols (same order as required_columns)
final_df.columns = (
    ['Timestamp', 'Label', 'CPU(t)']
    + [f'CPU(t-{i})' for i in lag_steps]
    + ['Current Pods']
)

# Save file to xlsx, ordered by time across all services
df_sorted = final_df.sort_values(by='Timestamp', ascending=True).reset_index(drop=True)
output_file_path_final = 'final_service_metrics.xlsx'
df_sorted.to_excel(output_file_path_final, index=False)

## Read new data to new dataframe

In [None]:
# Path of the lagged dataset to train on.
# NOTE(review): the "Save data to new_file" cell writes
# 'final_service_metrics.xlsx' to the working directory, while this path looks
# like a Colab Google Drive mount - presumably the file is copied there
# separately; confirm that both refer to the same data.
output_file_path_final = '/content/drive/MyDrive/KLTN_2024/Dataset/service_metrics/final_service_metrics.xlsx'
new_df = pd.read_excel(output_file_path_final)

## Feature/Target and split data

In [None]:
# Move Timestamp and Label into the index so only CPU columns remain as data.
new_df.set_index(['Timestamp', 'Label'], inplace=True)

# X: the five lagged CPU readings; y: the current CPU reading.
lag_feature_names = [f'CPU(t-{k})' for k in range(1, 6)]
X = new_df[lag_feature_names]
y = new_df['CPU(t)']

# Split train/test without shuffling: the last 20% of rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Standardize data: statistics are learned from the training rows only and
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Save scaler

In [None]:
# Write the fitted StandardScaler to 'data_scaler.pkl' in the working directory.
joblib.dump(scaler, 'data_scaler.pkl')

## Define model ANN

In [None]:
# Function that builds the ANN model
def build_ann_model(hidden_units, learning_rate):
    """Create and compile a dense regression network.

    Args:
        hidden_units: Units in the first hidden layer; the second hidden
            layer uses ``hidden_units // 2`` units.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with MSE loss and an MAE metric.
        The input width is read from the global ``X_train``.
    """
    n_features = X_train.shape[1]

    network = Sequential()
    network.add(Dense(hidden_units, input_dim=n_features, activation='relu'))
    network.add(Dense(hidden_units // 2, activation='relu'))
    network.add(Dense(1))  # Output layer

    optimizer = Adam(learning_rate=learning_rate)
    network.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return network

## Tuning model

In [None]:
# Create the directory and path used to save the best model
os.makedirs("best_model", exist_ok=True)
model_path = "best_model/best_ann_model.h5"

# GridSearchCV parameters (3 x 2 x 2 x 1 = 12 combinations)
param_grid_ann = {
    'hidden_units': [32, 64, 128],
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64],
    'epochs': [50]
}

# Wrapper for GridSearchCV.
# SciKeras only accepts parameters in set_params() that were declared on the
# wrapper, so the two build_ann_model arguments are declared here; without
# them GridSearchCV fails with "Invalid parameter hidden_units". The values
# are placeholders that the grid overrides for every candidate.
# `model=` replaces the deprecated `build_fn=` keyword.
model_ann = KerasRegressor(
    model=build_ann_model,
    hidden_units=param_grid_ann['hidden_units'][0],
    learning_rate=param_grid_ann['learning_rate'][0],
    verbose=1,
)

# GridSearchCV: 3-fold CV scored by negative MAE (higher is better)
grid_ann = GridSearchCV(
    estimator=model_ann,
    param_grid=param_grid_ann,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=0,
)
grid_ann.fit(X_train, y_train)

# Find best param; the score is negated back into a positive MAE
best_params = grid_ann.best_params_
best_score = -grid_ann.best_score_
print(f"Best ANN Params: {best_params}")
print(f"Best ANN Score: {best_score}")

## Save best_model

In [None]:
# Save the best model after GridSearch.
# `best_estimator_.model_` is the Keras model held by the best estimator.
# NOTE: this save runs before the additional `best_model.fit(...)` in the
# Train cell, so the saved file does not include that extra training.
best_model = grid_ann.best_estimator_.model_
best_model.save(model_path)
print(f"Best model saved to {model_path}")

## Train

In [None]:
# Train the best model again on the training split, reusing the epochs and
# batch size chosen by the grid search, and keep the per-epoch history.
# NOTE(review): the test split is passed as validation data, so the validation
# curve and the later test metrics are computed on the same rows.
history_ann = best_model.fit(
    X_train,
    y_train,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    validation_data=(X_test, y_test),
    verbose=1,
)

## Evaluate

In [None]:
# Evaluate on the test split; evaluate() returns the loss and the MAE metric.
loss, mae = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test MAE: {mae}")

# Visualize the results: MAE per epoch for the training and validation data
import matplotlib.pyplot as plt
for history_key, curve_label in (
    ('mae', 'Train MAE'),
    ('val_mae', 'Validation MAE'),
):
    plt.plot(history_ann.history[history_key], label=curve_label)
plt.title('Training and Validation MAE')
plt.xlabel('Epochs')
plt.ylabel('Mean Absolute Error')
plt.legend()
plt.show()

In [None]:
# Predict with the ANN model (the network is built from Dense layers only,
# so it is labelled ANN rather than RNN).
y_pred_ann = best_model.predict(X_test)

# Compute MAE, MSE, RMSE on the test split
mae_ann = mean_absolute_error(y_test, y_pred_ann)
mse_ann = mean_squared_error(y_test, y_pred_ann)
rmse_ann = math.sqrt(mse_ann)

# Print the results
print(f"ANN - MAE: {mae_ann}")
print(f"ANN - MSE: {mse_ann}")
print(f"ANN - RMSE: {rmse_ann}")

In [None]:
# Predict with the fitted Keras model. `model_ann` is only the template passed
# to GridSearchCV, which fits clones of it, so calling predict() on `model_ann`
# itself fails because it was never fitted.
# ravel() flattens the (n, 1) output of the final Dense(1) layer to 1-D.
y_pred_ann = best_model.predict(X_test).ravel()

# Plot Actual vs Predicted (ANN) for the first 200 test samples
plt.figure(figsize=(12, 6))
plt.plot(y_test.values[:200], label='Actual', color='blue')
plt.plot(y_pred_ann[:200], label='ANN Predicted', linestyle='--', color='orange')
plt.title('Actual vs ANN dự đoán')
plt.xlabel('Mẫu')
plt.ylabel('CPU sử dụng')
plt.legend()
plt.grid()
plt.show()