# Preprocessing data

## Filter file

In [None]:
import glob
import csv
import os
import pandas as pd

path = '/content/drive/MyDrive/KLTN_2024/Dataset/fastStorage/*.csv'
output_path = "/content/drive/MyDrive/KLTN_2024/Dataset/folder_fast/"


def filter_vm_file(fname, output_dir):
    """Convert one raw ';'-delimited trace file into a filtered CSV.

    The first line of the file is treated as a header and skipped. From
    every remaining row, column 0 is read as the timestamp, column 1 as the
    number of CPU cores and column 4 as the CPU usage in percent. The
    output has the columns ``Timestamp``, ``Label`` and ``CPU (m)``, where
    ``Label`` is the file name up to its first dot and
    ``CPU (m) = usage % * cores * 10`` (presumably millicores; confirm
    against the dataset description).

    Args:
        fname: Path of the raw input file.
        output_dir: Directory that receives a CSV with the same file name.

    Returns:
        True when an output file was written, False when the input was
        skipped (no data rows, or the first data row reports 0 cores).
    """
    base_name = os.path.basename(fname)
    label = base_name.split('.')[0]

    timestamp_list = []
    cpu_core_list = []
    cpu_usage_list = []

    # newline='' is what the csv module expects for files it parses.
    with open(fname, 'r', newline='') as infh:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(infh, None)
        for row in csv.reader(infh, delimiter=';'):
            if not row:
                # A blank line yields an empty row; there is nothing to parse.
                continue
            timestamp_list.append(int(row[0]))
            cpu_core_list.append(int(row[1]))
            cpu_usage_list.append(float(row[4]))

    if not timestamp_list:
        print(f"Skip this {fname} file because it has no data rows")
        return False

    print(cpu_core_list[0])

    # Only the first row is inspected to decide whether the VM has no cores.
    if cpu_core_list[0] == 0:
        print(f"Skip this {fname} file because CPU core = 0")
        return False

    df = pd.DataFrame({
        'Timestamp': timestamp_list,
        'CPU Cores': cpu_core_list,
        'CPU Usage (%)': cpu_usage_list,
        'Label': [label] * len(timestamp_list),
    })

    df['CPU (m)'] = df['CPU Usage (%)'] * df['CPU Cores'] * 10
    df = df.drop(columns=['CPU Usage (%)', 'CPU Cores'])

    # Passing a path lets pandas open and close the file itself, so no
    # handle is left open, and the message below shows the real location.
    out_file = os.path.join(output_dir, base_name)
    df.to_csv(out_file, index=False)
    print(f"Data saved to {out_file}")
    return True


# Convert every raw trace file and count how many were actually written.
count = 0
for fname in glob.glob(path):
    if filter_vm_file(fname, output_path):
        count += 1
print('Sum of files hanled: ' + str(count))

## Concatenate all files into one

In [None]:
import os
import pandas as pd
import glob

output_path = "/content/drive/MyDrive/KLTN_2024/Dataset/folder_fast/"

# Collect every CSV in the folder, ordered by path.
csv_files = glob.glob(os.path.join(output_path, '*.csv'))
csv_files.sort()
print(f"Found {len(csv_files)} file CSV.")

# Load each file and stack the frames under a fresh 0..n-1 row index.
df_list = list(map(pd.read_csv, csv_files))
combined_df = pd.concat(df_list, ignore_index=True)

# Persist the combined table.
combined_df.to_csv('/content/drive/MyDrive/KLTN_2024/Dataset/1147_vm_metric.csv', index=False)

# Quick sanity check: first rows and overall size.
print(combined_df.head())
print(f"Data has {combined_df.shape[0]} rows và {combined_df.shape[1]} cols.")

## Sort and create history data 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Location of the combined metric table that the rest of this cell works on.
file_path = '/content/drive/MyDrive/KLTN_2024/Dataset/1147_vm_metric.csv'

# Load it with pandas' default CSV settings (comma separated, header row).
df = pd.read_csv(filepath_or_buffer=file_path)

def prepare_dataframe(df, n_steps):
    """Add lagged copies of ``CPU (m)`` to every ``Label`` group.

    Each group is sorted by ``Timestamp``, indexed by it, and extended with
    the columns ``Close(t-1)`` ... ``Close(t-n_steps)``, where
    ``Close(t-i)`` holds the ``CPU (m)`` value found ``i`` rows earlier in
    the same group. Rows containing any NaN are dropped, which includes the
    first ``n_steps`` rows of every group.

    Args:
        df: Frame with at least the columns ``Timestamp``, ``Label`` and
            ``CPU (m)``. It is not modified.
        n_steps: Number of lag columns to create.

    Returns:
        The processed groups concatenated in group-key order, indexed by
        ``Timestamp``. When ``df`` yields no groups (for example because it
        is empty) an empty frame with the same columns is returned instead
        of raising.
    """
    def _with_lags(frame):
        # Sort by time first so that shift(i) means "i samples earlier".
        ordered = frame.sort_values('Timestamp').set_index('Timestamp')

        # Create the lookback columns.
        for i in range(1, n_steps + 1):
            ordered[f'Close(t-{i})'] = ordered['CPU (m)'].shift(i)

        # Drop the rows that shift() left incomplete.
        return ordered.dropna()

    dataframes = [_with_lags(group) for _, group in df.groupby('Label')]

    if not dataframes:
        # pd.concat([]) raises ValueError; build the empty result with the
        # same index name and columns a non-empty result would have.
        return _with_lags(df.iloc[0:0])

    return pd.concat(dataframes)

# Number of past samples kept as features for every row.
lookback = 5
shifted_df = prepare_dataframe(df, lookback)

# Derive the lag column names from `lookback` so that changing it above
# does not leave stale, hard-coded column lists behind.
lag_steps = range(1, lookback + 1)

# Columns to keep, using the names produced by prepare_dataframe.
required_columns = ['Timestamp', 'Label', 'CPU (m)'] + [f'Close(t-{i})' for i in lag_steps]

# Turn the Timestamp index back into a column and select those columns.
final_df = shifted_df.reset_index()[required_columns]

# Rename the target and lag columns to CPU(t), CPU(t-1), ...
final_df.columns = ['Timestamp', 'Label', 'CPU(t)'] + [f'CPU(t-{i})' for i in lag_steps]

# Save a copy ordered by timestamp as CSV. final_df itself keeps the row
# order returned by prepare_dataframe.
df_sorted = final_df.sort_values(by='Timestamp', ascending=True).reset_index(drop=True)
output_file_path_final = '/content/drive/MyDrive/KLTN_2024/Dataset/final_data_vm_metric.csv'
df_sorted.to_csv(output_file_path_final, index=False)

## Set index

In [None]:
# Move Timestamp and Label into a two-level row index, leaving only the
# CPU columns as data columns.
final_df = final_df.set_index(keys=['Timestamp', 'Label'])

## Feature/Target and split datasets

In [None]:
# Features are every lag column present in the frame, in frame order; the
# target is the current value. Selecting by prefix keeps this cell valid
# when the number of lag columns changes.
feature_columns = [col for col in final_df.columns if col.startswith('CPU(t-')]
X = final_df[feature_columns]
y = final_df['CPU(t)']

# Split train/test without shuffling: the last 20% of the rows, in their
# current order, become the test set.
# NOTE(review): the rows are in whatever order final_df was built in -
# confirm that this is the intended hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

## Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Reshape input for LSTM and RNN

In [None]:
# Insert a length-1 axis at position 1; for 2-D input of shape
# (samples, features) the result is (samples, 1, features).
X_train_reshape = np.asanyarray(X_train)[:, np.newaxis, ...]
X_test_reshape = np.asanyarray(X_test)[:, np.newaxis, ...]

# Training and Tuning

## Define model

In [None]:
!pip install scikeras
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import GridSearchCV

def build_ann_model(hidden_units, learning_rate):
    """Return a compiled fully connected regression network.

    The network has two ReLU hidden layers (``hidden_units`` and
    ``hidden_units // 2`` units) followed by a single linear output unit.
    The input width is read from the module-level ``X_train``.

    Args:
        hidden_units: Width of the first hidden layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A Keras ``Sequential`` model compiled with MSE loss and an MAE metric.
    """
    n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(hidden_units, input_dim=n_features, activation='relu'))
    model.add(Dense(hidden_units // 2, activation='relu'))
    model.add(Dense(1))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

def build_lstm_model(hidden_units, learning_rate):
    """Return a compiled two-layer LSTM regression network.

    The first LSTM layer (``hidden_units`` units) returns the full
    sequence and is followed by 20% dropout. The second LSTM layer
    (``hidden_units // 2`` units) returns only its last output, which feeds
    a single linear output unit. The input shape is read from the
    module-level ``X_train_reshape`` (its axes 1 and 2).

    Args:
        hidden_units: Number of units in the first LSTM layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A Keras ``Sequential`` model compiled with MSE loss and an MAE metric.
    """
    timesteps, n_features = X_train_reshape.shape[1], X_train_reshape.shape[2]

    model = Sequential()
    model.add(LSTM(hidden_units, input_shape=(timesteps, n_features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(hidden_units // 2, return_sequences=False))
    model.add(Dense(1))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

def build_rnn_model(hidden_units, learning_rate):
    """Return a compiled two-layer SimpleRNN regression network.

    The first SimpleRNN layer (``hidden_units`` units) returns the full
    sequence and is followed by 20% dropout. The second SimpleRNN layer
    (``hidden_units // 2`` units) returns only its last output, which feeds
    a single linear output unit. The input shape is read from the
    module-level ``X_train_reshape`` (its axes 1 and 2).

    Args:
        hidden_units: Number of units in the first recurrent layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A Keras ``Sequential`` model compiled with MSE loss and an MAE metric.
    """
    timesteps, n_features = X_train_reshape.shape[1], X_train_reshape.shape[2]

    model = Sequential()
    model.add(SimpleRNN(hidden_units, input_shape=(timesteps, n_features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(SimpleRNN(hidden_units // 2, return_sequences=False))
    model.add(Dense(1))

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    return model

## ANN => [64, 0.01, 32, 50]

In [None]:
# Candidate values tried for the ANN; every combination is evaluated.
param_grid_ann = {
    'hidden_units': [32, 64, 128],
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64],
    'epochs': [20, 50]
}

# Wrapper for ANN. The tunable names must be declared on the wrapper so the
# grid search can set them, but each needs a single value, not the whole
# candidate list; the first candidate is used as the default.
ann_defaults = {name: values[0] for name, values in param_grid_ann.items()}
model_ann = KerasRegressor(model=build_ann_model, verbose=1, **ann_defaults)

# 3-fold search scored by negative MAE. GridSearchCV fits clones, so
# model_ann itself stays unfitted; use grid_ann for predictions.
grid_ann = GridSearchCV(estimator=model_ann, param_grid=param_grid_ann, scoring='neg_mean_absolute_error', cv=3)
grid_ann.fit(X_train, y_train)
print(f"Best ANN Params: {grid_ann.best_params_}")
# The score is a negated MAE; flip the sign to report the MAE itself.
print(f"Best ANN Score: {-grid_ann.best_score_}")

## RNN => [64, 0.001, 64, 50]

In [None]:
# Candidate values tried for the RNN; every combination is evaluated.
param_grid_rnn = {
    'hidden_units': [32, 64],
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64],
    'epochs': [20, 50]
}

# Wrapper for RNN. The tunable names must be declared on the wrapper so the
# grid search can set them, but each needs a single value, not the whole
# candidate list; the first candidate is used as the default.
rnn_defaults = {name: values[0] for name, values in param_grid_rnn.items()}
model_rnn = KerasRegressor(model=build_rnn_model, verbose=1, **rnn_defaults)

# 3-fold search scored by negative MAE. GridSearchCV fits clones, so
# model_rnn itself stays unfitted; use grid_rnn for predictions.
grid_rnn = GridSearchCV(estimator=model_rnn, param_grid=param_grid_rnn, scoring='neg_mean_absolute_error', cv=3)
grid_rnn.fit(X_train_reshape, y_train)
print(f"Best RNN Params: {grid_rnn.best_params_}")
# The score is a negated MAE; flip the sign to report the MAE itself.
print(f"Best RNN Score: {-grid_rnn.best_score_}")

## LSTM => [64, 0.001, 64, 20]

In [None]:
# Candidate values tried for the LSTM; every combination is evaluated.
param_grid_lstm = {
    'hidden_units': [32, 64],
    'learning_rate': [0.01, 0.001],
    'batch_size': [32, 64],
    'epochs': [20, 50]
}

# Wrapper for LSTM. The tunable names must be declared on the wrapper so
# the grid search can set them, but each needs a single value, not the
# whole candidate list; the first candidate is used as the default.
lstm_defaults = {name: values[0] for name, values in param_grid_lstm.items()}
model_lstm = KerasRegressor(model=build_lstm_model, verbose=1, **lstm_defaults)

# 3-fold search scored by negative MAE. GridSearchCV fits clones, so
# model_lstm itself stays unfitted; use grid_lstm for predictions.
grid_lstm = GridSearchCV(estimator=model_lstm, param_grid=param_grid_lstm, scoring='neg_mean_absolute_error', cv=3)
grid_lstm.fit(X_train_reshape, y_train)
print(f"Best LSTM Params: {grid_lstm.best_params_}")
# The score is a negated MAE; flip the sign to report the MAE itself.
print(f"Best LSTM Score: {-grid_lstm.best_score_}")

# Evaluation

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

# GridSearchCV trains clones of model_rnn, so model_rnn itself is never
# fitted. Predict through the search object instead: with the default
# refit=True it delegates to the best estimator refitted on all of the
# training data.
y_pred_rnn = grid_rnn.predict(X_test_reshape)

# Error metrics on the held-out test set.
mae_rnn = mean_absolute_error(y_test, y_pred_rnn)
mse_rnn = mean_squared_error(y_test, y_pred_rnn)
rmse_rnn = math.sqrt(mse_rnn)

print(f"RNN - MAE: {mae_rnn}")
print(f"RNN - MSE: {mse_rnn}")
print(f"RNN - RMSE: {rmse_rnn}")

In [None]:
# GridSearchCV trains clones of model_ann, so model_ann itself is never
# fitted. Predict through the search object instead: with the default
# refit=True it delegates to the best estimator refitted on all of the
# training data.
y_pred_ann = grid_ann.predict(X_test)

# Error metrics on the held-out test set.
mae_ann = mean_absolute_error(y_test, y_pred_ann)
mse_ann = mean_squared_error(y_test, y_pred_ann)
rmse_ann = math.sqrt(mse_ann)

print(f"ANN - MAE: {mae_ann}")
print(f"ANN - MSE: {mse_ann}")
print(f"ANN - RMSE: {rmse_ann}")

In [None]:
# GridSearchCV trains clones of model_lstm, so model_lstm itself is never
# fitted. Predict through the search object instead: with the default
# refit=True it delegates to the best estimator refitted on all of the
# training data.
y_pred_lstm = grid_lstm.predict(X_test_reshape)

# Error metrics on the held-out test set.
mae_lstm = mean_absolute_error(y_test, y_pred_lstm)
mse_lstm = mean_squared_error(y_test, y_pred_lstm)
rmse_lstm = math.sqrt(mse_lstm)

print(f"LSTM - MAE: {mae_lstm}")
print(f"LSTM - MSE: {mse_lstm}")
print(f"LSTM - RMSE: {rmse_lstm}")


# Visualize

## Visualize MAE MSE RMSE

In [None]:
# Metric names and the matching values for each model, in the same order.
metrics = ['MAE', 'MSE', 'RMSE']
rnn_metrics = [mae_rnn, mse_rnn, rmse_rnn]
ann_metrics = [mae_ann, mse_ann, rmse_ann]
lstm_metrics = [mae_lstm, mse_lstm, rmse_lstm]

# One group of bars per metric, centred on integer positions.
x = np.arange(len(metrics))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))

# Within every group: RNN to the left, ANN in the middle, LSTM to the right.
bar_series = (
    (-width, rnn_metrics, 'RNN'),
    (0, ann_metrics, 'ANN'),
    (width, lstm_metrics, 'LSTM'),
)
rects1, rects2, rects3 = (
    ax.bar(x + offset, values, width, label=name)
    for offset, values, name in bar_series
)

# Label the groups with the metric names, then title and legend.
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylabel('Giá trị')
ax.set_title('So sánh mô hình RNN, ANN, LSTM')
ax.legend()

plt.show()

## Visualize predict with actual data

In [None]:
### RNN
import matplotlib.pyplot as plt
# model_rnn is only the template that GridSearchCV cloned, so it is not
# fitted; the search object predicts with the refitted best estimator.
y_pred_rnn = grid_rnn.predict(X_test_reshape)

# Plot the first 200 test samples: actual values against RNN predictions.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values[:200], label='Actual', color='blue')
plt.plot(y_pred_rnn[:200], label='RNN Predicted', linestyle='--', color='purple')
plt.title('Actual vs RNN dự đoán')
plt.xlabel('Mẫu')
plt.ylabel('CPU Sử dụng')
plt.legend()
plt.grid()
plt.show()


In [None]:
# ANN
# model_ann is only the template that GridSearchCV cloned, so it is not
# fitted; the search object predicts with the refitted best estimator.
y_pred_ann = grid_ann.predict(X_test)

# Plot the first 200 test samples: actual values against ANN predictions.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values[:200], label='Actual', color='blue')
plt.plot(y_pred_ann[:200], label='ANN Predicted', linestyle='--', color='orange')
plt.title('Actual vs ANN dự đoán')
plt.xlabel('Mẫu')
plt.ylabel('CPU sử dụng')
plt.legend()
plt.grid()
plt.show()

In [None]:
## LSTM
# model_lstm is only the template that GridSearchCV cloned, so it is not
# fitted; the search object predicts with the refitted best estimator.
y_pred_lstm = grid_lstm.predict(X_test_reshape)

# Plot the first 200 test samples: actual values against LSTM predictions.
plt.figure(figsize=(12, 6))
plt.plot(y_test.values[:200], label='Actual', color='blue')
plt.plot(y_pred_lstm[:200], label='LSTM Predicted', linestyle='--', color='green')
plt.title('Actual vs LSTM dự đoán')
plt.xlabel('Mẫu')
plt.ylabel('CPU sử dụng')
plt.legend()
plt.grid()
plt.show()