In [2]:
import os
import random
import timeit
import pandas as pd
import numpy as np
import tempfile
import tensorflow as tf
import xgboost as xgb
import matplotlib.pyplot as plt
import tensorflow_model_optimization as tfmot
import keras.models as k_models

from xgboost import XGBClassifier
from xgboost import plot_importance
from scipy.sparse import csr_matrix, save_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tf_keras as keras
from tf_keras import activations
from tf_keras.models import Model, Sequential, load_model
from tf_keras.layers import Dense, Input, LSTM
from tf_keras.callbacks import EarlyStopping





In [3]:
# XGBoost

In [4]:
# Load the two per-station datasets. Further down, df_B is passed to
# assigning_set as the training/validation source and df_A as the test set.
df_A = pd.read_csv('Final_EVSE_A.csv')
df_B = pd.read_csv('Final_EVSE_B.csv')

def prepare_categorical_output(y, categories=None):
    """One-hot encode a Series of class labels.

    Parameters
    ----------
    y : pandas.Series
        Class labels, one per row.
    categories : array-like, optional
        Sorted list of class labels that defines the output columns. When
        omitted, the columns are derived from the labels present in ``y``
        (the original behaviour). Pass the same list for every data split so
        that column ``i`` always refers to the same class, even when a split
        happens not to contain every class.

    Returns
    -------
    numpy.ndarray
        Dense one-hot matrix with one row per label and one column per class.

    Raises
    ------
    ValueError
        Raised by ``OneHotEncoder`` when ``categories`` is given and ``y``
        contains a label that is not listed in it.
    """
    # OneHotEncoder expects a 2-D column, not a 1-D Series
    y = y.values.reshape(-1, 1)

    # One list of categories per input column; 'auto' lets the encoder infer them from `y`
    encoder_categories = 'auto' if categories is None else [np.asarray(categories)]
    encoder = OneHotEncoder(categories=encoder_categories, sparse_output=False)

    return encoder.fit_transform(y)

# Considering B charging station as training and A as testing

def assigning_set(df1, df2, train_fraction=0.8):
    """Build train/validation/test features and one-hot targets.

    ``df1`` is split per ``CSVNameFile`` group: the first ``train_fraction``
    of each group's rows go to the training set and the remaining rows to the
    validation set. ``df2`` is used as the test set in full.

    All three target matrices share one column layout, taken from the labels
    present in ``df1['multiclass']``. Encoding each split independently would
    shift the column indices whenever a split lacks one of the classes.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Source of the training and validation rows.
    df2 : pandas.DataFrame
        Source of the test rows.
    train_fraction : float, default 0.8
        Fraction of each ``CSVNameFile`` group assigned to the training set.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test, input_dim, output_dim)``
        where ``input_dim`` is the number of feature columns and ``output_dim``
        the number of distinct classes in ``df1``.

    Raises
    ------
    ValueError
        If ``df2`` contains a class label that never occurs in ``df1``.
    """
    # Columns that are identifiers or targets rather than model inputs
    non_feature_cols = ['CSVNameFile', 'status', 'multiclass']

    train_list = []
    val_list = []

    # Split inside every group so each capture file contributes to both sets
    for _, group in df1.groupby('CSVNameFile'):
        split_index = int(len(group) * train_fraction)
        train_list.append(group.iloc[:split_index])
        val_list.append(group.iloc[split_index:])

    train_df = pd.concat(train_list).reset_index(drop=True)
    val_df = pd.concat(val_list).reset_index(drop=True)

    # Single, sorted class list shared by every split (np.unique returns sorted values)
    class_labels = np.unique(df1['multiclass'])

    X_train = train_df.drop(columns=non_feature_cols)
    y_train = prepare_categorical_output(train_df['multiclass'], class_labels)

    X_val = val_df.drop(columns=non_feature_cols)
    y_val = prepare_categorical_output(val_df['multiclass'], class_labels)

    X_test = df2.drop(columns=non_feature_cols)
    y_test = prepare_categorical_output(df2['multiclass'], class_labels)

    input_dim = X_train.shape[1]
    output_dim = len(class_labels)

    return X_train, X_val, X_test, y_train, y_val, y_test, input_dim, output_dim

# df_B is passed as df1 (train/validation source) and df_A as df2 (test set)
X_train, X_val, X_test, y_train, y_val, y_test, input_dim, output_dim = assigning_set(df_B, df_A)

In [5]:
# Restore the saved XGBoost classifier from disk; it is the "original" model
# that the pruned variants are compared against later in the notebook.
original_xgb_model = xgb.XGBClassifier()
original_xgb_model.load_model('xgboost_optimization.json')

In [10]:
# Read the table of Optuna trials exported from the hyper-parameter search
file_path = 'xgboost_optimization_optuna_results.csv'
optuna_results = pd.read_csv(file_path)

# The best trial is the row with the largest objective `value`
best_trial = optuna_results.loc[optuna_results['value'].idxmax()]

# (XGBoost argument, Optuna column, cast to int?) in the order they should appear
_PARAM_SPEC = (
    ('colsample_bytree', 'params_colsample_bytree', False),
    ('gamma', 'params_gamma', False),
    ('learning_rate', 'params_learning_rate', False),
    ('max_depth', 'params_max_depth', True),
    ('min_child_weight', 'params_min_child_weight', True),
    ('n_estimators', 'params_n_estimators', True),
    ('reg_alpha', 'params_reg_alpha', False),
    ('reg_lambda', 'params_reg_lambda', False),
    ('subsample', 'params_subsample', False),
)

# Collect the winning hyper-parameters, converting the count-like ones to int
best_params = {
    arg_name: (int(best_trial[column]) if as_integer else best_trial[column])
    for arg_name, column, as_integer in _PARAM_SPEC
}

print(best_params)


{'colsample_bytree': 0.6744991072269428, 'gamma': 0.8603613221867412, 'learning_rate': 0.0735622218696743, 'max_depth': 5, 'min_child_weight': 9, 'n_estimators': 500, 'reg_alpha': 8.09074732075009, 'reg_lambda': 7.670181467583394, 'subsample': 0.6451802043106292}


In [12]:
# Importance score of every feature, as reported by the loaded model
importance = original_xgb_model.feature_importances_

# Pair the scores with the booster's feature names, rank them from most to
# least important, and add the running total of importance
importance_df = (
    pd.DataFrame({
        'Feature': original_xgb_model.get_booster().feature_names,
        'Importance': importance,
    })
    .sort_values(by='Importance', ascending=False)
    .assign(**{'Cumulative Importance': lambda ranked: ranked['Importance'].cumsum()})
)

# Show the ten highest-ranked features together with their cumulative share
print(importance_df.head(10))


                       Feature  Importance  Cumulative Importance
25      src2dst_stddev_piat_ms    0.670139               0.670139
6              src2dst_packets    0.106949               0.777087
20   bidirectional_min_piat_ms    0.087958               0.865045
21  bidirectional_mean_piat_ms    0.052028               0.917073
4        bidirectional_packets    0.021417               0.938490
3    bidirectional_duration_ms    0.020947               0.959437
23   bidirectional_max_piat_ms    0.011444               0.970881
42         dst2src_ack_packets    0.007685               0.978566
43         dst2src_rst_packets    0.004037               0.982602
8              dst2src_packets    0.002253               0.984856


In [14]:
# Names of the first ten rows of the importance ranking
top_10_features = importance_df['Feature'].head(10).tolist()
print("Top 10 Features:", top_10_features)

# Keep only those ten columns in every split
X_train_top10, X_val_top10, X_test_top10 = (
    split[top_10_features] for split in (X_train, X_val, X_test)
)


Top 10 Features: ['src2dst_stddev_piat_ms', 'src2dst_packets', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', 'bidirectional_packets', 'bidirectional_duration_ms', 'bidirectional_max_piat_ms', 'dst2src_ack_packets', 'dst2src_rst_packets', 'dst2src_packets']


In [16]:
# Model pruning without feature selection (FS)

In [18]:
# Train a pruned model

# Fixed settings that were not part of the Optuna search
additional_params = {
    'objective': 'multi:softmax',
    'num_class': output_dim,
    'early_stopping_rounds' : 10,
    'random_state': 42
}

# Tuned hyper-parameters plus the fixed settings above
final_params = dict(best_params, **additional_params)

# Reduced tree count and depth used for the pruned model
new_n_estimators = 100
new_max_depth = 2

# Copy of final_params with BOTH n_estimators and max_depth overridden
adjusted_params = dict(final_params, n_estimators=new_n_estimators, max_depth=new_max_depth)

# Build the classifier from the adjusted parameter set
pruned_xgb_model = xgb.XGBClassifier(**adjusted_params)

# Collapse the one-hot targets back to integer class indices
y_train_class = y_train.argmax(axis=1)
y_val_class = y_val.argmax(axis=1)

# Fit on the training split; the validation split is supplied as the evaluation set
pruned_xgb_model.fit(
    X_train,
    y_train_class,
    eval_set=[(X_val, y_val_class)],
    verbose=False,
)

In [19]:
# Model pruning with feature selection (FS)

In [22]:
# Initialize a second classifier with the same adjusted (reduced) parameters
pruned_FS_xgb_model = xgb.XGBClassifier(**adjusted_params)

# Train it on the top-10 feature columns only; the validation split,
# restricted to the same columns, is supplied as the evaluation set
pruned_FS_xgb_model.fit(X_train_top10, y_train_class, eval_set=[(X_val_top10, y_val_class)], verbose = False)

In [24]:
# Evaluation metrics

def Evaluation(model, model_name, X_test, y_test):
    """Print accuracy and weighted precision, recall and F1 for ``model``.

    Parameters
    ----------
    model : object with a ``predict`` method
        Classifier whose predictions are scored.
    model_name : str
        Label shown in the report header.
    X_test : array-like
        Features passed to ``model.predict``.
    y_test : array-like
        One-hot encoded true labels; reduced to class indices with ``argmax``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    predicted = model.predict(X_test)

    # One-hot rows -> integer class indices, to match the predictions
    actual = np.argmax(y_test, axis=1)

    # Precision, recall and F1 are averaged over classes, weighted by support
    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(actual, predicted)
    precision = precision_score(actual, predicted, **weighted)
    recall = recall_score(actual, predicted, **weighted)
    f1 = f1_score(actual, predicted, **weighted)

    # All scores are computed before anything is printed
    report_lines = (
        f'Evaluation metrics of {model_name}',
        f'Test Accuracy: {accuracy:.4f}',
        f'Test Precision: {precision:.4f}',
        f'Test Recall: {recall:.4f}',
        f'Test F1 Score: {f1:.4f}',
    )
    for line in report_lines:
        print(line)

# Baseline: the model loaded from disk, scored on the full-feature test set
Evaluation(original_xgb_model, 'original_xgb_model', X_test, y_test)

Evaluation metrics of original_xgb_model
Test Accuracy: 0.9850
Test Precision: 0.9865
Test Recall: 0.9850
Test F1 Score: 0.9854


In [26]:
# Pruned model trained on all features, scored on the full-feature test set
Evaluation(pruned_xgb_model, 'pruned_xgb_model',X_test , y_test)

Evaluation metrics of pruned_xgb_model
Test Accuracy: 0.9815
Test Precision: 0.9845
Test Recall: 0.9815
Test F1 Score: 0.9823


In [28]:
# Pruned model trained on the top-10 features; the test set must be restricted to the same columns
Evaluation(pruned_FS_xgb_model, 'pruned_FS_xgb_model',X_test_top10 , y_test)

Evaluation metrics of pruned_FS_xgb_model
Test Accuracy: 0.9815
Test Precision: 0.9844
Test Recall: 0.9815
Test F1 Score: 0.9823


In [30]:
# Save both pruned models as JSON files; their on-disk sizes are read back
# later in the notebook for the size comparison
pruned_xgb_model.save_model('pruned_xgb_model.json')
pruned_FS_xgb_model.save_model('pruned_FS_xgb_model.json')

In [32]:
# Comparison of inference time 

def _time_single_sample_predictions(model, samples):
    """Return the elapsed time, in milliseconds, of ``model.predict`` for each sample."""
    elapsed_ms = []
    for sample in samples:
        start_time = timeit.default_timer()
        # Each sample is wrapped into a batch of size one before prediction
        model.predict(np.expand_dims(sample, axis=0))
        end_time = timeit.default_timer()
        elapsed_ms.append((end_time - start_time) * 1000)  # seconds -> milliseconds
    return elapsed_ms


def measure_inference_time(model, pruned_model, pruned_FS_model, X_test, X_test_pruned):
    """Compare single-sample inference latency of three models.

    Every sample is predicted on its own (batch of size one) and timed with
    ``timeit.default_timer``. The per-sample timings of each model are saved
    to ``model_time_list.npy``, ``pruned_model_time_list.npy`` and
    ``pruned_FS_model_time_list.npy`` in the current working directory, and
    the averages are printed.

    Parameters
    ----------
    model : object with a ``predict`` method
        Model before pruning; timed on ``X_test``.
    pruned_model : object with a ``predict`` method
        Model after pruning; timed on ``X_test``.
    pruned_FS_model : object with a ``predict`` method
        Model after pruning and feature selection; timed on ``X_test_pruned``.
    X_test : iterable of array-like
        Iterating it must yield one sample at a time (for example the rows of
        a 2-D NumPy array).
    X_test_pruned : iterable of array-like
        Same as ``X_test`` but holding only the selected features.

    Returns
    -------
    tuple
        Average per-sample inference time in milliseconds for the original,
        pruned, and pruned + feature-selected model, in that order.
    """
    # Model before pruning (original model)
    model_time_list = _time_single_sample_predictions(model, X_test)
    avg_inference_time_model = np.mean(model_time_list)
    print(f"Average inference time per sample for Model before pruning: {avg_inference_time_model:.6f} milliseconds")

    # Model after pruning (pruned model)
    pruned_model_time_list = _time_single_sample_predictions(pruned_model, X_test)
    avg_inference_time_pruned_model = np.mean(pruned_model_time_list)
    print(f"Average inference time per sample for Model after pruning: {avg_inference_time_pruned_model:.6f} milliseconds")

    # Model after pruning and FS (FS pruned model)
    pruned_model_FS_time_list = _time_single_sample_predictions(pruned_FS_model, X_test_pruned)
    avg_inference_time_pruned_model_FS = np.mean(pruned_model_FS_time_list)
    print(f"Average inference time per sample for Model after pruning and FS: {avg_inference_time_pruned_model_FS:.6f} milliseconds")

    # Keep the raw per-sample timings so they can be analysed later
    np.save('model_time_list.npy', np.array(model_time_list))
    np.save('pruned_model_time_list.npy', np.array(pruned_model_time_list))
    np.save('pruned_FS_model_time_list.npy', np.array(pruned_model_FS_time_list))

    return avg_inference_time_model, avg_inference_time_pruned_model, avg_inference_time_pruned_model_FS


# measure_inference_time iterates over its inputs sample by sample. Iterating a
# DataFrame yields column labels, so convert to float32 NumPy arrays, whose
# iteration yields rows.
X_test_array = X_test.to_numpy(dtype=np.float32)
X_test_top10_array = X_test_top10.to_numpy(dtype=np.float32)
avg_time_model, avg_time_pruned_model, avg_time_pruned_FS_model = measure_inference_time(original_xgb_model, pruned_xgb_model, pruned_FS_xgb_model, X_test_array, X_test_top10_array)

Average inference time per sample for Model before pruning: 0.202661 milliseconds
Average inference time per sample for Model after pruning: 0.198487 milliseconds
Average inference time per sample for Model after pruning and FS: 0.196298 milliseconds


In [34]:
# Get the file sizes in bytes

original_model_size = os.path.getsize('xgboost_optimization.json')
pruned_model_size = os.path.getsize('pruned_xgb_model.json')
pruned_model_FS_size = os.path.getsize('pruned_FS_xgb_model.json')

# Express the same sizes in kilobytes (1 KB taken as 1024 bytes)
BYTES_PER_KB = 1024
original_model_size_kb, pruned_model_size_kb, pruned_FS_model_size_kb = (
    size_in_bytes / BYTES_PER_KB
    for size_in_bytes in (original_model_size, pruned_model_size, pruned_model_FS_size)
)


print('original_model_size_kb: ', original_model_size_kb)
print('pruned_model_size_kb:', pruned_model_size_kb)
print('pruned_FS_model_size_kb:', pruned_FS_model_size_kb)



original_model_size_kb:  670.56640625
pruned_model_size_kb: 219.6318359375
pruned_FS_model_size_kb: 216.7158203125
