In [1]:
# Attach Google Drive to the Colab runtime; the training CSV is read from
# underneath this mount point later in the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install xgboost optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
import optuna


Import Dataset: The first column is an ID that carries no information useful for training, so it is excluded from the feature matrix X. The last column is the label Y.

In [4]:
# Load the training table from the mounted Drive folder.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/train.csv'
dataset = pd.read_csv(TRAIN_CSV_PATH)

# Features: every column except the first and the last.
# Target: the last column.
feature_frame = dataset.iloc[:, 1:-1]
label_series = dataset.iloc[:, -1]
X = feature_frame.values
Y = label_series.values

Handle missing data: Use mean to fill in missing data

In [5]:
from sklearn.impute import SimpleImputer
# Replace missing values in every feature column with that column's mean.
# X was built without the ID column, so imputation has to start at column 0.
# The earlier X[:, 1:47] slice skipped the first feature and hard-coded the
# number of columns.
# NOTE(review): the imputer is fitted before the train/test split, so rows that
# later end up in the test set contribute to the column means. Consider fitting
# on the training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)

Encoding the Dependent Variable: Convert the class labels to integer codes, because XGBoost requires numeric targets.

In [6]:

from sklearn.preprocessing import LabelEncoder
# Learn the mapping from class name to integer code, then apply it to Y.
# `le` is kept around so predictions can be decoded back to class names later.
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)


Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

Standardize Data: Normalize each feature to have the same scale

In [8]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
# NOTE: this cell overwrites X_train / X_test, so re-running it on its own
# would scale the already-scaled arrays a second time.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Hyperparameter tuning: this search was run on Kaggle (https://www.kaggle.com/code/huyenngocnguyenuit/file-train-mont4).

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Train one XGBoost candidate and return its validation accuracy.

    Reads the module-level ``X_train`` / ``y_train`` arrays.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and
        ``learning_rate``.

    Returns
    -------
    float
        Accuracy on a validation subset carved out of the training split.
    """
    # Define the hyperparameter space
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)

    # Score candidates on a validation subset of the training data. The test
    # split (X_test / y_test) is deliberately left untouched: selecting
    # hyperparameters on it would bias the accuracy reported for the final model.
    # The fixed seed gives every trial the same subset, and stratifying keeps
    # every class present on both sides of the split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        random_state=1,
        stratify=y_train,
    )

    # `use_label_encoder` is no longer passed: XGBoost reports it as an unused
    # parameter.
    classifier = XGBClassifier(
        eval_metric='logloss',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate
    )

    # Train the classifier
    classifier.fit(X_fit, y_fit)

    # Predict on the validation set
    y_val_pred = classifier.predict(X_val)

    # Calculate accuracy
    return accuracy_score(y_val, y_val_pred)

# Run the hyperparameter search: Optuna maximizes the score returned by `objective`.
N_TRIALS = 50
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

# Report the highest-scoring trial and the parameters that produced it.
best_trial = study.best_trial
best_accuracy_pct = best_trial.value * 100
print("Best Accuracy: {:.2f} %".format(best_accuracy_pct))
print("Best Parameters:", best_trial.params)

[I 2024-12-16 04:13:54,653] A new study created in memory with name: no-name-530e78f9-c9d9-45b5-888e-ff0b2002580e
Parameters: { "use_label_encoder" } are not used.



After finding the optimal parameter set on Kaggle with 50 trials, the optimal parameter set {'n_estimators': 182, 'max_depth': 10, 'learning_rate': 0.19879820388928054} was found with an accuracy of 94.14%. This parameter set is used for the model.

In [9]:
# Hyperparameters selected by the Optuna search.
optimized_params = {
    'n_estimators': 182,
    'max_depth': 10,
    'learning_rate': 0.19879820388928054
}

# Create the classifier with the optimized parameters, fit it on the training
# split and score it on the test split. The default-constructed XGBClassifier
# that used to precede this was overwritten before use and has been removed.
classifier = XGBClassifier(**optimized_params)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)




Decode numerical labels back to text format for the classification report

In [12]:
# Map the integer class codes back to their original class names so the
# report rows are labelled with names rather than numbers.
y_test_original = le.inverse_transform(y_test)
y_pred_original = le.inverse_transform(y_pred)

# Build the per-class report with four decimal places, then print it.
report = classification_report(
    y_true=y_test_original,
    y_pred=y_pred_original,
    digits=4,
)
print("Classification Report:")
print(report)


Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware     0.4836    0.2269    0.3088       454
          BenignTraffic     0.8139    0.8775    0.8445     16749
       BrowserHijacking     0.8516    0.5381    0.6595       853
       CommandInjection     0.5638    0.3723    0.4484       736
 DDoS-ACK_Fragmentation     0.9990    0.9976    0.9983     16610
        DDoS-HTTP_Flood     0.9859    0.9808    0.9833      4059
        DDoS-ICMP_Flood     0.9999    0.9997    0.9998     16732
DDoS-ICMP_Fragmentation     0.9983    0.9978    0.9981     16964
      DDoS-PSHACK_Flood     1.0000    1.0000    1.0000     16815
       DDoS-RSTFINFlood     1.0000    0.9998    0.9999     16866
         DDoS-SYN_Flood     0.9452    0.9788    0.9617     16788
         DDoS-SlowLoris     0.9785    0.9901    0.9843      3349
DDoS-SynonymousIP_Flood     0.9894    0.9718    0.9805     16733
         DDoS-TCP_Flood     0.9630    0.9851    0.9739     16773
 

Export model

In [11]:
# Persist the fitted classifier in two formats: XGBoost's own JSON model file
# and a joblib pickle of the Python object.
XGB_JSON_PATH = 'model.json'
JOBLIB_PATH = 'optimized_model.pkl'

classifier.save_model(XGB_JSON_PATH)
joblib.dump(classifier, JOBLIB_PATH)