In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [2]:
#mount google drive

from google.colab import drive

drive.mount('/content/drive')

path = "/content/drive/MyDrive/"


Mounted at /content/drive


In [3]:
combinedDf = pd.read_csv(path + 'updated_Combined.csv')

combinedDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215890 entries, 0 to 1215889
Data columns (total 24 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Unnamed: 0   1215890 non-null  float64
 1   Seq          1215890 non-null  float64
 2   Dur          1215890 non-null  float64
 3   RunTime      1215890 non-null  float64
 4   Mean         1215890 non-null  float64
 5   Sum          1215890 non-null  float64
 6   Min          1215890 non-null  float64
 7   Max          1215890 non-null  float64
 8   TotPkts      1215890 non-null  float64
 9   SrcPkts      1215890 non-null  float64
 10  DstPkts      1215890 non-null  float64
 11  TotBytes     1215890 non-null  float64
 12  SrcBytes     1215890 non-null  float64
 13  DstBytes     1215890 non-null  float64
 14  Offset       1215890 non-null  float64
 15  sMeanPktSz   1215890 non-null  float64
 16  dMeanPktSz   1215890 non-null  float64
 17  Load         1215890 non-null  float64
 18  Sr

In [4]:
features = ['Seq', 'Dur', 'RunTime', 'Mean', 'Sum', 'Min', 'Max',
            'TotPkts', 'SrcPkts', 'DstPkts', 'TotBytes', 'SrcBytes',
            'DstBytes', 'Offset', 'sMeanPktSz', 'dMeanPktSz',
            'Load', 'SrcLoad', 'Rate', 'SrcRate']
target = 'Label'

In [5]:
# Load the trained models from files

models_path = path+'models/'

dt_model = joblib.load(models_path + 'dt_model.pkl')
logistic_model = joblib.load(models_path + 'logistic_model.pkl')
mlp_model = joblib.load(models_path + 'mlp_model.pkl')
rf_model = joblib.load(models_path + 'rf_model.pkl')
knn_model = joblib.load(models_path + 'knn_model.pkl')
gbm_model = joblib.load(models_path + 'gbm.pkl')


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combinedDf[features], combinedDf[target], test_size=0.2, random_state=42)

In [7]:
# Define the ensemble model for max voting
ensemble_model = VotingClassifier(estimators=[
    ('Decision_tree', dt_model),
    ('logistic_regression', logistic_model),
    ('MultiLayerPerceptron', mlp_model),
    ('RandomForestClassifier', rf_model),
    ('KNN classifier', knn_model),
    ('GBM model', gbm_model)
], voting='soft')

In [8]:
# Train the ensemble model
ensemble_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Make predictions using the ensemble model
ensemble_pred = ensemble_model.predict(X_test)

In [10]:
# Evaluate model performance
accuracy = accuracy_score(y_test, ensemble_pred)
precision_benign = precision_score(y_test, ensemble_pred, pos_label='Benign')
recall_benign = recall_score(y_test, ensemble_pred, pos_label='Benign')
f1_benign = f1_score(y_test, ensemble_pred, pos_label='Benign')
precision_malicious = precision_score(y_test, ensemble_pred, pos_label='Malicious')
recall_malicious = recall_score(y_test, ensemble_pred, pos_label='Malicious')
f1_malicious = f1_score(y_test, ensemble_pred, pos_label='Malicious')

# Print evaluation metrics
print("Accuracy:", accuracy)
print("precision_benign:", precision_benign)
print("recall_benign:", recall_benign)
print("f1_benign:", f1_benign)
print("precision_malicious:", precision_benign)
print("recall_malicious:", recall_benign)
print("f1_malicious:", f1_malicious)

Accuracy: 0.9991364350393539
precision_benign: 0.9993104601111622
recall_benign: 0.998496774328246
f1_benign: 0.9989034515168921
precision_malicious: 0.9993104601111622
recall_malicious: 0.998496774328246
f1_malicious: 0.9992877637817708
