In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

file_path = "BCCC-SCsVul-2024.csv"
df = pd.read_csv(file_path)

df = df.drop(columns=['ID'], errors='ignore')

target_columns = [col for col in df.columns if "Class" in col]
df['Target'] = df[target_columns].idxmax(axis=1)
df = df.drop(columns=target_columns)

le = LabelEncoder()
df['Target'] = le.fit_transform(df['Target'])

df = df.apply(pd.to_numeric, errors='coerce')

X = df.drop(columns=['Target'])
y = df['Target']

X = X.fillna(X.mean())

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.05, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, random_state=42)

ensemble_model = VotingClassifier(estimators=[('rf', rf), ('adaboost', adaboost), ('gb', gb)], voting='hard')

ensemble_model.fit(X_train, y_train)

y_pred = ensemble_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Ensemble Model Accuracy: 0.9892761394101877
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       743
           1       0.97      1.00      0.98      1384
           2       0.92      1.00      0.96      1039
           3       1.00      0.53      0.69       511
           4       0.89      1.00      0.94       732
           5       1.00      1.00      1.00       606
           6       1.00      1.00      1.00       376
           7       1.00      1.00      1.00      2253
           8       1.00      1.00      1.00      2509
           9       1.00      1.00      1.00      3384
          10       1.00      1.00      1.00      3568
          11       1.00      1.00      1.00      5275

    accuracy                           0.99     22380
   macro avg       0.98      0.96      0.96     22380
weighted avg       0.99      0.99      0.99     22380

Confusion Matrix:
 [[ 743    0    0    0    0    0    0    0    0