In [None]:
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the 'ggplot' matplotlib style to all figures and render plots inline
# in the notebook output.
plt.style.use('ggplot')
%matplotlib inline

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn import neighbors

# Model selection and evaluation metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

import os

# Select which GPU is visible to this process (GPU index 0).
# The CUDA runtime reads CUDA_VISIBLE_DEVICES when it is first initialised, so
# the variable has to be set before TensorFlow enumerates devices; setting it
# after tf.config.list_physical_devices() comes too late to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import tensorflow as tf

# Show the GPUs TensorFlow can see after the restriction above.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)

# Abort early if the first GPU is not available under its expected device name.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at:', device_name)

In [None]:
# Load the dataset from disk and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific Windows path; consider
# deriving it from a configurable data directory so the notebook runs elsewhere.
initial_data = pd.read_csv(
    filepath_or_buffer='K:\\Data\\MachineLearningCSV\\MachineLearningCVE\\total4.csv'
)
initial_data.head(5)

In [None]:
# Discard every row that has at least one missing value, then display the
# resulting (rows, columns) shape.
# NOTE(review): dropna() does not remove +/-inf entries; if the CSV can contain
# them, the later scaling step may fail -- confirm against the data.
data_to_use = initial_data.dropna(axis=0, how='any')
data_to_use.shape

In [None]:
# Separate the target vector (the 'Label' column, as an array) from the
# feature matrix (every remaining column).
y1 = data_to_use['Label'].values
X = data_to_use.drop(columns=['Label'])

In [None]:
# ADASYN oversampling: synthesise additional minority-class samples so the
# classes are balanced.
# The label array created during feature/label extraction is named `y1`; no
# `y` exists before this cell, so `y1` is what must be passed in here.
# A fixed random_state (123, matching the split and the classifier) makes the
# synthetic samples reproducible across runs.
# NOTE(review): resampling takes place before the train/test split, so
# synthetic points can end up in the test set and inflate the reported scores;
# consider oversampling the training portion only.
adasyn = ADASYN(random_state=123)
X, y = adasyn.fit_resample(X, y1)

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Group the feature columns by dtype so that each group can be routed to its
# own transformer: object/bool columns are treated as categorical, and
# int64/float64 columns as numerical.
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns


In [None]:
# Feature preprocessing: one-hot encode the categorical columns (dropping the
# first level of each) and standardise the numerical columns.
t = [('ohe', OneHotEncoder(drop='first'), categorical_cols),
     ('scale', StandardScaler(), numerical_cols)]
col_trans = ColumnTransformer(transformers=t)
# Learn the encoder categories and scaler statistics from the training split
# only, so that nothing about the test split leaks into preprocessing.
# NOTE(review): a category that occurs only in the test split would now raise
# at transform time; revisit the encoder's unknown-category handling if needed.
col_trans.fit(X_train)
X_train_transform = col_trans.transform(X_train)
X_test_transform = col_trans.transform(X_test)


In [None]:
# 标签编码
target_trans = LabelEncoder()
y1_train_transform = target_trans.fit_transform(y1_train)
y1_test_transform = target_trans.transform(y1_test)


In [None]:
# Train a random forest on the preprocessed training data. Tree depth and the
# minimum split/leaf sizes are capped, and the seed is fixed for repeatability.
clf = RandomForestClassifier(
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=123,
)
clf.fit(X_train_transform, y1_train_transform)


In [None]:
# Predict on the held-out test set: the probability of the class in column 1
# of predict_proba, and the hard class labels.
# NOTE(review): column 1 is a usable positive-class score only if the problem
# is binary -- confirm the number of classes in 'Label'.
y_pred_score = clf.predict_proba(X_test_transform)[:, 1]
y_pred_class = clf.predict(X_test_transform)