In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # 逻辑回归 API
from sklearn.metrics import classification_report, roc_auc_score
from sklearnex import patch_sklearn

In [2]:
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# 1.获取数据
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv('data/breast-cancer-wisconsin.data', names=names)

In [4]:
# 2.基本数据处理
# 2.1 缺失值处理
data = data.replace('?', np.NaN)
data = data.dropna()
# 2.2 确定特征值,目标值
x = data.iloc[:, 1:-1]
y = data['Class']
# 2.3 分割数据
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22, test_size=0.2)

In [5]:
# 3.特征工程(标准化)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)

In [6]:
# 4.机器学习(逻辑回归)
estimator = LogisticRegression()  # 直接调用逻辑回归API
estimator.fit(x_train, y_train)

In [7]:
# 5.模型评估
# 5.1 基本评估
y_predict = estimator.predict(x_test)
print("预测值为:\n", y_predict)

score = estimator.score(x_test, y_test)
print("准确率为:\n", score)

预测值为:
 [2 4 4 2 2 2 2 2 2 2 2 2 2 4 2 2 4 4 4 2 4 2 4 4 4 2 4 2 2 2 2 2 4 2 2 2 4
 2 2 2 2 4 2 4 4 4 4 2 4 4 2 2 2 2 2 4 2 2 2 2 4 4 4 4 2 4 2 2 4 2 2 2 2 4
 2 2 2 2 2 2 4 4 4 2 4 4 4 4 2 2 2 4 2 4 2 2 2 2 2 2 4 2 2 4 2 2 4 2 4 4 2
 2 2 2 4 2 2 2 2 2 2 4 2 4 2 2 2 4 2 4 2 2 2 4 2 2 2]
准确率为:
 0.9854014598540146


In [14]:
# 5.2 分类评估
ret = classification_report(y_test, y_predict, target_names=('良性', '恶性'))
print(ret)

              precision    recall  f1-score   support

          良性       0.99      0.99      0.99        89
          恶性       0.98      0.98      0.98        48

    accuracy                           0.99       137
   macro avg       0.98      0.98      0.98       137
weighted avg       0.99      0.99      0.99       137



In [16]:
# 5.3 AUC 指标计算
# AUC的概率意义是随机取一对正负样本，正样本得分大于负样本的概率
# AUC的最小值为 0.5，最大值为 1，取值越高越好
# AUC=1，完美分类器，采用这个预测模型时，不管设定什么阈值都能得出完美预测。绝大多数预测的场合，不存在完美分类器。
# 0.5<AUC<1，优于随机猜测。这个分类器（模型）妥善设定阈值的话，能有预测价值。
# 最终AUC的范围在[0.5, 1]之间，并且越接近1越好
# AUC只能用来评价二分类
# AUC非常适合评价样本不平衡中的分类器性能
y_test = np.where(y_test > 3 , 1 ,0)
print("AUC指标：", roc_auc_score(y_test, y_predict))

AUC指标： 0.9839653558052435
