# 使用Breast Cancer Wisconsin (Diagnostic) 数据集进行二分类

知识点：
- 逻辑回归
- 数据标准化（Z-score）


In [3]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

使用来自sklearn的数据集：Breast Cancer Wisconsin (Diagnostic) 数据集

In [4]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset bundled with scikit-learn
data = load_breast_cancer()
X, y = data.data, data.target
# Show the shapes of the feature matrix and the label vector
print(X.shape, y.shape)

(569, 30) (569,)


数据预处理：包括特征标准化（Z-score）和划分训练集与测试集

In [5]:
# Split BEFORE computing any scaling statistics, so the held-out test rows never
# influence the preprocessing (computing mean/std on the full dataset leaks
# test-set information into training).
# With the same sample count and random_state, train_test_split selects the same
# rows for each split as it did when it was given the pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Z-score standardization parameters, estimated from the training split only
X_mean = np.mean(X_train_raw, axis=0)
X_std = np.std(X_train_raw, axis=0)
# A constant feature has zero std; divide by 1 instead so it maps to 0 rather than NaN/inf
X_std = np.where(X_std == 0, 1.0, X_std)
# Apply the training-set statistics to both splits
X_train = (X_train_raw - X_mean) / X_std
X_test = (X_test_raw - X_mean) / X_std
# Full standardized matrix, kept under its original name in case other cells reference it
X_normalized = (X - X_mean) / X_std

实现逻辑回归模型，使用自定义的类和方法

In [6]:
# Logistic regression implemented from scratch (no library estimator is used)
class MyLogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    n_iterations : int, default 1000
        Number of gradient-descent updates; each one uses all training samples.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; None until ``fit`` has been called.
    bias : float or None
        Intercept term; None until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise.

        Evaluated through exp(-|z|), which lies in (0, 1], so a large-magnitude
        negative ``z`` cannot overflow the exponential.
        """
        z = np.asarray(z, dtype=float)
        exp_neg_abs = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + e^-z)        z < 0: e^z / (1 + e^z)  (same value, no overflow)
        return np.where(z >= 0, 1.0 / (1.0 + exp_neg_abs), exp_neg_abs / (1.0 + exp_neg_abs))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like with n_samples entries
            Binary targets (0 or 1). A column vector is flattened.

        Raises
        ------
        ValueError
            If ``X`` has no samples or ``y`` does not have one entry per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) target would otherwise broadcast against the (n,)
        # predictions into an (n, n) residual matrix.
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} entries"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            linear_model = np.dot(X, self.weights) + self.bias
            # Residual between predicted probabilities and targets, shared by both gradients
            error = self.sigmoid(linear_model) - y

            # Gradients of the mean log-loss with respect to weights and bias
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # Gradient-descent step
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return 0/1 class labels for the rows of ``X``.

        A sample is labelled 1 when its predicted probability is strictly
        greater than 0.5, otherwise 0. The result is an integer numpy array.
        """
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        # Vectorized thresholding instead of a Python-level loop
        return (y_predicted > 0.5).astype(int)

训练模型，并进行预测和评估

In [7]:
# Fit the hand-written logistic regression on the training split
model = MyLogisticRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X_train, y_train)

# Predict class labels for the held-out test split
predictions = model.predict(X_test)

# Accuracy is the fraction of test labels predicted correctly
accuracy = (predictions == y_test).mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 summary
report = classification_report(y_test, predictions, target_names=data.target_names)
print(report)

Accuracy: 99.12%
              precision    recall  f1-score   support

   malignant       1.00      0.98      0.99        43
      benign       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

