# 良/恶性乳腺肿瘤预测
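<p>原始数据的下载地址为 <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/">https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/</a></p>
<p>数据特征如下：</p>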

Attribute| Domain
--- | ---
Sample code number| id number
Clump Thickness | 1 - 10
Uniformity of Cell Size | 1 - 10
Uniformity of Cell Shape | 1 - 10
Marginal Adhesion | 1 - 10
Single Epithelial Cell Size | 1 - 10
Bare Nuclei | 1 - 10
Bland Chromatin | 1 - 10
Normal Nucleoli | 1 - 10
Mitoses | 1 - 10
Class | (2 for benign, 4 for malignant)

In [1]:
# Load the raw data file and preview its first five rows.
import pandas as pd
import numpy as np
# The raw file has no header row, so column names are supplied explicitly.
# The first column is the sample id and the last one is the class label.
column_names = [
    'number',
    'Cl_Thickness',
    'Unif_cell_size',
    'Unid_cell_shape',
    'Marg_Adhesion',
    'Sing_epith_cell_size',
    'Bare_nuclei',
    'Bland_chromation',
    'Norm_nucleoli',
    'Mitoses',
    'Class',
]
# NOTE(review): absolute, machine-specific path; adjust to where the file lives locally.
data_file = 'D:/DocumentFile/data/breast-cancer-wisconsin.data'
data = pd.read_csv(data_file, names=column_names)
data.head()

Unnamed: 0,number,Cl_Thickness,Unif_cell_size,Unid_cell_shape,Marg_Adhesion,Sing_epith_cell_size,Bare_nuclei,Bland_chromation,Norm_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [2]:
# Inspect the basic structure of the data: column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   number                699 non-null    int64 
 1   Cl_Thickness          699 non-null    int64 
 2   Unif_cell_size        699 non-null    int64 
 3   Unid_cell_shape       699 non-null    int64 
 4   Marg_Adhesion         699 non-null    int64 
 5   Sing_epith_cell_size  699 non-null    int64 
 6   Bare_nuclei           699 non-null    object
 7   Bland_chromation      699 non-null    int64 
 8   Norm_nucleoli         699 non-null    int64 
 9   Mitoses               699 non-null    int64 
 10  Class                 699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [3]:
# Show basic summary statistics (count, mean, std, quartiles, min/max) of the data.
data.describe()

Unnamed: 0,number,Cl_Thickness,Unif_cell_size,Unid_cell_shape,Marg_Adhesion,Sing_epith_cell_size,Bland_chromation,Norm_nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [4]:
# Count the missing (NaN) values in each column.
# NOTE: this dataset marks missing entries with the string '?', which is not
# NaN, so those entries are not counted by this check.
data.isna().sum()

number                  0
Cl_Thickness            0
Unif_cell_size          0
Unid_cell_shape         0
Marg_Adhesion           0
Sing_epith_cell_size    0
Bare_nuclei             0
Bland_chromation        0
Norm_nucleoli           0
Mitoses                 0
Class                   0
dtype: int64

In [5]:
# Missing data must be filled in or dropped. This dataset contains 16 missing
# values marked with '?', so convert the markers to NaN and drop the affected rows.
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
# Bare_nuclei was read as strings (object dtype) because of the '?' markers.
# Convert it to a real numeric dtype so later numeric processing does not
# depend on implicit string-to-number coercion.
data = data.assign(Bare_nuclei=pd.to_numeric(data['Bare_nuclei']))
print(data.shape)

(683, 11)


In [7]:
# Split the data into a training set (75%) and a test set (25%).
from sklearn.model_selection import train_test_split
# Columns 1..9 are the features; column 0 is the sample id and column 10 is the label.
feature_columns = column_names[1:10]
label_column = column_names[10]
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data[label_column],
    test_size=0.25,
    random_state=33,
)
print('训练样本的数量和类别分布：\n', y_train.value_counts())

训练样本的数量和类别分布：
 2    344
4    168
Name: Class, dtype: int64


In [8]:
# Standardize the features so every dimension has zero mean and unit variance;
# this keeps features with large magnitudes from dominating the predictions.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Learn the scaling statistics from the training set only, then apply the
# same transformation to both splits.
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [9]:
# Build two linear classifiers, LogisticRegression and SGDClassifier, fit each
# on the training set and predict the labels of the test set.
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
lr = LogisticRegression()
# SGD training is stochastic, so pin the seed (same value as the train/test
# split) to make the fitted model and its metrics reproducible across runs.
# Metrics recorded from earlier unseeded runs may differ slightly.
sgdc = SGDClassifier(random_state=33)
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
sgdc.fit(X_train, y_train)
sgdc_y_pred = sgdc.predict(X_test)

In [10]:
# Evaluate the LogisticRegression classifier on the test set.
from sklearn.metrics import classification_report
lr_accuracy = lr.score(X_test, y_test)
print('Accuracy of LR Classifier:', lr_accuracy)
# Per-class precision / recall / F1 for the predictions made earlier.
lr_report = classification_report(
    y_test,
    lr_y_pred,
    target_names=['Benign', 'Malignant'],
)
print(lr_report)

Accuracy of LR Classifier: 0.9883040935672515
              precision    recall  f1-score   support

      Benign       0.99      0.99      0.99       100
   Malignant       0.99      0.99      0.99        71

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171



In [11]:
# Evaluate the SGD classifier on the test set.
sgdc_accuracy = sgdc.score(X_test, y_test)
print('Accuracy of SGD Classifier:', sgdc_accuracy)
# Per-class precision / recall / F1 for the predictions made earlier.
sgdc_report = classification_report(
    y_test,
    sgdc_y_pred,
    target_names=['Benign', 'Malignant'],
)
print(sgdc_report)

Accuracy of SGD Classifier: 0.9824561403508771
              precision    recall  f1-score   support

      Benign       1.00      0.97      0.98       100
   Malignant       0.96      1.00      0.98        71

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

