In [1]:
import pandas as pd
import numpy as np

In [2]:
path="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name=['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
data=pd.read_csv(path,names=column_name)

In [3]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
# 缺失值处理
data=data.replace(to_replace="?",value=np.nan)
data=data.dropna()

In [5]:
data.isnull().any()

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [6]:
from sklearn.model_selection import train_test_split


In [7]:
x=data.iloc[:,1:-1]
y=data["Class"]

In [8]:
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [9]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y)

In [11]:
# 特征工程-标准化

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
transfer=StandardScaler()
x_train=transfer.fit_transform(x_train)
x_test=transfer.transform(x_test)

In [14]:
from sklearn.linear_model import LogisticRegression


In [15]:
estimator=LogisticRegression()

In [16]:
estimator.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# 逻辑回归的模型参数，回归系数和偏执
estimator.coef_

array([[1.11629982, 0.57716727, 1.23489964, 0.41059357, 0.31866507,
        1.39347644, 0.54165452, 0.75347031, 0.45606702]])

In [18]:
estimator.intercept_

array([-1.0982697])

In [19]:
# 5）模型评估
# 方法一:直接比对真实值和预测值
y_predict=estimator.predict(x_test)
print("y_predict:\n",y_predict)
print("比对结果：\n",y_test==y_predict)
# 方法二：计算准确率
score=estimator.score(x_test,y_test)
print("准确率：\n",score)


y_predict:
 [2 2 2 2 2 4 4 2 2 2 2 2 2 4 2 2 4 4 2 4 2 2 2 2 2 2 4 2 4 4 2 4 4 2 4 2 2
 2 2 2 4 2 4 2 4 4 4 2 4 2 4 2 4 2 4 4 4 2 4 2 4 2 2 2 2 2 4 2 2 4 2 2 4 4
 4 2 2 4 2 2 4 2 2 2 2 4 2 2 4 4 4 4 4 4 4 2 2 2 2 2 2 4 2 2 4 4 2 2 2 4 4
 4 2 4 2 4 2 4 2 4 4 4 2 2 2 4 4 2 4 2 4 4 2 4 2 2 2 2 4 2 4 4 4 2 4 2 2 2
 2 2 2 4 2 2 4 2 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2]
比对结果：
 185     True
256     True
478     True
318     True
573     True
333     True
58      True
273    False
314     True
485     True
161     True
222    False
660     True
435     True
548     True
136     True
221     True
349     True
497     True
465     True
150     True
600     True
322     True
265     True
468     True
409     True
691     True
363     True
421     True
336     True
       ...  
296    False
366     True
193     True
239     True
375     True
291     True
645     True
576     True
507     True
535     True
586     True
178     True
498     True
530     True
476     True
213     True
124     True
8       T

In [20]:
from sklearn.metrics import classification_report

In [21]:
# 查看精确率，召回率，f1-score

In [22]:
report=classification_report(y_test,y_predict,labels=[2,4],target_names=["良性","恶性"])

In [24]:
print(report)

             precision    recall  f1-score   support

         良性       0.96      0.95      0.96       103
         恶性       0.93      0.94      0.93        68

avg / total       0.95      0.95      0.95       171



In [25]:
y_test.head()

185    2
256    2
478    2
318    2
573    2
Name: Class, dtype: int64

In [26]:
# 将y_teat转换成y_true
y_true=np.where(y_test > 3,1,0)

In [27]:
y_true

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0])

In [28]:
from sklearn.metrics import roc_auc_score

In [30]:
roc_auc_score(y_true,y_predict)

0.9463163906339235