In [40]:
import pandas as pd
import numpy as np

In [41]:
# Location of the raw data file; pandas reads it directly from this URL.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels are supplied by hand and attached through `names=`.
column_name = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# header=None spells out what passing `names` already implies: the first line
# of the file is data, not a header row.
data = pd.read_csv(path, header=None, names=column_name)

In [42]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [43]:
# Missing-value handling.
# Missing entries appear as the literal string "?". Turn them into NaN and drop
# the affected rows. Any column that contained "?" was parsed by pandas as
# strings (object dtype), so once the placeholders are gone every column is
# converted back to a numeric dtype; otherwise such a feature silently stays
# text (it is skipped by describe() and breaks arithmetic on the column).
# pd.to_numeric raises if a value is still non-numeric, which surfaces dirty
# data here instead of further downstream.
data = (
    data.replace(to_replace="?", value=np.nan)
    .dropna()
    .apply(pd.to_numeric)
)

In [44]:
# Per-column check that no missing values remain (every entry should be False).
data.isnull().any(axis=0)

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [45]:
from sklearn.model_selection import train_test_split


In [46]:
# Feature matrix: every column except the first (sample identifier) and the
# last (label).
x = data[data.columns[1:-1]]
# Target vector: the label column.
y = data.loc[:, "Class"]

In [47]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [48]:
# Preview the first five target labels.
y.head(n=5)

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [49]:
# Hold out a test set (train_test_split's default test share is 25%).
# A fixed random_state pins the shuffle, so the split -- and every coefficient
# and score computed from it below -- is reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [50]:
# 特征工程-标准化

In [51]:
from sklearn.preprocessing import StandardScaler

In [53]:
# Standardize the features. The scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits, so the test set
# does not influence the scaling statistics.
transfer = StandardScaler().fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

In [54]:
from sklearn.linear_model import LogisticRegression


In [55]:
# Logistic-regression classifier, left at the library's default hyperparameters.
estimator=LogisticRegression()

In [56]:
# Train the classifier on the standardized training split.
estimator.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
# Parameters learned by the logistic-regression model: the regression
# coefficients (one weight per feature column). The bias term is
# `estimator.intercept_`.
estimator.coef_

array([[1.42702133, 0.77558572, 0.68408071, 0.84204088, 0.26151419,
        1.52581056, 0.84179973, 0.79726793, 0.91256315]])

In [58]:
# Bias (intercept) term of the fitted model.
estimator.intercept_

array([-0.5193368])

In [59]:
# 5) Model evaluation
# Approach 1: compare the predictions with the true labels element by element.
y_predict = estimator.predict(x_test)
matches = y_test == y_predict
print("y_predict:\n", y_predict)
print("比对结果：\n", matches)

# Approach 2: compute the accuracy on the test split.
score = estimator.score(x_test, y_test)
print("准确率：\n", score)


y_predict:
 [2 2 2 4 2 2 4 2 4 4 2 2 4 4 4 2 4 4 4 2 2 2 4 2 2 2 4 2 4 2 2 2 2 4 4 2 2
 4 2 2 4 2 2 2 2 2 2 2 2 2 2 2 4 2 4 2 2 2 4 2 2 2 4 4 4 2 4 4 4 4 2 2 2 2
 2 2 4 2 2 2 4 2 4 2 4 4 2 4 2 4 4 2 2 2 4 4 2 2 2 2 4 2 4 2 2 2 4 2 2 2 4
 2 2 4 4 2 4 2 2 2 2 2 2 4 2 4 4 2 2 2 2 2 2 4 4 2 4 2 2 2 4 4 2 2 2 2 2 2
 2 4 2 2 4 2 2 2 2 2 2 2 2 2 4 4 2 2 2 4 2 2 2]
比对结果：
 392     True
486     True
34      True
330     True
176     True
180     True
46      True
274     True
84      True
223     True
516     True
618     True
267     True
611     True
20      True
419     True
126     True
574     True
340     True
439     True
428     True
407     True
436     True
69      True
511     True
137     True
54      True
31      True
188     True
415     True
       ...  
251     True
687     True
454     True
501     True
125     True
27      True
401     True
424     True
205     True
387     True
138     True
252    False
684     True
61      True
10      True
616     True
114     True
554     T