In [1]:
import pandas as pd
import numpy as np

In [2]:
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv(path, names=column_name)

In [3]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
# 2、to deal with the missing value
# 1）replace it with -》np.nan
data = data.replace(to_replace="?", value=np.nan)
# 2）delete the sample which has missing value
data.dropna(inplace=True)

In [5]:
data.isnull().any() # check whether delete all the sample with the missing value

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [6]:
# 3、splite the data set
from sklearn.model_selection import train_test_split

In [7]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [8]:
# filter the feature value and target value
x = data.iloc[:, 1:-1]
y = data["Class"]

In [9]:
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [10]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [12]:
x_train.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
232,8,4,6,3,3,1,4,3,1
26,3,2,1,1,1,1,2,1,1
81,4,1,1,2,2,1,2,1,1
371,1,1,3,1,2,1,1,1,1
629,4,1,1,1,2,1,1,1,1


In [13]:
# 4、standard
from sklearn.preprocessing import StandardScaler

In [14]:
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [15]:
x_train

array([[ 1.2485121 ,  0.28600744,  0.99331671, ...,  0.25365334,
         0.04415384, -0.34067952],
       [-0.51344544, -0.37361332, -0.73654096, ..., -0.5815268 ,
        -0.61111481, -0.34067952],
       [-0.16105393, -0.70342371, -0.73654096, ..., -0.5815268 ,
        -0.61111481, -0.34067952],
       ...,
       [-0.16105393, -0.70342371, -0.73654096, ..., -0.99911687,
        -0.61111481, -0.34067952],
       [ 1.95329511,  1.27543859,  1.33928825, ...,  0.25365334,
        -0.61111481,  0.26708325],
       [-1.21822845, -0.70342371, -0.04459789, ..., -0.5815268 ,
        -0.61111481, -0.34067952]])

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# 5、the process of estimator
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

LogisticRegression()

In [18]:
estimator.coef_

array([[1.23377236, 0.46270365, 0.49421866, 0.67664482, 0.30866921,
        1.13827929, 0.97864448, 0.65348103, 0.59782425]])

In [19]:
estimator.intercept_

array([-1.19554989])

In [20]:
# 6、model accessment
# method1：compare the real target number with pridiction number
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("Compare the real value with the prediction value:\n", y_test == y_predict)

# method2：get the accuracy
score = estimator.score(x_test, y_test)
print("The accuracy is ：\n", score)

y_predict:
 [2 2 2 4 2 2 2 2 2 4 2 2 4 2 4 2 4 4 4 2 2 2 2 4 2 2 4 4 2 2 4 2 2 4 4 4 2
 4 4 2 2 2 2 2 2 2 4 2 2 4 2 2 2 2 4 2 2 2 2 2 4 2 2 4 2 2 2 4 2 2 4 4 4 2
 4 4 2 4 2 2 2 2 2 4 2 2 2 2 4 2 2 2 4 4 4 2 4 2 4 2 2 4 2 4 2 2 2 4 2 4 2
 2 4 2 4 2 2 2 4 4 2 2 2 2 4 2 4 4 4 2 2 2 2 4 2 2 4 4 2 2 2 2 2 2 4 2 2 4
 2 2 4 2 2 2 2 2 4 2 2 4 4 2 2 4 2 2 4 2 4 4 2]
Compare the real value with the prediction value:
 280    True
499    True
596    True
636    True
579    True
       ... 
449    True
156    True
680    True
691    True
618    True
Name: Class, Length: 171, dtype: bool
The accuracy is ：
 0.9766081871345029


In [21]:
# check the accuracy, recall,F1-score
from sklearn.metrics import classification_report

In [22]:
report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["benign tumor", "malignant tumor"])

In [23]:
print(report)

                 precision    recall  f1-score   support

   benign tumor       0.97      0.99      0.98       109
malignant tumor       0.98      0.95      0.97        62

       accuracy                           0.98       171
      macro avg       0.98      0.97      0.97       171
   weighted avg       0.98      0.98      0.98       171



In [24]:
y_test.head()

280    2
499    2
596    2
636    4
579    2
Name: Class, dtype: int64

In [25]:
# y_true：y_true is the real type which needs to be converted to 0(negative)/1(positive)
# make y_test be converted to convert 0(negative)/1(positive)
y_true = np.where(y_test > 3, 1, 0)

In [26]:
y_true

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0])

In [27]:
from sklearn.metrics import roc_auc_score

In [28]:
roc_auc_score(y_true, y_predict)

0.9712192956496005