In [2]:
import pandas as pd
import numpy as np

In [3]:
#1 Read data
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv(path, names = column_name)
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
#2 Nan data processing
#1) np.nan
data = data.replace(to_replace = "?", value = np.nan)
#2) Drop missing values
data.dropna(inplace = True)
data.isnull().any() #No more missing values

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [5]:
#3 Train test split
from sklearn.model_selection import train_test_split

In [6]:
# Filter out features and target
x = data.iloc[:, 1:-1]
y = data["Class"]
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [7]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [9]:
x_train

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
540,5,1,1,1,2,2,2,1,1
462,6,1,1,3,2,1,1,1,1
447,5,1,1,1,2,1,1,1,1
396,3,1,1,1,2,1,3,1,1
199,3,1,1,1,2,1,2,1,1
...,...,...,...,...,...,...,...,...,...
508,5,1,1,1,2,1,1,1,1
351,2,1,1,1,2,1,3,1,1
412,10,10,10,6,8,4,8,5,1
143,1,1,1,1,2,5,1,1,1


In [10]:
#4 Feature engineering
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [11]:
x_train

array([[ 0.16768374, -0.72614171, -0.75784001, ..., -0.62026314,
        -0.64064096, -0.36453784],
       [ 0.52099267, -0.72614171, -0.75784001, ..., -1.0197282 ,
        -0.64064096, -0.36453784],
       [ 0.16768374, -0.72614171, -0.75784001, ..., -1.0197282 ,
        -0.64064096, -0.36453784],
       ...,
       [ 1.93422843,  2.20385917,  2.22179033, ...,  1.77652725,
         0.64064096, -0.36453784],
       [-1.24555202, -0.72614171, -0.75784001, ..., -1.0197282 ,
        -0.64064096, -0.36453784],
       [-0.53893414, -0.72614171, -0.75784001, ..., -1.0197282 ,
        -0.64064096, -0.36453784]])

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# 5、Estimator
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

LogisticRegression()

In [14]:
#Logistic regression settings: coefficients and intercepts
estimator.coef_

array([[1.31857307, 0.36864535, 0.67699784, 0.67166891, 0.35363119,
        1.27904165, 1.00374191, 0.45359577, 0.57554105]])

In [15]:
estimator.intercept_

array([-1.13720189])

In [16]:
# 6、Model assessment
# Method 1：Compare predicted and real value
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("Comparing predicted and real value:\n", y_test == y_predict)

# Method2：Accuracy
score = estimator.score(x_test, y_test)
print("Accuracy：\n", score)

y_predict:
 [2 2 4 4 4 2 2 2 4 2 2 2 2 2 2 2 2 4 2 2 2 4 4 4 2 2 4 2 4 2 2 2 4 2 4 2 2
 4 2 2 2 2 4 2 4 2 4 2 2 2 2 2 2 2 4 2 4 4 2 2 4 2 2 4 2 2 2 2 4 4 2 2 2 2
 2 2 2 4 4 2 4 2 2 2 2 4 4 2 4 2 4 4 4 2 2 2 4 2 4 2 2 2 2 2 2 2 2 4 2 2 4
 4 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 4 4 4 2 2 2 2 2 2 4 2 4 2 2 4 4 2 2 2 2 4
 2 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 2 2 2 2]
Compare prediction and real value:
 533    True
458    True
167    True
483    True
270    True
       ... 
487    True
397    True
375    True
601    True
410    True
Name: Class, Length: 171, dtype: bool
The accuracy is：
 0.9707602339181286


In [20]:
from sklearn.metrics import classification_report

In [21]:
report = classification_report(y_test, y_predict, labels = [2, 4], target_names = ['benign', 'malignant'])

In [22]:
print(report)

              precision    recall  f1-score   support

      benign       0.96      1.00      0.98       116
   malignant       1.00      0.91      0.95        55

    accuracy                           0.97       171
   macro avg       0.98      0.95      0.97       171
weighted avg       0.97      0.97      0.97       171



In [23]:
y_test.head()

533    2
458    2
167    4
483    4
270    4
Name: Class, dtype: int64

In [24]:
# Let y_true indicate the actual type of each sample, only in terms of 0(negative) and 1(positive)
# change y_test into 0 and 1
y_true = np.where(y_test > 3, 1, 0)

In [25]:
y_true

array([0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0])

In [26]:
from sklearn.metrics import roc_auc_score

In [27]:
roc_auc_score

<function sklearn.metrics._ranking.roc_auc_score(y_true, y_score, *, average='macro', sample_weight=None, max_fpr=None, multi_class='raise', labels=None)>

In [28]:
roc_auc_score(y_true, y_predict)

0.9545454545454546