# 乳癌資料庫預測SVM分類
>使用scikit-learn 機器學習套件裡的SVR演算法

* (一)引入函式庫及內建乳癌資料集<br>
引入之函式庫如下<br>
sklearn.datasets: 用來匯入內建之乳癌資料集`datasets.load_breast_cancer()`<br>
sklearn.SVR: 支持向量機回歸分析之演算法<br>
matplotlib.pyplot: 用來繪製影像

In [35]:
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

## Step1. 下載資料

In [36]:
breast_cancer=datasets.load_breast_cancer()
print(breast_cancer.feature_names)
print(breast_cancer.target_names)


['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


In [37]:
df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
df['target'] = breast_cancer.target
X = breast_cancer.data
Y = breast_cancer.target
print(df.head(2))

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38           122.8     1001.0          0.11840   
1        20.57         17.77           132.9     1326.0          0.08474   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0                 0.07871  ...          17.33            184.6      2019.0   
1                 0.05667  ...          23.41            158.8      1956.0   

   worst smoothness  worst compactness  worst concavity  worst concave points  \
0            0.1622             0.6656           0.7119                0.2654   
1            0.1238             0.1866           0.2416                0.1860   

   worst symmetry  worst fractal dimension  target  
0          0.

In [38]:
# Compute correlation with target
corr = df.corr()["target"].abs().sort_values(ascending=False)

# Show top 10 features (excluding target itself)
top10 = corr.drop("target").head(10)
print(top10)

worst concave points    0.793566
worst perimeter         0.782914
mean concave points     0.776614
worst radius            0.776454
mean perimeter          0.742636
worst area              0.733825
mean radius             0.730029
mean area               0.708984
mean concavity          0.696360
worst concavity         0.659610
Name: target, dtype: float64


In [39]:
X_top10 = df[top10.index]
X_top10.head()

Unnamed: 0,worst concave points,worst perimeter,mean concave points,worst radius,mean perimeter,worst area,mean radius,mean area,mean concavity,worst concavity
0,0.2654,184.6,0.1471,25.38,122.8,2019.0,17.99,1001.0,0.3001,0.7119
1,0.186,158.8,0.07017,24.99,132.9,1956.0,20.57,1326.0,0.0869,0.2416
2,0.243,152.5,0.1279,23.57,130.0,1709.0,19.69,1203.0,0.1974,0.4504
3,0.2575,98.87,0.1052,14.91,77.58,567.7,11.42,386.1,0.2414,0.6869
4,0.1625,152.2,0.1043,22.54,135.1,1575.0,20.29,1297.0,0.198,0.4


## Step2. 區分訓練集與測試集

In [40]:
x_train, x_test, y_train, y_test = train_test_split(X_top10, Y, test_size=0.3, random_state=0)

## Step3. 建模

In [41]:
rbf_clf = svm.SVC() # Defualt is rbf.
rbf_clf.fit(x_train, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [42]:
linear_clf = svm.SVC(kernel='linear')
linear_clf.fit(x_train, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [43]:
poly_clf = svm.SVC(kernel='poly')
poly_clf.fit(x_train, y_train)

0,1,2
,C,1.0
,kernel,'poly'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


## Step4. 預測

```

```


In [44]:
import numpy as np

def compare_predictions(y_test, y_pred, show_details=True):
    wrong_mask = (y_pred != y_test)
    
    wrong_count = wrong_mask.sum()
    total = len(y_test)
    wrong_percentage = wrong_count / total * 100
    wrong_indices = np.where(wrong_mask)[0]
    
    print(f"Number of wrong predictions: {wrong_count} / {total}")
    print(f"Wrong percentage: {wrong_percentage:.2f}%")
    
    if show_details and wrong_count > 0:
        for idx in wrong_indices:
            print(f"Index {idx}: true={y_test[idx]}, pred={y_pred[idx]}")
    
    return wrong_count, wrong_percentage, wrong_indices


In [45]:
# Ground truth.
y_test

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1])

In [46]:
rbf_y_test = rbf_clf.predict(x_test)
compare_predictions(rbf_y_test, y_test)

Number of wrong predictions: 12 / 171
Wrong percentage: 7.02%
Index 0: true=1, pred=0
Index 13: true=0, pred=1
Index 27: true=1, pred=0
Index 40: true=1, pred=0
Index 51: true=1, pred=0
Index 60: true=1, pred=0
Index 73: true=1, pred=0
Index 93: true=1, pred=0
Index 122: true=1, pred=0
Index 155: true=1, pred=0
Index 157: true=1, pred=0
Index 166: true=1, pred=0


(np.int64(12),
 np.float64(7.017543859649122),
 array([  0,  13,  27,  40,  51,  60,  73,  93, 122, 155, 157, 166]))

In [47]:
linear_y_test = linear_clf.predict(x_test)
compare_predictions(linear_y_test, y_test)

Number of wrong predictions: 9 / 171
Wrong percentage: 5.26%
Index 10: true=0, pred=1
Index 40: true=1, pred=0
Index 44: true=0, pred=1
Index 70: true=0, pred=1
Index 73: true=1, pred=0
Index 97: true=0, pred=1
Index 109: true=1, pred=0
Index 126: true=0, pred=1
Index 151: true=0, pred=1


(np.int64(9),
 np.float64(5.263157894736842),
 array([ 10,  40,  44,  70,  73,  97, 109, 126, 151]))

In [48]:
poly_y_test = poly_clf.predict(x_test)
compare_predictions(poly_y_test, y_test)

Number of wrong predictions: 14 / 171
Wrong percentage: 8.19%
Index 0: true=1, pred=0
Index 13: true=0, pred=1
Index 20: true=1, pred=0
Index 27: true=1, pred=0
Index 35: true=1, pred=0
Index 40: true=1, pred=0
Index 51: true=1, pred=0
Index 60: true=1, pred=0
Index 73: true=1, pred=0
Index 93: true=1, pred=0
Index 122: true=1, pred=0
Index 155: true=1, pred=0
Index 157: true=1, pred=0
Index 166: true=1, pred=0


(np.int64(14),
 np.float64(8.187134502923977),
 array([  0,  13,  20,  27,  35,  40,  51,  60,  73,  93, 122, 155, 157,
        166]))

## Step5. 準確度分析

In [49]:
print(rbf_clf.score(x_train,y_train))
print(rbf_clf.score(x_test, y_test))

0.9045226130653267
0.9298245614035088


In [50]:
# Linear kernel has the best accuracy.
print(linear_clf.score(x_train,y_train))
print(linear_clf.score(x_test, y_test))

0.9597989949748744
0.9473684210526315


In [51]:
print(poly_clf.score(x_train,y_train))
print(poly_clf.score(x_test, y_test))

0.9020100502512562
0.9181286549707602
