# Model Tuning

유방암 데이터셋 사용  
악성 : M(malignant)  
양성 : B(benign)  

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# UCI Wisconsin breast cancer (wdbc) data file, read without a header row,
# so pandas labels the columns with integers.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(filepath_or_buffer=path, header=None)

In [3]:
# Preview the loaded frame: column 0 is an id, column 1 the diagnosis letter,
# and the remaining columns are numeric measurements.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [4]:
# Feature matrix: every column from index 2 onward (skips the id and the label).
X = df.iloc[:, 2:].to_numpy()

In [7]:
# Column 1 of the raw frame holds the diagnosis letter ('M' or 'B').
# Reading it from df here means the cell does not rely on a `y` left behind by
# another cell, and re-running it always yields the same encoding.
diagnosis = df.iloc[:, 1].values
# Encode benign ('B') as 0 and every other label (i.e. 'M') as 1.
y = np.where(diagnosis == 'B', 0, 1)

이진분류  
positive class(1) : 더 중요한(심각한) 클래스이므로 악성인 'M'  
negative class(0) : 양성인 'B'  

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.pipeline import make_pipeline

In [15]:
# Standardize the features, project onto 2 principal components, then classify
# with logistic regression - all wrapped in a single estimator.
steps = (
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)
pipeline = make_pipeline(*steps)

In [16]:
# Fit every step of the pipeline on the training split only.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)),
                ('logisticregression', LogisticRegression(random_state=1))])

In [18]:
# Predicted class labels (0 = 'B', 1 = 'M') for the held-out test samples.
p = pipeline.predict(X=X_test)

In [19]:
# Accuracy of the fitted pipeline on the held-out test split.
pipeline.score(X=X_test, y=y_test)

0.956140350877193

In [20]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes and columns the predicted classes, in label order
# 0 ('B') then 1 ('M').
confusion_matrix(y_true=y_test, y_pred=p)

array([[71,  1],
       [ 4, 38]], dtype=int64)

In [21]:
from sklearn.model_selection import cross_val_score

In [24]:
# 10-fold cross-validation of the whole pipeline on the training split,
# using all available CPU cores.
scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)

In [25]:
# One score per fold from the 10-fold cross-validation of the pipeline.
scores

array([0.93478261, 0.93478261, 0.95652174, 0.95652174, 0.93478261,
       0.95555556, 0.97777778, 0.93333333, 0.95555556, 0.95555556])

In [26]:
# Average the per-fold scores into a single cross-validation estimate.
scores.mean()

0.9495169082125603

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [36]:
# Manual search over tree depth: 5-fold cross-validate a decision tree for each
# max_depth in 1..10 and remember the depth with the highest mean score.
best_score = 0
for k in range(1, 11):
    tree = DecisionTreeClassifier(max_depth=k, random_state=1)
    scores = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=5, n_jobs=-1)
    mean_score = np.mean(scores)
    print(k, mean_score)
    # Strict comparison: on a tie the earlier (shallower) depth is kept.
    if mean_score > best_score:
        best_score = mean_score
        best_parameter = {'max_depth': k}

1 0.8813186813186814
2 0.9318681318681319
3 0.9296703296703297
4 0.9406593406593406
5 0.9384615384615385
6 0.9362637362637363
7 0.9406593406593406
8 0.9406593406593406
9 0.9406593406593406
10 0.9406593406593406


In [37]:
# Highest mean cross-validation score found across the depths tried.
best_score

0.9406593406593406

In [38]:
# Depth selected by the manual search; because that loop compares with a
# strict `>`, the shallowest depth wins when several depths tie.
best_parameter

{'max_depth': 4}

In [39]:
# Rebuild the tree with the depth chosen by cross-validation. random_state=1
# matches the estimator evaluated during the search, so the refit model (and
# its test score) is reproducible across runs.
best_tree = DecisionTreeClassifier(random_state=1, **best_parameter)

In [40]:
# Train the selected tree on the full training split.
best_tree.fit(X=X_train, y=y_train)

DecisionTreeClassifier(max_depth=4)

In [41]:
# Evaluate the selected tree on the held-out test split.
best_tree.score(X=X_test, y=y_test)

0.9473684210526315

In [42]:
from sklearn.model_selection import GridSearchCV

In [44]:
# Candidate tree depths 1..10 (the upper bound of range() is exclusive).
my_grid = dict(max_depth=range(1, 11))

In [49]:
# Search the same depths (1..10) as the manual loop and `my_grid`; range()
# excludes its upper bound, so range(1, 10) would stop at depth 9.
param_grid = {'max_depth': range(1, 11)}
# 5-fold grid search over tree depth, scored by accuracy, on all CPU cores.
gscv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=1),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)


In [None]:
from sklearn.svm import SVC

# Standardize the features before feeding them to the RBF-kernel SVM.
pipe_svc = make_pipeline(StandardScaler(), SVC(kernel='rbf', random_state=1))
# Candidate values on a log scale, shared by both C and gamma.
p_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# make_pipeline names each step after its lowercased class name, so the SVC
# parameters are addressed with the 'svc__' prefix.
param_grid = {'svc__C': p_list, 'svc__gamma': p_list}
# 5-fold grid search scored by accuracy; n_jobs=-1 matches the other
# cross-validation calls in this notebook.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)