In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('day6_data1.csv')
data

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [2]:
X = data[['alcohol','sugar','pH']].to_numpy()
Y = data['class'].to_numpy()

In [3]:
from sklearn.model_selection import train_test_split
t_x, tt_x, t_y, tt_y = train_test_split(X, Y, random_state = 42, test_size=0.2)

In [4]:
s_t_x, v_t_x, s_t_y, v_t_y = train_test_split(t_x, t_y, random_state = 42, test_size=0.2)

In [5]:
t_x.shape, tt_x.shape

((5197, 3), (1300, 3))

In [6]:
s_t_x.shape, v_t_x.shape

((4157, 3), (1040, 3))

In [7]:
print(f'학습data:{s_t_x.shape}\n테스트data:{v_t_x.shape}\n검증data:{tt_x.shape}')

학습data:(4157, 3)
테스트data:(1040, 3)
검증data:(1300, 3)


In [8]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(s_t_x,s_t_y)
dt.score(s_t_x,s_t_y), dt.score(v_t_x,v_t_y)

(0.9971133028626413, 0.864423076923077)

In [9]:
from sklearn.model_selection import cross_validate #cross_validate
sc = cross_validate(dt, t_x, t_y)
sc

{'fit_time': array([0.01296449, 0.00997186, 0.02194619, 0.01296282, 0.00797868]),
 'score_time': array([0.00199413, 0.00099969, 0.00199103, 0.00199533, 0.00099778]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [10]:
np.mean(sc['test_score'])

0.855300214703487

In [11]:
from sklearn.model_selection import StratifiedKFold #StratifiedKFold
sc1 = cross_validate(dt, t_x, t_y, cv = StratifiedKFold()) #cross_validate의 cv 기본값이 StratifiedKFold()임을 알 수 있다.
np.mean(sc1['test_score'])

0.855300214703487

In [12]:
sc_ck = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 42 ) #StratifiedKFold 하이퍼파라미터값 설정
sc2 = cross_validate(dt, t_x, t_y, cv = sc_ck)
np.mean(sc2['test_score'])

0.8574181117533719

In [13]:
from sklearn.model_selection import GridSearchCV #GridSearchCV 최적의 파라미터값 찾기
params = {'min_impurity_decrease':[0.0001,0.0002,0.0003,0.0004,0.0005]}

In [14]:
gs = GridSearchCV(DecisionTreeClassifier(random_state = 42), params, n_jobs = -1)
gs.fit(t_x,t_y)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [15]:
dt = gs.best_estimator_
dt.score(t_x,t_y),dt.score(tt_x,tt_y)

(0.9615162593804117, 0.8653846153846154)

In [16]:
gs.best_params_

{'min_impurity_decrease': 0.0001}

In [17]:
gs.cv_results_['mean_test_score']

array([0.86819297, 0.86453617, 0.86492226, 0.86780891, 0.86761605])

In [18]:
gs.cv_results_ #'rank_test_score': array([1, 5, 4, 2, 3])} 첫번째 성능이 젤 좋음

{'mean_fit_time': array([0.01120462, 0.01101213, 0.01479597, 0.01199532, 0.01220427]),
 'std_fit_time': array([0.00341962, 0.00390052, 0.00369407, 0.00352048, 0.00231483]),
 'mean_score_time': array([0.00221438, 0.00210104, 0.00240865, 0.0017952 , 0.00180235]),
 'std_score_time': array([0.00148031, 0.0006697 , 0.00106294, 0.00074626, 0.00116538]),
 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'min_impurity_decrease': 0.0001},
  {'min_impurity_decrease': 0.0002},
  {'min_impurity_decrease': 0.0003},
  {'min_impurity_decrease': 0.0004},
  {'min_impurity_decrease': 0.0005}],
 'split0_test_score': array([0.86923077, 0.87115385, 0.86923077, 0.86923077, 0.86538462]),
 'split1_test_score': array([0.86826923, 0.86346154, 0.85961538, 0.86346154, 0.86923077]),
 'split2_test_score': array([0.8825794 , 0.87680462, 0.87584216, 0.88161

In [19]:
i = np.argmax(gs.cv_results_['mean_test_score'])
i #첫번째 값 (인덱스0)

0

In [20]:
gs.cv_results_['params'][i]

{'min_impurity_decrease': 0.0001}

In [21]:
params={'max_depth':range(5,20,1),
       'min_impurity_decrease':np.arange(0.0001,0.001,0.0001),
       'min_samples_split':range(2,100,10)
       }
gs1 = GridSearchCV(DecisionTreeClassifier(random_state = 42), params, n_jobs =-1)
gs1.fit(t_x, t_y)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [22]:
gs1.best_params_

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}

In [23]:
np.max(gs1.cv_results_['mean_test_score'])

0.8683865773302731

랜덤 서치

In [24]:
from scipy.stats import uniform, randint

In [25]:
d = randint(0,10) #randint(0,n):0~n-1 랜덤정수
d.rvs(5) #rvs(n):n개만큼생성

array([2, 9, 4, 4, 1])

In [26]:
np.unique(d.rvs(1000),return_counts=True) #return_counts=True : 각 항목별 횟수
# 고유값, 각 항목별 횟수

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 98, 108, 111,  95, 122, 106,  92,  85,  77, 106], dtype=int64))

In [27]:
d = uniform(0,1)
d.rvs(5)

array([0.97385985, 0.56450952, 0.18453257, 0.29514296, 0.89274391])

In [28]:
params={'max_depth':randint(20,50),
       'min_impurity_decrease':uniform(0.0001,0.001),
       'min_samples_split':randint(2,25),
        'min_samples_leaf':randint(1,25)
       }

In [29]:
from sklearn.model_selection import RandomizedSearchCV #RandomizedSearchCV 랜덤 최적의 파라미터값 찾기
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state = 42),params, n_iter = 100, 
                        n_jobs=-1,random_state = 42) #n_iter 그 개수만큼 설정
rs.fit(t_x,t_y)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025DF51B6640>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025DF6922E20>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025DF6738760>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000025DF4150400>},
                   random_state=42)

In [30]:
rs.best_params_

{'max_depth': 39,
 'min_impurity_decrease': 0.00034102546602601173,
 'min_samples_leaf': 7,
 'min_samples_split': 13}