### Grid Search Cross Validation

Grid Search Cross Validation은 사용자 정의 점수 함수에 따라 교차 검증 성과를 최대화하는 파라미터 조합을 완전 탐색을 통해 수행한다. 기저 데이터 구조에 대해 잘 모를 경우 가장 먼저 취할 수 있는 합리적인 방법이다. `scikit-learn`은 이 방법을 `GridSearchCV` 함수로 구현했는데 교차 검증 생성자를 인수로 받는다. Chapter 7에서 설명했던 이유로 `GridSearchCV`가 누출된 정보로 Machine Learning 추정기를 과적합하는 것을 방지하려면 `PurgedKFold` Class를 전달해야 한다.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def get_test_data(
        n_features : int = 40,
        n_informative : int = 10,
        n_redundant : int = 10,
        n_samples : int = 10000,
        random_state : int = 42,
        sigma_std : float = 0.0
) : 
    from sklearn.datasets import make_classification
    
    np.random.seed(random_state)
    
    trnsX, cont = make_classification(
        n_samples = n_samples,
        n_features = n_features,
        n_informative = n_informative,
        n_redundant = n_redundant,
        random_state = random_state,
        shuffle = False
    )
    
    df0_index = pd.date_range(start = pd.to_datetime('today') - pd.to_timedelta(n_samples, unit = 'd'), 
                              periods = n_samples, freq = 'B')

    trnsX, cont = pd.DataFrame(trnsX, index = df0_index), pd.Series(cont, index=df0_index).to_frame('bin')

    df0 = ['I_' + str(i) for i in range(n_informative)] + ['R_' + str(i) for i in range(n_redundant)]
    df0 += ['N_' + str(i) for i in range(n_features - len(df0))]
    
    trnsX.columns = df0
    cont['w'] = 1. / cont.shape[0]
    cont['t1'] = pd.Series(cont.index, index = cont.index)
    
    return trnsX, cont

testing = False
n_samples = 1000
n_splits = 3

trnsX, cont = get_test_data(
    n_features = 10,
    n_informative = 5,
    n_redundant = 0,
    n_samples = n_samples,
)

In [3]:
cont

Unnamed: 0,bin,w,t1
2021-11-11 23:33:55.150398,0,0.001,2021-11-11 23:33:55.150398
2021-11-12 23:33:55.150398,0,0.001,2021-11-12 23:33:55.150398
2021-11-15 23:33:55.150398,0,0.001,2021-11-15 23:33:55.150398
2021-11-16 23:33:55.150398,0,0.001,2021-11-16 23:33:55.150398
2021-11-17 23:33:55.150398,0,0.001,2021-11-17 23:33:55.150398
...,...,...,...
2025-09-04 23:33:55.150398,1,0.001,2025-09-04 23:33:55.150398
2025-09-05 23:33:55.150398,1,0.001,2025-09-05 23:33:55.150398
2025-09-08 23:33:55.150398,1,0.001,2025-09-08 23:33:55.150398
2025-09-09 23:33:55.150398,1,0.001,2025-09-09 23:33:55.150398


In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from FinancialMachineLearning.cross_validation.cross_validation import PurgedKFold, clf_hyper_fit

params_grid = {
    'C' : [1e-2, 1e-1, 1, 10, 100],
    'gamma' : [1e-2, 1e-1, 1, 10, 100]
}

pipe_classification = SVC(probability = True)

inner_cv = PurgedKFold(
    n_splits = n_splits,
    samples_info_sets = cont.index.to_series()
)

grid_search = GridSearchCV(
    estimator = pipe_classification,
    param_grid = params_grid,
    scoring = 'neg_log_loss',
    cv = inner_cv,
    n_jobs = 8,
    return_train_score = True
)

In [14]:
grid_search = grid_search.fit(X = trnsX, y = cont['bin'])
gs_result = pd.DataFrame(grid_search.cv_results_)

In [16]:
best_estimator = grid_search.best_estimator_
print(best_estimator)

SVC(C=1, gamma=10, probability=True)


In [18]:
from sklearn.ensemble import BaggingClassifier

pipe_classification = BaggingClassifier()

clf_hyper_fit(
    feat = trnsX, 
    label = cont['bin'], 
    samples_info_sets = cont.index.to_series(),
    pipe_clf = pipe_classification,
    param_grid = params_grid,
    cv = 3,
    bagging = [0, 1],
    random_search_iterator = 0,
    n_jobs = 8,
    pct_embargo = 0.01,
)

AttributeError: 'SVC' object has no attribute 'steps'