### 교차 검증 단순화
- scikit-learn의 model_selection 모듈 내의 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반 검증 결과 처리

[1] 모듈 로딩 및 데이터 준비 <hr>


In [45]:
# Load modules
import pandas as pd

# Load the fish dataset
fishDF = pd.read_csv('../data/fish.csv')

# Load the iris dataset
irisDF = pd.read_csv('../data/iris.csv')

In [46]:
# Preview the first two rows of the fish data
fishDF.head(2)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056


In [47]:
# Preview the first two rows of the iris data
irisDF.head(2)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa


[2] 데이터 준비 => feature & target 분리 <hr>

In [48]:
# Target: Weight / Features: the four numeric measurements.
# Species is a text label and is deliberately not part of the feature set;
# naming the columns makes that visible and keeps the selection correct
# even if the column order of the CSV changes.
fishTargetSR = fishDF['Weight']
fishFeatureDF = fishDF[['Length', 'Diagonal', 'Height', 'Width']]

fishTargetSR.head(2), fishFeatureDF.head(2)

(0    242.0
 1    290.0
 Name: Weight, dtype: float64,
    Length  Diagonal  Height   Width
 0    25.4      30.0   11.52  4.0200
 1    26.3      31.2   12.48  4.3056)

In [49]:
# Target: variety (the last column) / Features: every column before it
irisTargetSR = irisDF.iloc[:, -1]
irisFeatureDF = irisDF.iloc[:, :-1]

irisTargetSR.head(2), irisFeatureDF.head(2)

(0    Setosa
 1    Setosa
 Name: variety, dtype: object,
    sepal.length  sepal.width  petal.length  petal.width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2)

[3] 데이터 전처리 <hr>
- 피처 스케일링

In [50]:
# Split each dataset into a training set and a test set
from sklearn.model_selection import train_test_split

# Options shared by both splits: 20% held out for testing, fixed seed
split_opts = {'test_size': 0.2, 'random_state': 5}

# Fish data => training / test sets ===> regression
fXtrn, fXtst, fytrn, fytst = train_test_split(
    fishFeatureDF, fishTargetSR, **split_opts
)

# Iris data => training / test sets ===> classification
# (stratified on the target so class proportions are kept in both sets)
irsXtrn, irsXtst, irsytrn, irsytst = train_test_split(
    irisFeatureDF, irisTargetSR, stratify=irisTargetSR, **split_opts
)

In [51]:
# Standardize the features. Each scaler is fitted on the training split only
# and the same fitted scaler is then applied to the matching test split.
from sklearn.preprocessing import StandardScaler

# Fish data
fshSclr = StandardScaler()
scaled_fish_Xtrn = fshSclr.fit_transform(fXtrn)
scaled_fish_Xtst = fshSclr.transform(fXtst)

# Iris data
irsSclr = StandardScaler()
scaled_irs_Xtrn = irsSclr.fit_transform(irsXtrn)
scaled_irs_Xtst = irsSclr.transform(irsXtst)

[4] 학습<hr>
[4-1] 생선 무게 예측 모델

In [52]:
# 모듈 로딩 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

In [53]:
# Train with cross-validation
# => prerequisites: a model instance and the training data

lr_mdl = LinearRegression()

In [54]:
# Run cross-validation and keep every result it can report: fit/score times,
# the fitted estimator of each fold, and train/test scores for both metrics.
result = cross_validate(lr_mdl,
                        scaled_fish_Xtrn,
                        fytrn,
                        cv=5,  # 5 folds, stated explicitly (the result holds five entries per key)
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True,
                        return_estimator=True)
print(result)

{'fit_time': array([0.00802541, 0.00342035, 0.        , 0.        , 0.        ]), 'score_time': array([0.        , 0.00049925, 0.        , 0.00514221, 0.        ]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [55]:
# Tabulate the cross_validate output: one row per fold, one column per result key
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.008025,0.0,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.00342,0.000499,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.0,0.0,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.0,0.005142,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.0,0.0,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [56]:
# Names of the entries returned by cross_validate
result.keys()

dict_keys(['fit_time', 'score_time', 'estimator', 'test_r2', 'train_r2', 'test_neg_mean_squared_error', 'train_neg_mean_squared_error'])

In [57]:
# Per-fold values stored under each entry returned by cross_validate
result.values()

dict_values([array([0.00802541, 0.00342035, 0.        , 0.        , 0.        ]), array([0.        , 0.00049925, 0.        , 0.00514221, 0.        ]), [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])])

In [58]:
# Pick the fold estimator with the highest validation R^2 rather than a fixed
# row, so the selection stays correct when the fold scores change
# (different data, seed, or fold count).
best_idx = resultDF['test_r2'].idxmax()
best_mdl = resultDF.loc[best_idx, 'estimator']
best_mdl.coef_, best_mdl.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [59]:
### Cross-validation returning only the per-fold scores
cross_val_score(lr_mdl, scaled_fish_Xtrn, fytrn)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [60]:
## Cross-validation returning only the predictions
cross_val_predict(lr_mdl, scaled_fish_Xtrn, fytrn)

array([ 9.09792517e+01,  9.85612151e+01,  3.87029719e+02,  1.13011547e+02,
        6.81676563e+02,  2.82456988e+02,  5.34379642e+02,  3.61848302e+02,
        6.12934598e+02,  1.70756130e+02,  5.53222970e+02,  1.69433076e+01,
       -2.53895688e+01,  8.14926155e+02,  6.97225129e+01,  3.38157931e+02,
        4.76306355e+02,  7.67659158e+02,  6.55686457e+02,  1.80300946e+02,
        8.45315559e+02,  2.92145322e+02,  6.08539351e+02,  9.02782406e+02,
        6.99788981e+02,  9.40316876e+02,  7.47628344e+02,  3.28419355e+02,
        7.89622699e+02,  9.09130831e+02, -1.98986854e+02,  1.81089559e+02,
        6.36731679e+02, -1.09209894e+02,  3.57087822e+02,  7.88250361e+02,
        3.25180589e+02,  6.56473977e+02, -2.37032025e+02,  4.55882834e+01,
        9.57130255e+01, -2.10830505e+02,  1.28969696e+02, -2.21199132e+02,
       -1.10282630e+02,  6.39911566e+02,  2.12288357e+02,  2.41098815e+02,
        2.61932359e+02, -2.58301758e+02,  2.93250859e+01,  8.87950700e+02,
        2.46460034e+02,  

### 교차검증과 튜닝까지 한꺼번에 진행
- 단점 : 시간이 오래 걸림

In [62]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

In [76]:
# Estimator instance and hyper-parameter candidates
# est =LogisticRegression(max_iter=10000, solver='liblinear')
# params= {'penalty' :['l1', 'l2'], 'solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

In [85]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
#%%
from sklearn.linear_model import LogisticRegression
# Hyper-parameter grid: compare the L1 and L2 penalties
params  = {'penalty': ['l1', 'l2']}
# Base estimator whose penalty is tuned by the grid search
est = LogisticRegression(max_iter=1000, solver='liblinear')

# Cross-validated grid search; training-fold scores are recorded as well
gscv = GridSearchCV(est, param_grid=params, return_train_score=True)
gscv.fit(scaled_irs_Xtrn, irsytrn)
#%%
# One row per parameter candidate, with per-split and aggregated scores
cv_resultsDF = pd.DataFrame(gscv.cv_results_)
#%%
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001812,0.002636,0.000205,0.00041,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.0,0.0,0.003201,0.00392,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.916667,0.052705,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [86]:
# Build the grid search again from the same estimator and parameter grid
gscv = GridSearchCV(est, param_grid=params, return_train_score=True)

# Fit on the scaled iris training data
gscv.fit(scaled_irs_Xtrn, irsytrn)

In [87]:
# One row per parameter candidate, with per-split and aggregated scores
cv_resultsDF = pd.DataFrame(gscv.cv_results_)
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001072,0.001162,0.000913,0.001375,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.001181,0.002362,0.000187,0.000374,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.916667,0.052705,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [88]:
# Best parameter setting, its row index in cv_results_, and its mean test score
gscv.best_params_, gscv.best_index_, gscv.best_score_

({'penalty': 'l1'}, 0, 0.9416666666666668)

In [89]:
## Search for models that suit the data
from sklearn.utils.discovery import all_estimators 

In [90]:
# Try every scikit-learn classifier with its default constructor on the
# scaled iris training data. Some cannot be built without arguments or reject
# this input; those failures are reported together with the estimator name so
# each message can be traced back to the class that raised it.
mdls = all_estimators(type_filter='classifier')
for mdl_name, mdl in mdls:
    try:
        # Only construction and fitting are expected to fail.
        fitted = mdl().fit(scaled_irs_Xtrn, irsytrn)
    except Exception as e:
        # Deliberately broad: this is a survey loop and must continue
        # past any estimator that cannot be fitted as-is.
        print(f'{mdl_name}: {e}')
    else:
        print(fitted)

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()




LogisticRegressionCV()
MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'


