### 교차 검증 단순화

-   scikit-learn의 model_selection 모듈에 있는 모델 검증 관련 기능 활용
-   교차 검증 데이터 기반의 검증 결과 처리


[1] 모듈 로딩 및 데이터 준비 <hr>


In [2]:
# Load required modules
import pandas as pd

# Fish measurements (used below for the regression task)
fishDF = pd.read_csv(filepath_or_buffer="../data/fish.csv")

# Iris measurements (used below for the classification task)
irisDF = pd.read_csv(filepath_or_buffer="../data/iris.csv")


In [3]:
# Preview the first two rows of the fish data.
fishDF.head(n=2)


Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056


In [4]:
# Preview the first two rows of the iris data.
irisDF.head(n=2)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


[2] 데이터 준비 => 피쳐 & 타겟 분리 <hr>


In [6]:
# Target: Weight (column 1). Features: Length, Diagonal, Height, Width
# (columns 2 onward). Column 0 is skipped.
fish_target_col, *fish_feature_cols = fishDF.columns[1:]
fish_targetSR = fishDF[fish_target_col]
fish_featureDF = fishDF[fish_feature_cols]

# Show the first two rows of the target and of the features.
fish_targetSR.head(2), fish_featureDF.head(2)


(0    242.0
 1    290.0
 Name: Weight, dtype: float64,
    Length  Diagonal  Height   Width
 0    25.4      30.0   11.52  4.0200
 1    26.3      31.2   12.48  4.3056)

In [7]:
# Target: the last column (index 4). Features: every column before it (0-3).
*iris_feature_cols, iris_target_col = irisDF.columns
iris_targetSR = irisDF[iris_target_col]
iris_featureDF = irisDF[iris_feature_cols]

# Show the first two rows of the target and of the features.
iris_targetSR.head(2), iris_featureDF.head(2)


(0    setosa
 1    setosa
 Name: species, dtype: object,
    sepal_length  sepal_width  petal_length  petal_width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2)

[3] 데이터 전처리 <hr>

-   피쳐 스케일링


In [8]:
# Split each dataset into training and test sets
from sklearn.model_selection import train_test_split

# Settings shared by both splits: 20% held out for testing, fixed seed.
split_options = {"test_size": 0.2, "random_state": 5}

# Fish data => training / test sets (regression)
fish_x_train, fish_x_test, fish_y_train, fish_y_test = train_test_split(
    fish_featureDF, fish_targetSR, **split_options
)

# Iris data => training / test sets (classification); stratify on the
# target so both splits are stratified by species.
iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(
    iris_featureDF, iris_targetSR, stratify=iris_targetSR, **split_options
)


In [9]:
from sklearn.preprocessing import StandardScaler

# Scale the fish features: fit the scaler on the training split only,
# then apply that same fitted scaler to the test split.
fish_scaler = StandardScaler()
scaled_fish_x_train = fish_scaler.fit_transform(fish_x_train)
scaled_fish_x_test = fish_scaler.transform(fish_x_test)


In [10]:
# Scale the iris features: fit the scaler on the training split only,
# then apply that same fitted scaler to the test split.
iris_scaler = StandardScaler()
scaled_iris_x_train = iris_scaler.fit_transform(iris_x_train)
scaled_iris_x_test = iris_scaler.transform(iris_x_test)


[4] 학습 <hr>
[4-1] 생선 무게 예측 모델


In [12]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict


In [13]:
# Train via cross-validation.
# => Prerequisites: a model instance, training feature data, training label data
lr_model = LinearRegression()


In [27]:
# Run cross-validation and keep every result: timings, the fitted estimator
# of each fold, and train/validation scores for both metrics.
cv_metrics = ("r2", "neg_mean_squared_error")
result = cross_validate(
    estimator=lr_model,
    X=scaled_fish_x_train,
    y=fish_y_train,
    scoring=cv_metrics,
    return_train_score=True,
    return_estimator=True,
)
result


{'fit_time': array([0.        , 0.0010047 , 0.00116491, 0.        , 0.01198673]),
 'score_time': array([0.        , 0.00199938, 0.        , 0.        , 0.        ]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]),
 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]),
 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
        -39450.52608702]),
 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
        -10586.01039978])}

In [24]:
# Tabulate the cross-validation results: one row per fold, one column per key.
resultDF = pd.DataFrame.from_dict(result)
resultDF


Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.00174,0.001021,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.0,0.0,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.0,0.0,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.0,0.0,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.0,0.012065,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [25]:
# Select the fold estimator with the highest validation R^2 rather than
# hard-coding fold 0, so the choice stays correct if the data, the split,
# or the random seed changes.
best_fold = resultDF["test_r2"].idxmax()
best_model = resultDF.loc[best_fold, "estimator"]

# Inspect the learned coefficients and intercept of that estimator.
best_model.coef_, best_model.intercept_


(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [28]:
# Cross-validation returning only the per-fold validation scores.
cross_val_score(estimator=lr_model, X=scaled_fish_x_train, y=fish_y_train)


array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [36]:
# Cross-validation returning only the out-of-fold predictions.
cross_val_predict(estimator=lr_model, X=scaled_fish_x_train, y=fish_y_train)


array([ 9.09792517e+01,  9.85612151e+01,  3.87029719e+02,  1.13011547e+02,
        6.81676563e+02,  2.82456988e+02,  5.34379642e+02,  3.61848302e+02,
        6.12934598e+02,  1.70756130e+02,  5.53222970e+02,  1.69433076e+01,
       -2.53895688e+01,  8.14926155e+02,  6.97225129e+01,  3.38157931e+02,
        4.76306355e+02,  7.67659158e+02,  6.55686457e+02,  1.80300946e+02,
        8.45315559e+02,  2.92145322e+02,  6.08539351e+02,  9.02782406e+02,
        6.99788981e+02,  9.40316876e+02,  7.47628344e+02,  3.28419355e+02,
        7.89622699e+02,  9.09130831e+02, -1.98986854e+02,  1.81089559e+02,
        6.36731679e+02, -1.09209894e+02,  3.57087822e+02,  7.88250361e+02,
        3.25180589e+02,  6.56473977e+02, -2.37032025e+02,  4.55882834e+01,
        9.57130255e+01, -2.10830505e+02,  1.28969696e+02, -2.21199132e+02,
       -1.10282630e+02,  6.39911566e+02,  2.12288357e+02,  2.41098815e+02,
        2.61932359e+02, -2.58301758e+02,  2.93250859e+01,  8.87950700e+02,
        2.46460034e+02,  

### 교차검증과 튜닝까지 한꺼번에 진행

-   단점 : 시간이 오래 걸림


In [40]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression


In [50]:
# Estimator to tune, and the hyper-parameter grid to search over.
est = LogisticRegression(solver="liblinear", max_iter=10000)
params = dict(penalty=["l1", "l2"])


In [51]:
# Grid search with cross-validation over `params`, also recording the
# training scores, fitted on the scaled iris training data.
gscv = GridSearchCV(estimator=est, param_grid=params, return_train_score=True)
gscv.fit(X=scaled_iris_x_train, y=iris_y_train)


In [54]:
# Show the grid-search results with one column per parameter candidate.
pd.DataFrame(gscv.cv_results_).transpose()


Unnamed: 0,0,1
mean_fit_time,0.002547,0.004212
std_fit_time,0.000765,0.005234
mean_score_time,0.001224,0.000435
std_score_time,0.000729,0.00087
param_penalty,l1,l2
params,{'penalty': 'l1'},{'penalty': 'l2'}
split0_test_score,0.875,0.875
split1_test_score,1.0,0.958333
split2_test_score,0.958333,0.958333
split3_test_score,0.958333,0.958333


In [57]:
# Best parameter combination, its index in cv_results_, and its mean CV score.
gscv.best_params_, gscv.best_index_, gscv.best_score_


({'penalty': 'l1'}, 0, 0.9416666666666668)

### 데이터에 적합한 모델 찾기


In [58]:
from sklearn.utils.discovery import all_estimators


In [63]:
# Try every scikit-learn classifier with its default constructor on the
# scaled iris training data. Print each fitted estimator, or the error
# message in red when construction or fitting fails.
models = all_estimators(type_filter="classifier")
for model_name, model in models:
    try:
        fitted = model().fit(scaled_iris_x_train, iris_y_train)
        print(fitted)
    except Exception as e:
        # Best-effort sweep: report the failure and continue with the next one.
        print("\033[31m", e, "\033[0m")


AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
[31m Negative values in data passed to CategoricalNB (input X) [0m
[31m __init__() missing 1 required positional argument: 'base_estimator' [0m
[31m Negative values in data passed to ComplementNB (input X) [0m
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()




LogisticRegressionCV()
MLPClassifier()
[31m __init__() missing 1 required positional argument: 'estimator' [0m
[31m Negative values in data passed to MultinomialNB (input X) [0m
NearestCentroid()
NuSVC()
[31m __init__() missing 1 required positional argument: 'estimator' [0m
[31m __init__() missing 1 required positional argument: 'estimator' [0m
[31m __init__() missing 1 required positional argument: 'estimator' [0m
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
[31m __init__() missing 1 required positional argument: 'estimators' [0m
[31m __init__() missing 1 required positional argument: 'estimators' [0m


