### 교차 검증 단순화 
- scikit-learn(sklearn)의 model_selection 모듈에 있는 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반 검증 결과 처리

[1] 모듈 로딩 및 데이터 준비

In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [121]:
# Load the fish dataset and print its column / dtype summary
fishDF = pd.read_csv('../data/fish.csv')
fishDF.info()

# Load the iris dataset and print its column / dtype summary
irisDF = pd.read_csv('../data/iris.csv')
irisDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Species   159 non-null    object 
 1   Weight    159 non-null    float64
 2   Length    159 non-null    float64
 3   Diagonal  159 non-null    float64
 4   Height    159 non-null    float64
 5   Width     159 non-null    float64
dtypes: float64(5), object(1)
memory usage: 7.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


[2] 데이터 준비 => 피쳐 & 타겟 분리 <hr>

In [122]:
# Regression problem on the fish data.
#   target   : Weight
#   features : Length, Diagonal, Height, Width (every column from position 2 on)
fish_featureDF = fishDF.iloc[:, 2:]
fish_targetSR = fishDF.loc[:, 'Weight']

fish_featureDF.head(), fish_targetSR.head()

(   Length  Diagonal   Height   Width
 0    25.4      30.0  11.5200  4.0200
 1    26.3      31.2  12.4800  4.3056
 2    26.5      31.1  12.3778  4.6961
 3    29.0      33.5  12.7300  4.4555
 4    29.0      34.0  12.4440  5.1340,
 0    242.0
 1    290.0
 2    340.0
 3    363.0
 4    430.0
 Name: Weight, dtype: float64)

In [123]:
# Classification problem on the iris data.
#   target   : variety
#   features : sepal.length, sepal.width, petal.length, petal.width
#              (every column except the last one)
iris_featureDF = irisDF.iloc[:, :-1]
iris_targetSR = irisDF.loc[:, 'variety']

iris_featureDF.head(), iris_targetSR.head()

(   sepal.length  sepal.width  petal.length  petal.width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2
 2           4.7          3.2           1.3          0.2
 3           4.6          3.1           1.5          0.2
 4           5.0          3.6           1.4          0.2,
 0    Setosa
 1    Setosa
 2    Setosa
 3    Setosa
 4    Setosa
 Name: variety, dtype: object)

[3] 데이터 전처리 <hr>
- 피쳐 스케일링

In [124]:
# Split both datasets into training and test partitions (80 % / 20 %)
from sklearn.model_selection import train_test_split

# Regression (fish): plain random split with a fixed seed
fish_xtrain, fish_xtest, fish_ytrain, fish_ytest = train_test_split(
    fish_featureDF,
    fish_targetSR,
    test_size=0.2,
    random_state=5,
)

# Classification (iris): same seed and ratio, stratified on the target
iris_xtrain, iris_xtest, iris_ytrain, iris_ytest = train_test_split(
    iris_featureDF,
    iris_targetSR,
    test_size=0.2,
    random_state=5,
    stratify=iris_targetSR,
)

In [125]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Feature scaling for the fish dataset: the scaler is fitted on the training
# features only and then applied to both the training and the test features.
fish_scaler = StandardScaler().fit(fish_xtrain)
scaled_fish_xtrain, scaled_fish_xtest = (
    fish_scaler.transform(fish_xtrain),
    fish_scaler.transform(fish_xtest),
)

In [126]:
# Feature scaling for the iris dataset: the scaler is fitted on the training
# features only and then applied to both the training and the test features.
iris_scaler = StandardScaler().fit(iris_xtrain)
scaled_iris_xtrain, scaled_iris_xtest = (
    iris_scaler.transform(iris_xtrain),
    iris_scaler.transform(iris_xtest),
)

[4] 학습<hr>
[4-1] 생선 무게 예측 모델

In [127]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate

In [128]:
# Train via cross-validation
# => needs: a model instance, the training features and the training labels
lr_model = LinearRegression()

In [129]:
# Cross-validate and keep everything: fit/score times, the fitted estimator of
# each fold, and both train and test scores for every requested metric
result = cross_validate(lr_model,
                        scaled_fish_xtrain,
                        fish_ytrain, 
                        return_train_score=True,
                        return_estimator=True,
                        scoring=('r2','neg_mean_squared_error'))  # use cross_val_score when only the scores are needed
print(result)  # cv defaults to 5, so every entry holds five values

{'fit_time': array([0.00209975, 0.00099683, 0.00108433, 0.00112247, 0.0010519 ]), 'score_time': array([9.97543335e-04, 7.79628754e-05, 0.00000000e+00, 9.99450684e-04,
       0.00000000e+00]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [130]:
# Tabulate the cross-validation results: one column per result key
resultDF = pd.DataFrame(result)
resultDF # number of rows == number of cv folds

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.0021,0.000998,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.000997,7.8e-05,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.001084,0.0,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.001122,0.000999,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.001052,0.0,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [131]:
# Take the estimator of the fold with the highest validation R^2 instead of
# hard-coding fold 0, so "best" stays correct if the data or the split changes.
best_fold = resultDF['test_r2'].idxmax()
best_model = resultDF.loc[best_fold, 'estimator']

# Show the fitted coefficients and intercept of the selected fold's model
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [132]:
### Cross-validation returning only the per-fold scores
cross_val_score(lr_model, scaled_fish_xtrain,fish_ytrain)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])

In [133]:
## Cross-validation returning only the predictions
cross_val_predict(lr_model, scaled_fish_xtrain,fish_ytrain)

array([ 9.09792517e+01,  9.85612151e+01,  3.87029719e+02,  1.13011547e+02,
        6.81676563e+02,  2.82456988e+02,  5.34379642e+02,  3.61848302e+02,
        6.12934598e+02,  1.70756130e+02,  5.53222970e+02,  1.69433076e+01,
       -2.53895688e+01,  8.14926155e+02,  6.97225129e+01,  3.38157931e+02,
        4.76306355e+02,  7.67659158e+02,  6.55686457e+02,  1.80300946e+02,
        8.45315559e+02,  2.92145322e+02,  6.08539351e+02,  9.02782406e+02,
        6.99788981e+02,  9.40316876e+02,  7.47628344e+02,  3.28419355e+02,
        7.89622699e+02,  9.09130831e+02, -1.98986854e+02,  1.81089559e+02,
        6.36731679e+02, -1.09209894e+02,  3.57087822e+02,  7.88250361e+02,
        3.25180589e+02,  6.56473977e+02, -2.37032025e+02,  4.55882834e+01,
        9.57130255e+01, -2.10830505e+02,  1.28969696e+02, -2.21199132e+02,
       -1.10282630e+02,  6.39911566e+02,  2.12288357e+02,  2.41098815e+02,
        2.61932359e+02, -2.58301758e+02,  2.93250859e+01,  8.87950700e+02,
        2.46460034e+02,  

### 교차검증과 튜닝까지 한꺼번에 진행
- 단점 : 시간이 오래 걸림

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [150]:
# Base estimator and the hyper-parameter grid to search (only the penalty varies)
est = LogisticRegression(max_iter=10000, solver='liblinear')
pram = {'penalty':['l2','l1']}
        # The grid could also vary 'solver' (e.g. 'lbfgs', 'newton-cg', 'sag', 'saga'),
        # but not every solver/penalty pair is valid - check the combinations first.

In [152]:
# Grid search with cross-validation over `pram`, also recording train scores
gscv = GridSearchCV(est, param_grid=pram,return_train_score=True)
gscv.fit(scaled_iris_xtrain,iris_ytrain)

In [154]:
# Tabulate the grid-search results: one row per parameter combination
cv_resultsDF = pd.DataFrame(gscv.cv_results_)
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.001998,0.00046,0.00088,0.000451,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.916667,0.052705,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795
1,0.00159,0.000622,0.000897,0.000454,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588


In [156]:
# Best estimator, its parameters, and its mean cross-validated test score
gscv.best_estimator_, gscv.best_params_, gscv.best_score_

(LogisticRegression(max_iter=10000, penalty='l1', solver='liblinear'),
 {'penalty': 'l1'},
 0.9416666666666668)

In [195]:
# Rebind best_model (previously the fish regressor) to the grid search's
# best iris classifier and display it
best_model = gscv.best_estimator_
best_model

### 데이터에 적합한 모델 찾기

In [196]:
from sklearn.utils.discovery import all_estimators

In [205]:
# Smoke-test every scikit-learn classifier with its default settings to see
# which ones can be fitted on the scaled iris training data.
models = all_estimators(type_filter='classifier')

for model_name, model_cls in models:
    try:
        # TODO: give LogisticRegression a larger max_iter so it converges cleanly
        fitted = model_cls().fit(scaled_iris_xtrain, iris_ytrain)
    except Exception as e:
        # Some classes cannot be built without a base estimator and some reject
        # the negative values produced by standard scaling. Name the failing
        # class so each message can be attributed, then keep sweeping.
        print(f'{model_name}: {e}')
    else:
        print(fitted)

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()




LogisticRegressionCV()
MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'


