### 교차 검증 단순화
- scikit-learn의 model_selection 모듈 내에 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반 검증 결과 처리

[1] 모듈 로딩 및 데이터 준비 <hr>

In [51]:
import pandas as pd

# Load the fish dataset; the first CSV row supplies the column names.
fishDF = pd.read_csv('../data/fish.csv')

# Load the iris dataset; header=None because the file has no header row,
# so the columns are addressed by position below.
irisDF = pd.read_csv('../data/iris.csv', header=None)

In [52]:
# Preview the first two rows of the fish data to check the column layout.
fishDF.head(2)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056


[2] 데이터 준비 => 피처 & 타겟 분리 <hr>

In [53]:
# Target: Weight (column position 1).
# Features: Length, Diagonal, Height, Width (column positions 2 onward).
fish_targetSR = fishDF.iloc[:, 1]
fish_featureDF = fishDF.iloc[:, 2:]

In [54]:
# Target: column position 4. Features: column positions 0 through 3.
iris_targetSR = irisDF.iloc[:, 4]
iris_featureDF = irisDF.iloc[:, :4]

# Show the first two feature rows as the cell output.
iris_featureDF.head(2)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2


[3] 데이터 전처리 <hr>
- 피처 스케일링

In [55]:
# 학습용 테스트용 데이터 분리
from sklearn.model_selection import train_test_split

# Fish data -> 80/20 train/test split (used for the regression task).
fish_X_train, fish_X_test, fish_y_train, fish_y_test = train_test_split(
    fish_featureDF,
    fish_targetSR,
    test_size=0.2,
    random_state=5,
)

# Iris data -> 80/20 train/test split (used for the classification task),
# stratified on the target so both splits keep the same class proportions.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_featureDF,
    iris_targetSR,
    test_size=0.2,
    random_state=5,
    stratify=iris_targetSR,
)

In [56]:
from sklearn.preprocessing import StandardScaler

# Standardize the fish features: the scaling statistics are learned from the
# training split only and then applied to both splits.
fish_scaler = StandardScaler().fit(fish_X_train)

scaled_fish_X_train = fish_scaler.transform(fish_X_train)
scaled_fish_X_test = fish_scaler.transform(fish_X_test)

In [57]:
# Standardize the iris features: the scaling statistics are learned from the
# training split only and then applied to both splits.
iris_scaler = StandardScaler().fit(iris_X_train)

scaled_iris_X_train = iris_scaler.transform(iris_X_train)
scaled_iris_X_test = iris_scaler.transform(iris_X_test)

[4] 학습 <hr>
[4-1] 생선 무게 예측 모델

In [58]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

In [59]:
# Set up for training via cross-validation.
# => Needed: a model instance, the training features, and the training labels.
lr_model = LinearRegression()

In [60]:
# Cross-validate the linear model on the scaled fish training data and keep
# every per-fold result: train and validation scores for both metrics, plus
# the estimator fitted on each fold.
# NOTE(review): scaled_fish_X_train comes from a scaler fitted on the whole
# training set, so each validation fold has already influenced the scaling
# statistics; wrapping scaler + model in a Pipeline would avoid that.
result = cross_validate(
    lr_model,
    scaled_fish_X_train,
    fish_y_train,
    cv=5,  # explicit 5 folds; matches the five entries per key in the output
    scoring=('r2', 'neg_mean_squared_error'),
    return_train_score=True,
    return_estimator=True,
)

print(result)

{'fit_time': array([0.00105572, 0.00100088, 0.00103784, 0.00104713, 0.00109315]), 'score_time': array([0.00212836, 0.        , 0.00103021, 0.        , 0.0010004 ]), 'estimator': [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])}


In [61]:
# List the entries returned by cross_validate (timings, estimators, scores).
result.keys()

dict_keys(['fit_time', 'score_time', 'estimator', 'test_r2', 'train_r2', 'test_neg_mean_squared_error', 'train_neg_mean_squared_error'])

In [62]:
# Show the per-fold values stored under each key.
result.values()

dict_values([array([0.00105572, 0.00100088, 0.00103784, 0.00104713, 0.00109315]), array([0.00212836, 0.        , 0.00103021, 0.        , 0.0010004 ]), [LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()], array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]), array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]), array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
       -39450.52608702]), array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
       -10586.01039978])])

In [63]:
# Tabulate the cross-validation results: one row per fold, one column per key.
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.001056,0.002128,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.001001,0.0,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.001038,0.00103,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.001047,0.0,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.001093,0.001,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [64]:
# Select the fold estimator with the highest validation R^2 rather than
# hard-coding fold 0, so the choice stays correct if the data or folds change.
best_model = resultDF.loc[resultDF['test_r2'].idxmax(), 'estimator']

# Show the fitted coefficients and intercept of the selected model.
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [65]:
### Extract only the per-fold validation scores from cross-validation.
# No scoring argument is given; the values shown match the 'test_r2' column above.
cross_val_score(lr_model, scaled_fish_X_train, fish_y_train)

array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905])