### 교차 검증
- 부족한 데이터 셋 및 특정 데이터에 과대적합되는 문제를 해결하기 위한 방안.
- 학습 데이터셋을 비슷한 크기의 n개 폴드로 나눈 뒤, 매 반복마다 1개 폴드(1/n)는 검증용, 나머지 n-1개 폴드는 학습용으로 사용하며 이를 n번 반복.

[1] 모듈 로딩 및 데이터 준비 <hr> 

In [1]:
import numpy as np
from sklearn.model_selection import KFold
# Toy data set: 4 samples with 2 features each (the 2-row pattern repeated
# twice) and one integer target per sample.
X = np.array([[1, 2], [3, 4]] * 2)
y = np.arange(1, 5)

In [2]:
# Create a KFold instance => a splitter object that divides the data into 2 folds
k_fold = KFold(n_splits=2)

In [3]:
# Split the data. split() hands back a generator, so nothing is produced until
# it is iterated; each item is a (train_indices, test_indices) tuple.
dtas = k_fold.split(X, y)
for fold in dtas:
    print(fold)


# The first generator is used up by the loop above, so create a new one
# (this time without passing y) and unpack each pair while iterating.
dtas = k_fold.split(X)
for train_idx, test_idx in dtas:
    print(train_idx, test_idx)

(array([2, 3]), array([0, 1]))
(array([0, 1]), array([2, 3]))
[2 3] [0 1]
[0 1] [2 3]


In [4]:
## perch data: basic 5-way split (note: the file loaded here is perch3.csv)
import pandas as pd
from sklearn.linear_model import LogisticRegression
perchDF = pd.read_csv('../data/perch3.csv')
# Print the column names, dtypes and non-null counts of the loaded frame
perchDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Weight  56 non-null     float64
 1   Length  56 non-null     float64
 2   Height  56 non-null     float64
 3   Width   56 non-null     float64
dtypes: float64(4)
memory usage: 1.9 KB


In [5]:
# Split perchDF with an unshuffled KFold and report the size of each
# train/test index array.
# NOTE: the variable is named fold_5, but n_splits=3 here, so this cell
# produces 3 folds, not 5.
fold_5 = KFold(n_splits=3)
datasets = fold_5.split(perchDF)

for fold_no, (train_idx, test_idx) in enumerate(datasets):
    print(f'{fold_no} => {train_idx.shape} {test_idx.shape}')

0 => (37,) (19,)
1 => (37,) (19,)
2 => (38,) (18,)


In [6]:
# Split perchDF into 5 shuffled folds and print the row indices of each fold.
# (Author's note: KFold should not be used when the data is imbalanced.)
fold_5 = KFold(n_splits=5, shuffle=True)
datasets = fold_5.split(perchDF)

for fold_no, (train_idx, test_idx) in enumerate(datasets):
    # One blank line after every fold keeps the long index dumps readable.
    print(f'[{fold_no}] => {train_idx} {test_idx}', end='\n\n')


[0] => [ 0  1  2  3  4  5  6  8  9 10 12 14 15 16 17 18 19 21 22 23 24 25 28 29
 30 31 32 33 36 37 38 39 40 41 42 45 47 49 50 51 52 53 54 55] [ 7 11 13 20 26 27 34 35 43 44 46 48]

[1] => [ 0  1  2  3  4  7  8  9 10 11 12 13 14 15 16 18 20 21 22 23 24 25 26 27
 28 30 31 32 34 35 36 37 38 39 40 41 43 44 46 47 48 50 51 53 55] [ 5  6 17 19 29 33 42 45 49 52 54]

[2] => [ 1  2  3  4  5  6  7  9 11 12 13 14 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 31 32 33 34 35 36 38 39 40 42 43 44 45 46 48 49 52 53 54] [ 0  8 10 15 30 37 41 47 50 51 55]

[3] => [ 0  1  2  5  6  7  8  9 10 11 12 13 14 15 17 19 20 21 23 25 26 27 29 30
 31 33 34 35 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54 55] [ 3  4 16 18 22 24 28 32 36 38 53]

[4] => [ 0  3  4  5  6  7  8 10 11 13 15 16 17 18 19 20 22 24 26 27 28 29 30 32
 33 34 35 36 37 38 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55] [ 1  2  9 12 14 21 23 25 31 39 40]



In [7]:
## Case where the target is a class label (classification)
irisDF = pd.read_csv('../data/iris.csv')

In [8]:
# Display the first column of the iris frame (selected by position)
irisDF.iloc[:, 0]

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal.length, Length: 150, dtype: float64

In [9]:
# Unshuffled 3-fold splitter over the iris feature columns (every column
# except the last one)
k_fold = KFold(n_splits=3)
ret = k_fold.split(irisDF.iloc[:, :-1])

In [10]:
# Display the last column for the rows labelled 1, 2 and 3
irisDF.loc[[1, 2, 3], irisDF.columns[-1]]

1    Setosa
2    Setosa
3    Setosa
Name: variety, dtype: object

In [11]:
# for idx, (trn, tst) in enumerate(ret) :
#     print(f'{idx} : {trn} {tst}')
#     irisDF[irisDF.columns[:-1]]
#     print()

In [16]:
# Manual 3-fold cross-validation of a logistic-regression classifier on iris.
# For every fold: show the class proportions of the train/test parts, fit the
# model on the train part and record its accuracy on both parts.
trnScore3 = []  # training accuracy, exactly one entry per fold
tstScore3 = []  # validation accuracy, exactly one entry per fold
#for p_name in ['l1', 'l2', 'elasticnet'] :
k_fold = KFold(n_splits=3, shuffle=True)
ret = k_fold.split(irisDF[irisDF.columns[:-1]])

for trn, tst in ret:
    # Positional row indices of the training / test part of this fold
    trn_idx = trn.tolist()
    tst_idx = tst.tolist()

    # Pull out the rows that belong to each part
    trnDF = irisDF.iloc[trn_idx]
    tstDF = irisDF.iloc[tst_idx]

    # Class proportions per part: plain KFold does not stratify, so these
    # can differ noticeably between train and test.
    print(trnDF['variety'].value_counts()/trnDF.shape[0])
    print(tstDF['variety'].value_counts()/tstDF.shape[0])
    print('='*90)

    # Features are all columns but the last; the last column is the target
    X_train = trnDF[trnDF.columns[:-1]]
    y_train = trnDF[trnDF.columns[-1]]
    X_test = tstDF[tstDF.columns[:-1]]
    y_test = tstDF[tstDF.columns[-1]]

    # Fit the classifier on this fold's training part
    lg_mdl = LogisticRegression(max_iter=1000, solver='liblinear')
    lg_mdl.fit(X_train, y_train)

    # Accuracy on the training and validation parts. Each score is appended
    # once per fold (the training score used to be appended twice, which
    # doubled the list and inflated the later sum/3 average, while the
    # validation score was thrown away).
    trnScore = lg_mdl.score(X_train, y_train)
    tstScore = lg_mdl.score(X_test, y_test)
    trnScore3.append(trnScore)
    tstScore3.append(tstScore)

    # Predictions for this fold's test part; after the loop pre_y holds the
    # predictions of the last fold.
    pre_y = lg_mdl.predict(X_test)



variety
Versicolor    0.36
Virginica     0.34
Setosa        0.30
Name: count, dtype: float64
variety
Setosa        0.40
Virginica     0.32
Versicolor    0.28
Name: count, dtype: float64
variety
Versicolor    0.34
Virginica     0.34
Setosa        0.32
Name: count, dtype: float64
variety
Setosa        0.36
Versicolor    0.32
Virginica     0.32
Name: count, dtype: float64
variety
Setosa        0.38
Virginica     0.32
Versicolor    0.30
Name: count, dtype: float64
variety
Versicolor    0.40
Virginica     0.36
Setosa        0.24
Name: count, dtype: float64


In [13]:
# 문제 발생 코드들... ㅠㅠㅠ
   

 # trnDF = irisDF['variety'][trn_idx]
    # tstDF = irisDF['variety'][trn_idx]
    # print(trnDF.value_counts())
    #print(trnDF['variety'].value_counts()/trnDF.shape[0])
    #print(tstDF['variety'].value_counts()/tstDF.shape[0])

    # X_train = trnDF[trnDF.columns[:-1]]
    # y_train = trnDF[trnDF.columns[-1]]
    # X_test = tstDF[tstDF.columns[:-1]]
    # y_test = tstDF[tstDF.columns[-1]]

    # # 분류 모델 학습
    # lg_mdl = LogisticRegression(max_iter=1000, solver='liblinear')
    # lg_mdl.fit(X_train, y_train)

    # # 훈련 및 검증용 성능
    # trnScore = lg_mdl.score(X_train, y_train)
    # tstScore = lg_mdl.score(X_test, y_test)
    # trnScore3.append(trnScore)

In [14]:
# Mean training accuracy over the folds. Divide by the number of recorded
# scores rather than a hard-coded 3 so the average stays correct whenever the
# list does not hold exactly 3 entries.
sum(trnScore3) / len(trnScore3)

0.9899999999999999

In [15]:
#sklearn.model_selection.cross_val_score : 점수 평가.