# Iris
- 학습 데이터가 적을 때 활용되는 K-fold를 사용
- 2가지 방식의 모델(Multi-layer perceptron과 LinearRegression)로 수행

In [3]:
import os
import pandas as pd
import numpy as np

# Multi-layer perceptron 모델 구성
from sklearn.neural_network import MLPRegressor

# LinearRegression 모델 구성
from sklearn.linear_model import LinearRegression

# K-fold
from sklearn.model_selection import KFold

# Data

In [4]:
# Remember the directory the notebook runs from and echo it for reference.
cwd = os.getcwd()
cwd_label = 'Current Working Directory: '
print(cwd_label, cwd)

Current Working Directory:  C:\Users\User\Desktop\workspace\DACON\Iris


In [6]:
# Locate the training CSV under <cwd>/data and load it into a DataFrame.
train_path_parts = (cwd, 'data', 'iris_train.csv')
iris_path = os.path.join(*train_path_parts)
train_df = pd.read_csv(iris_path)

In [7]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,id,species,sepal length (cm),petal length (cm),sepal width (cm),petal width (cm)
0,0,setosa,4.4,1.4,2.9,0.2
1,1,versicolor,6.4,4.5,3.2,1.5
2,2,virginica,6.2,4.8,2.8,1.8
3,3,virginica,7.2,6.1,3.6,2.5
4,4,setosa,4.9,1.4,3.0,0.2


In [8]:
# Encode the species label (a string) as an integer code 0, 1 or 2.
# One-hot encoding would probably give better performance.
species_codes = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
train_df['species'] = train_df['species'].map(species_codes)

# Positional split: columns 1-3 are the model inputs, columns 4 onward are the targets.
X = train_df.iloc[:, 1:4]
Y = train_df.iloc[:, 4:]

# K-fold
- 데이터 양이 적다고 생각하여 K-fold 기법을 고려
- K-fold란 학습 데이터를 K개로 분할하여 K-1개로 학습하고 나머지 1개로 평가하는 과정을, 평가용 분할을 바꿔 가며 K번 반복하는 방법

In [10]:
from IPython.display import Image

# Render the Wikimedia diagram of K-fold cross-validation inline.
kfold_diagram_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/K-fold_cross_validation_EN.svg/521px-K-fold_cross_validation_EN.svg.png'
Image(url=kfold_diagram_url)

In [11]:
def MAE(true, pred):
    """Return the mean absolute error between ``true`` and ``pred``.

    The element-wise difference ``true - pred`` is taken, its absolute value
    computed, and the result averaged with ``np.mean`` over all elements.
    """
    abs_errors = np.abs(true - pred)
    return np.mean(abs_errors)


# Train one model per fold and record its validation error.
model_arr = []          # fitted model of each fold, in fold order
MAE_error_record = []   # validation MAE of the model at the same position in model_arr

split_count = 5  # number of folds

kf = KFold(n_splits=split_count)

# Convert to arrays once; the fold indices produced by kf.split are positional.
X_values, Y_values = X.values, Y.values

for train_index, test_index in kf.split(X):

    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = Y_values[train_index], Y_values[test_index]

    # Fit a linear regression on this fold's training part.
    # To check the multi-layer perceptron instead, swap in:
    #     created_model = MLPRegressor().fit(x_train, y_train)
    created_model = LinearRegression().fit(x_train, y_train)

    # Score the model on the held-out part of this fold.
    pred = created_model.predict(x_test)

    MAE_error_record.append(MAE(y_test, pred))
    model_arr.append(created_model)

In [12]:
# Print the per-fold models, then their validation MAE values.
for fold_summary in (model_arr, MAE_error_record):
    print(fold_summary)

[LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()]
[0.1980414907582519, 0.18687865136313336, 0.19352196334840513, 0.2173408938069676, 0.18507301564446058]


In [14]:
# One model was trained per fold; keep the one with the lowest validation MAE.
# np.argmin returns the first index of the minimum, so ties go to the earliest fold.
find_model_num = np.argmin(MAE_error_record)
print(f"Find model Num :  {find_model_num} ")

final_model = model_arr[find_model_num]

Find model Num :  4 


In [None]:
# Average the predictions of every fold model instead of relying on a single one.
# NOTE: x_result and y_result are assigned by the test-set prediction cell further
# down (In [16]); that cell has to be run before this one.
total_result = np.zeros_like(y_result)

for each_model in model_arr:

    # The fold models were fitted on NumPy arrays, so predict on the underlying array.
    each_result = each_model.predict(x_result.values)

    total_result += each_result

# Divide the summed predictions by the number of folds (one model per fold).
total_result /= split_count

In [16]:
# Predict the targets for the test set.

test_path = os.path.join(cwd, 'data', 'iris_test.csv')
result_df = pd.read_csv(test_path)
# Apply the same species -> integer encoding that was applied to the training data.
result_df['species'] = result_df['species'].map({'setosa':0, 'versicolor': 1, 'virginica': 2})

# Same positional feature columns (1-3) as the training inputs.
x_result = result_df.iloc[:,1:4]

# The fold models were fitted on plain NumPy arrays (X.values). Predict on the
# underlying array too, so scikit-learn does not warn that X carries feature
# names the model was fitted without.
y_result = final_model.predict(x_result.values)

In [18]:
# Start from the sample submission file and fill in the two predicted columns.
submission_path = os.path.join(cwd, 'data', 'sample_submission.csv')
submission = pd.read_csv(submission_path)
for target_idx, target_name in enumerate(('sepal width (cm)', 'petal width (cm)')):
    submission[target_name] = y_result[:, target_idx]

In [19]:
# Display the submission table with the predicted columns filled in.
submission

Unnamed: 0,id,sepal width (cm),petal width (cm)
0,0,3.417809,0.270480
1,1,3.673326,0.193732
2,2,3.438741,0.214456
3,3,3.162292,0.347227
4,4,2.858291,1.779679
...,...,...,...
70,70,2.898277,1.249416
71,71,2.990503,0.199879
72,72,2.715916,2.008955
73,73,2.885645,0.979658


In [22]:
# Save the submission to disk without the DataFrame index column.
submission_file = "kfold_submission.csv"
submission.to_csv(submission_file, index=False)

추가로, 본 코드에서는 MAE 값이 가장 작은 모델을 선정하였지만,

모든 모델의 predict 값을 낸 뒤 평균을 내는 방법도 있습니다.

In [24]:
# Average the predictions of every fold model instead of relying on a single one.
total_result = np.zeros_like(y_result)

for each_model in model_arr:

    # The fold models were fitted on NumPy arrays; predicting on the underlying
    # array avoids scikit-learn's feature-name mismatch warning on every call.
    each_result = each_model.predict(x_result.values)

    total_result += each_result

# Divide the summed predictions by the number of folds (one model per fold).
total_result /= split_count

In [26]:
# Build a second submission from the fold-averaged predictions (total_result),
# not from y_result, which holds the single best model's predictions.
submission_path = os.path.join(cwd, 'data', 'sample_submission.csv')
submission2 = pd.read_csv(submission_path)
submission2['sepal width (cm)'] = total_result[:,0]
submission2['petal width (cm)'] = total_result[:,1]

# submission2.to_csv("result2.csv", index=False)