In [1]:
# Layered checking process for a trustworthy validation
# cross_val_score : k-fold validation inside "our own class" (the training split)
# predicting for y_val : validation against "another class" (the held-out validation split)
# predicting for y_test : validation against "another school" (the separate test file)
# The model before hyperparameter tuning and the final tuned model are kept as separate
# objects so they can be compared. Once you are used to the workflow, skip the comparison
# and declare the tuned model directly.

# 1. 환경준비

In [2]:
# Load libraries
# NOTE(review): numpy, matplotlib and seaborn are not used in the cells visible here — confirm they are still needed.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below; consider a narrower filter.
warnings.filterwarnings('ignore')
# Ask the inline plotting backend for the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

# 2. 데이터 탐색

In [3]:
# Load the training data (CSV read from a shortened URL)
path = 'https://bit.ly/InsuranceTrainFile'
data1 = pd.read_csv(path)

In [4]:
# Preview the first rows. charges : medical cost (used as the prediction target further down)
data1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,41,female,31.6,0,no,southwest,6186.127
1,30,male,25.46,0,no,northeast,3645.0894
2,18,female,30.115,0,no,northeast,21344.8467
3,61,female,29.92,3,yes,southeast,30942.1918
4,34,female,27.5,1,no,southwest,5003.853


In [5]:
# Load the test data (CSV read from a shortened URL); `path` is reused and now points at the test file
path = 'https://bit.ly/InsuranceTestFile'
data2 = pd.read_csv(path)

# Preview the first rows of the test data
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


# 3. 데이터 준비/전처리

In [6]:
# One-hot (dummy) encode the categorical columns of the training data.
dumm_cols = ['sex','smoker','region']
# `data1` is overwritten with its encoded version, so on a second run of this cell the
# original categorical columns no longer exist and get_dummies would raise KeyError.
# Encoding only the columns that are still present makes a re-run a harmless no-op.
cols_to_encode = [col for col in dumm_cols if col in data1.columns]
if cols_to_encode:
    # drop_first=True drops one level per variable (the reference category) to avoid
    # redundant, perfectly collinear dummy columns.
    data1 = pd.get_dummies(data1, columns=cols_to_encode, drop_first=True)
data1

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,41,31.600,0,6186.1270,0,0,0,0,1
1,30,25.460,0,3645.0894,1,0,0,0,0
2,18,30.115,0,21344.8467,0,0,0,0,0
3,61,29.920,3,30942.1918,0,1,0,1,0
4,34,27.500,1,5003.8530,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1233,50,30.970,3,10600.5483,1,0,1,0,0
1234,18,31.920,0,2205.9808,0,0,0,0,0
1235,18,36.850,0,1629.8335,0,0,0,1,0
1236,21,25.800,0,2007.9450,0,0,0,0,1


In [7]:
# Separate the target column (y) from the feature columns (X)
target = 'charges'
y = data1[target]
X = data1.drop(columns=[target])

# Check the feature frame
X.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,41,31.6,0,0,0,0,0,1
1,30,25.46,0,1,0,0,0,0
2,18,30.115,0,0,0,0,0,0
3,61,29.92,3,0,1,0,1,0
4,34,27.5,1,0,0,0,0,1


In [8]:
# Split into training and evaluation sets (named "val" because a real test set exists separately)
from sklearn.model_selection import train_test_split
# 20% of the rows are held out for validation; random_state fixes the shuffle so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2022)

# Check
X_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
660,22,34.580,2,0,0,0,0,0
765,40,29.900,2,1,0,0,0,1
264,21,21.890,2,0,0,0,1,0
1009,45,20.350,3,1,0,0,1,0
123,19,34.800,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...
951,64,26.410,0,1,0,0,0,0
240,24,27.600,0,0,0,0,0,1
624,50,27.075,1,0,0,0,0,0
173,50,27.455,1,1,0,0,0,0


# 4. 모델링

In [9]:
# Imports: the tree regressor and the two evaluation metrics used below
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [10]:
# Declare the baseline (untuned) decision tree; only the random seed is set
model_dt = DecisionTreeRegressor(random_state=2022)

In [11]:
# Fit the baseline tree on the training split
model_dt.fit(X_train, y_train)

In [12]:
# Predict on the validation split with the baseline tree
y_val_pred = model_dt.predict(X_val)

In [13]:
# Evaluate the validation predictions: mean absolute error first, then R^2
for metric in (mean_absolute_error, r2_score):
    print(metric(y_val, y_val_pred))

2815.9559071935487
0.7402200651887789


# 5. 일반화된 성능

In [14]:
# Import
from sklearn.model_selection import cross_val_score

# Estimate performance with 5-fold cross-validation on the training split only.
# No `scoring` argument is passed; the mean printed below matches the scoring='r2'
# grid-search value reported for deep trees later in this notebook, so these appear to be R^2 scores.
cv_score = cross_val_score(model_dt, X_train, y_train, cv=5)

# Results: per-fold scores, then their mean (the printed label means "mean performance")
print(cv_score)
print('평균 성능:',cv_score.mean())

[0.76717473 0.69912849 0.56154583 0.69504255 0.69270192]
평균 성능: 0.6831187034638899


# 6. 성능 튜닝

In [15]:
from sklearn.model_selection import GridSearchCV
# Candidate tree depths 1 through 50 (the end of range() is exclusive)
params = {'max_depth':range(1,51)}

# Grid search over max_depth around the baseline estimator: 5-fold CV, scored by R^2
model = GridSearchCV(model_dt,
                     params,
                     cv=5,
                     scoring='r2')

In [16]:
# Run the grid search on the training split
model.fit(X_train,y_train)

In [27]:
# Mean cross-validation score of every candidate, in grid order (max_depth=1 first)
print(model.cv_results_['mean_test_score'])
# Best parameter setting found and its mean cross-validation score
print(model.best_params_)
print(model.best_score_)

[0.62067274 0.80447825 0.82705796 0.82884181 0.81032873 0.80291274
 0.77028028 0.74101918 0.71887263 0.70505848 0.70082947 0.69648455
 0.67985764 0.67155479 0.67172779 0.68333931 0.68210962 0.678723
 0.68194033 0.6803748  0.68267447 0.68039856 0.6831187  0.6831187
 0.6831187  0.6831187  0.6831187  0.6831187  0.6831187  0.6831187
 0.6831187  0.6831187  0.6831187  0.6831187  0.6831187  0.6831187
 0.6831187  0.6831187  0.6831187  0.6831187  0.6831187  0.6831187
 0.6831187  0.6831187  0.6831187  0.6831187  0.6831187  0.6831187
 0.6831187  0.6831187 ]
{'max_depth': 4}
0.8288418138536089


In [18]:
# Check the tuned model's performance on the validation split (X_val)
# NOTE(review): this overwrites `y_val_pred`, which previously held the baseline model's
# predictions — a distinct name would keep both available for comparison.
y_val_pred = model.predict(X_val)
print(r2_score(y_val,y_val_pred))

0.8573316318890312


# 7. 최종 평가

In [19]:
# Inspect the test data before preprocessing
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [20]:
# One-hot encode the test data so that its columns match the features the model was trained on.
#
# Problem addressed here: what if the test data contains category values that never
# appeared in the training data (or lacks some that did)? Encoding the test file on its own
# would then produce different columns than the training features and break predict().
dumm_cols = ['sex','smoker','region']
# `data2` is overwritten below, so only encode columns that are still present; this makes
# re-running the cell a no-op instead of a KeyError.
cols_to_encode = [col for col in dumm_cols if col in data2.columns]
if cols_to_encode:
    # No drop_first here: which level it drops depends on the values present in *this* file.
    # Encode every level and let the reindex below keep exactly the training columns.
    data2 = pd.get_dummies(data2, columns=cols_to_encode)
# Align with the training feature columns (X):
#   - reference-level dummies and levels unseen in training are dropped, so such rows
#     fall back to the reference category (all zeros),
#   - training levels missing from the test file are added as all-zero columns,
#   - column order follows X.
data2 = data2.reindex(columns=X.columns, fill_value=0)

# Check
data2.head()


Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [21]:
# Predict on the test data with the fitted grid-search model
# X_test is the already-encoded test frame (data2 was dummy-encoded in the previous step)
X_test = data2
y_pred = model.predict(X_test)

In [22]:
# Peek at the first ten predicted charges
y_pred[:10]

array([16441.87781917,  6036.3648442 ,  7713.83146267,  3388.49691576,
        3388.49691576,  3388.49691576,  9334.14831629,  7713.83146267,
        7713.83146267, 14819.39120143])

In [23]:
# Build the submission frame: re-read the raw test file (data2 has been overwritten with its
# encoded version by now) and attach the predictions as a new column
path = 'https://bit.ly/InsuranceTestFile'
final = pd.read_csv(path).assign(charges_pred=y_pred)

# Check
final.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges_pred
0,19,female,27.9,0,yes,southwest,16441.877819
1,18,male,33.77,1,no,southeast,6036.364844
2,28,male,33.0,3,no,southeast,7713.831463
3,33,male,22.705,0,no,northwest,3388.496916
4,32,male,28.88,0,no,northwest,3388.496916


In [24]:
# Save the predictions as an Excel workbook.
# Written as .xlsx rather than legacy .xls: pandas 2.0 removed the xlwt-based .xls writer,
# so a '.xls' target fails there, whereas '.xlsx' is written through the openpyxl engine.
final.to_excel('InsurancePredict.xlsx')