### 보스턴 집값 예측 모델
- 데이터셋 : boston.csv
- 학습방법 : 지도학습 => 회귀
- 피쳐/독립 : 13개
- 타겟/종속 : 1개

[1] 데이터 준비

In [1]:
# 모듈 로딩
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib

Using matplotlib backend: <object object at 0x0000013F95A5B140>


In [2]:
# Relative path to the Boston housing CSV file
FILE_PATH = '../Data/boston.csv'

In [3]:
# Load the CSV into a DataFrame and preview the first 3 rows
bostonDF = pd.read_csv(FILE_PATH)
bostonDF.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7


In [4]:
# Basic information: summary statistics for every column
bostonDF.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


[2] 전처리
- [2-1] 데이터 정제


##### 결측치, 이상치, 컬럼별 고유값 추출로 이상 데이터 체크

In [5]:
# Basic information (2): data type of every column
bostonDF.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

In [6]:
# Total count of missing values across all columns
bostonDF.isnull().sum().sum()

0

- [2-2] 표준화 & 정규화 ===> 적용 여부에 따른 성능 변화는 데이터와 모델에 따라 다름!!
    - 정규분포 데이터셋을 기반으로 한 모델 ==> StandardScaler, MinMaxScaler, Log 변환
    - 피쳐의 값의 범위 차이를 줄이기 ==> 피쳐 스케일링, MinMaxScaler, RobustScaler, ...
    - 범주형 피처 ==> 수치화 인코딩 OneHotEncoder, OrdinalEncoder
    - 문자열 타겟 ==> 정수 라벨인코딩 LabelEncoder

- [2-3] 피쳐와 타겟 분리

In [7]:
# Select both parts by column name: the features are everything except
# 'MEDV', so the target cannot slip into the feature matrix even if the
# column order of the CSV changes.
featureDF = bostonDF.drop(columns='MEDV')
targetSR = bostonDF['MEDV']

In [8]:
# Sanity-check the shapes of the feature matrix and the target vector
print(f'featureDF.shape: {featureDF.shape}, targetSR.shape: {targetSR.shape}')

featureDF.shape: (506, 13), targetSR.shape: (506,)


[3] 학습준비

- [3-1] 학습용 데이터셋과 테스트용 데이터셋 분리

In [9]:
# Split into training and test sets. No test_size is given, so the library
# default applies; random_state=10 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(featureDF, targetSR, random_state=10)

In [10]:
# Report shape and number of dimensions of each split
print(f'x_train: {x_train.shape}, {x_train.ndim}D')
print(f'y_train: {y_train.shape}, {y_train.ndim}D')
print(f'x_test: {x_test.shape}, {x_test.ndim}D')
print(f'y_test: {y_test.shape}, {y_test.ndim}D')

x_train: (379, 13), 2D
y_train: (379,), 1D
x_test: (127, 13), 2D
y_test: (127,), 1D


- [3-2] 학습용 데이터셋으로 스케일러 생성

In [11]:
### - Numeric feature ranges differ widely ==> apply scaling
# The scaler is fitted on the training split only.
sdScaler = StandardScaler()
sdScaler.fit(x_train)

In [12]:
# Apply the scaler fitted on x_train to both splits, so the test data is
# transformed with the training statistics.
x_train_scaled = sdScaler.transform(x_train)
x_test_scaled = sdScaler.transform(x_test)

[4] 학습진행 ==> 교차검증으로 진행

In [13]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

In [14]:
### Hyper-parameter tuning: compare several L2 penalty strengths (alpha)
# alpha=0. applies no penalty and serves as the unregularized baseline.
alpha_value = [0., 1., 10, 100]


for value in alpha_value:

    # Build the model for this alpha (scikit-learn's default alpha is 1.0).
    # NOTE(review): max_iter=3 presumably has no effect with the default
    # solver on this data - the grid-search table later in this notebook
    # reports identical scores for max_iter 3 and 5. Confirm before relying
    # on it.
    ridge_model = Ridge(alpha=value, max_iter=3)

    # 3-fold cross-validation on the scaled training data
    # - cv: 3 folds
    # - scoring: 'neg_mean_squared_error', 'r2'
    # - return_train_score: also score the training part of each fold
    # - return_estimator: keep the fitted model of each fold
    result = cross_validate(ridge_model,
                            x_train_scaled, y_train,
                            cv=3,
                            scoring=['neg_mean_squared_error', 'r2'],
                            return_train_score=True,
                            return_estimator=True)

    # Keep only the R^2 columns and add the absolute train/test gap per fold.
    # Rows stay in fold order (0, 1, 2); nothing is sorted.
    resultDF = pd.DataFrame(result)[['test_r2', 'train_r2']]
    resultDF['diff'] = (resultDF['test_r2'] - resultDF['train_r2']).abs()

    print(f'Ridge(alpha: [{value}])')
    # Coefficients of the model fitted on the first fold only
    print(result['estimator'][0].coef_)
    print(resultDF, end='\n\n')

Ridge(alpha: [0.0])
[-1.41407793  1.56590993  0.15536906  0.65522098 -2.36200159  2.31948624
  0.1173831  -3.59071105  2.71475429 -2.33252925 -1.88390034  1.04036915
 -3.50250877]
    test_r2  train_r2      diff
0  0.747022  0.755720  0.008699
1  0.756482  0.740082  0.016400
2  0.680801  0.786156  0.105355

Ridge(alpha: [1.0])
[-1.39035961  1.53043843  0.11109741  0.6621853  -2.29024619  2.34249774
  0.10030677 -3.52062389  2.57481444 -2.20749462 -1.86406784  1.03607796
 -3.48102887]
    test_r2  train_r2      diff
0  0.748283  0.755663  0.007380
1  0.756292  0.740039  0.016253
2  0.680991  0.786097  0.105106

Ridge(alpha: [10])
[-1.23221033  1.29302258 -0.12737786  0.70280521 -1.80949922  2.48028701
 -0.00860666 -2.99831755  1.75466332 -1.51704375 -1.73434856  1.00368486
 -3.30809117]
    test_r2  train_r2      diff
0  0.753103  0.752474  0.000629
1  0.755100  0.737457  0.017643
2  0.677471  0.783225  0.105755

Ridge(alpha: [100])
[-0.78141029  0.70910255 -0.46407849  0.72503917 -0.69

In [15]:
# `result` still holds the cross_validate output of the last loop iteration
result

{'fit_time': array([0.        , 0.0010016 , 0.00124049]),
 'score_time': array([0.01037002, 0.00100064, 0.00030708]),
 'estimator': [Ridge(alpha=100, max_iter=3),
  Ridge(alpha=100, max_iter=3),
  Ridge(alpha=100, max_iter=3)],
 'test_neg_mean_squared_error': array([-18.98866561, -25.39020199, -26.46849737]),
 'train_neg_mean_squared_error': array([-24.05087942, -21.95230964, -20.66056325]),
 'test_r2': array([0.72403598, 0.72599269, 0.62733505]),
 'train_r2': array([0.70826865, 0.68662795, 0.74445235])}

In [16]:
# Turn the full cross_validate result dict into a DataFrame (one row per fold)
resultDF=pd.DataFrame(result)
resultDF.head()

Unnamed: 0,fit_time,score_time,estimator,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.0,0.01037,"Ridge(alpha=100, max_iter=3)",-18.988666,-24.050879,0.724036,0.708269
1,0.001002,0.001001,"Ridge(alpha=100, max_iter=3)",-25.390202,-21.95231,0.725993,0.686628
2,0.00124,0.000307,"Ridge(alpha=100, max_iter=3)",-26.468497,-20.660563,0.627335,0.744452


- 하이퍼파라미터 튜닝과 교차 검증을 동시에 진행

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Hyper-parameter grid for Ridge
params = {'alpha':[0., 0.1, 0.5, 1.0],
          'max_iter':[3, 5]}

# alpha: 0., max_iter: 3 => model
# alpha: 0., max_iter: 5 => model
# ...
# 4 alpha values * 2 max_iter values
# => 8 Ridge candidates in total

In [19]:
# Base estimator; its hyper-parameters are filled in from the grid
rmodel = Ridge()

# Grid search over `params` with 3-fold cross-validation.
# No scoring argument is passed; return_train_score=True additionally
# records the training scores of every fold.
searchGV = GridSearchCV(rmodel, params, cv=3, verbose=True, return_train_score=True)

In [20]:
# Run the search on the scaled training data
searchGV.fit(x_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [21]:
# After fit(): inspect the best hyper-parameter combination found
searchGV.best_params_

{'alpha': 1.0, 'max_iter': 3}

In [24]:
# Keep a handle on the estimator selected by the grid search
bestmodel = searchGV.best_estimator_
bestmodel

In [28]:
# Full per-candidate search results as a DataFrame (one row per parameter
# combination); note this rebinds `resultDF`.
resultDF = pd.DataFrame(searchGV.cv_results_)
resultDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001009,0.000827,0.001542,0.001082939,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
1,0.000669,0.000473,0.000999,6.743496e-07,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.747022,0.756482,0.680801,0.728101,0.033669,7,0.75572,0.740082,0.786156,0.760653,0.019131
2,0.001001,1e-06,0.000665,0.0004705286,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
3,0.000667,0.000471,0.000333,0.0004713704,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.747159,0.756462,0.680831,0.728151,0.033675,5,0.75572,0.740081,0.786156,0.760652,0.019131
4,0.000667,0.000471,0.000678,0.0004793781,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
5,0.00102,2.1e-05,0.000335,0.0004739554,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.747682,0.756385,0.680927,0.728331,0.033708,3,0.755705,0.74007,0.786141,0.760639,0.019129
6,0.001014,7.8e-05,0.0,0.0,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
7,0.000669,0.000473,0.00035,0.0004947478,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.748283,0.756292,0.680991,0.728522,0.033768,1,0.755663,0.740039,0.786097,0.7606,0.019124
