### 보스턴 집값 예측 모델
- datasets: boston.csv
- learning method: supervised learning, regression
- feature len: 13
- label len: 1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split

In [2]:
# Relative path to the Boston housing CSV that is read into data_df
FILE_PATH = '../data/boston.csv'

In [3]:
# Read the CSV into a DataFrame and preview the first rows
data_df = pd.read_csv(FILE_PATH)
data_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [4]:
# Column dtypes and non-null counts: a quick check for missing values
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


- preprocessing
    - data processing: missing value, duplicate value, outlier --> column별로 고유값 확인
    - 표준화와 정규화 (scaling): case by case
        - 정규분포 datasets을 기반으로 한 모델 --> StandardScaler, Log transform
        - feature scaling --> MinMaxScaler, RobustScaler, ...
        - categorical feature --> numeric encoding, one-hot-encoding, ordinal-encoding
        - string label --> label-encoding

In [5]:
# numerical feature scaling ..?

In [7]:
# Separate the target from the predictors.
# MEDV is the label; the remaining 13 columns are the features.
label_sr = data_df['MEDV']
feature_df = data_df.drop(columns='MEDV')

In [8]:
print(f"feature_df: {feature_df.shape}, label_sr: {label_sr.shape}")

feature_df: (506, 13), label_sr: (506,)


In [12]:
# Hold out 30% of the rows as a test set (train : test = 7 : 3).
# random_state is fixed at 51 so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    label_sr,
    test_size=0.3,
    random_state=51,
)

In [13]:
# Confirm that features and labels have matching row counts within each split
print(f"X_train: {X_train.shape}, y_train:{y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (354, 13), y_train:(354,)
X_test: (152, 13), y_test: (152,)


In [14]:
# Share of all rows that landed in each split, as a percentage
print(f"X_train: {len(X_train)/len(feature_df)*100:.2f} %")
print(f"X_test: {len(X_test)/len(feature_df)*100:.2f} %")

X_train: 69.96 %
X_test: 30.04 %


In [15]:
# Standard scaling: learn the per-feature statistics from the training split only.
# StandardScaler.fit ignores y, so only the feature matrix is passed.
ss_scaler = StandardScaler()
ss_scaler.fit(X=X_train)

In [16]:
# Apply the scaler fitted on X_train to both splits, so the test split is
# transformed with the training statistics rather than its own
X_train_scaled = ss_scaler.transform(X_train)
X_test_scaled = ss_scaler.transform(X_test)

In [18]:
# cross validation
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Ridge

In [19]:
# Baseline Ridge regressor used for the first cross-validation run
ridge_model = Ridge(alpha=1.0)      # alpha=1.0 is also the library default

In [21]:
# Baseline evaluation: 3-fold cross-validation on the scaled training data.
# Scorers are the negated MSE ('neg_mean_squared_error') and R2; training scores
# and the fitted per-fold estimators are returned next to the validation scores.
# NOTE(review): X_train_scaled was standardised with statistics from all of X_train,
# so every validation fold has influenced the scaling. Wrapping the scaler and the
# model in a Pipeline would keep the folds independent.
result = cross_validate(
    ridge_model,
    X_train_scaled,
    y_train,
    cv=3,
    scoring=['neg_mean_squared_error', 'r2'],
    return_train_score=True,
    return_estimator=True,
)

In [23]:
# Tabulate the cross-validation output: one row per fold
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,fit_time,score_time,estimator,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2
0,0.000997,0.000997,Ridge(),-16.593098,-25.025887,0.792891,0.684535
1,0.001995,0.000997,Ridge(),-36.228944,-16.715061,0.523944,0.794444
2,0.0,0.001995,Ridge(),-25.340546,-20.719049,0.692636,0.735358


In [47]:
# Hyperparameter tuning: compare Ridge regularisation strengths with 3-fold CV.
# Prints the per-fold R2 scores for each alpha and gathers every fold of every
# alpha into result_ (same columns as the cross_validate output).
# NOTE: alpha=0 is plain least squares; scikit-learn advises LinearRegression for that case.
alpha_value = [0., 1., 10, 100]
fold_frames = []    # one cross_validate frame per alpha, concatenated once after the loop

for value in alpha_value:
    # NOTE(review): max_iter=3 does not appear to change anything here. The alpha=1.0
    # fold scores are identical to the earlier run that used Ridge(alpha=1.0) without
    # max_iter; presumably the default solver is not iterative for this data.
    ridge_model = Ridge(alpha=value, max_iter=3)

    # 3-fold CV scored with negated MSE and R2; keep train scores and fitted estimators
    result = cross_validate(ridge_model, X_train_scaled, y_train, cv=3, return_train_score=True,
                            scoring=['neg_mean_squared_error', 'r2'], return_estimator=True)

    # Build the per-fold frame once: keep the full frame, print only the R2 columns
    fold_frame = pd.DataFrame(result)
    fold_frames.append(fold_frame)

    result_df = fold_frame[['test_r2', 'train_r2']]
    print(f"[Ridge(alpha={value})]")
    print(result_df)
    print()

# A single concat avoids growing a frame inside the loop and avoids concatenating onto
# an empty, untyped DataFrame (deprecated in recent pandas; can alter result dtypes)
result_ = pd.concat(fold_frames, ignore_index=True)

[Ridge(alpha=0.0)]
    test_r2  train_r2
0  0.789884  0.684607
1  0.524855  0.794461
2  0.690570  0.735447

[Ridge(alpha=1.0)]
    test_r2  train_r2
0  0.792891  0.684535
1  0.523944  0.794444
2  0.692636  0.735358

[Ridge(alpha=10)]
    test_r2  train_r2
0  0.810280  0.680529
1  0.518670  0.793405
2  0.699725  0.731358

[Ridge(alpha=100)]
    test_r2  train_r2
0  0.800240  0.622405
1  0.499208  0.761623
2  0.661604  0.683541



In [49]:
# Express the R2 scores as percentages and add the absolute train/validation gap.
# The table is written to a new frame so result_ keeps the raw scores: re-running this
# cell yields the same numbers instead of multiplying already-scaled columns by 100 again.
result_pct = result_.assign(
    # keyword order matters: the gap is computed first, from the raw (unscaled) scores
    diff=lambda scores: (scores['test_r2'] - scores['train_r2']).abs().mul(100).round(2),
    test_r2=lambda scores: scores['test_r2'].mul(100).round(2),
    train_r2=lambda scores: scores['train_r2'].mul(100).round(2),
)
result_pct

Unnamed: 0,fit_time,score_time,estimator,test_neg_mean_squared_error,train_neg_mean_squared_error,test_r2,train_r2,diff
0,0.000998,0.000997,"Ridge(alpha=0.0, max_iter=3)",-16.833944,-25.020214,78.99,68.46,10.53
1,0.001995,0.000997,"Ridge(alpha=0.0, max_iter=3)",-36.159634,-16.713663,52.49,79.45,26.96
2,0.000997,0.001995,"Ridge(alpha=0.0, max_iter=3)",-25.510888,-20.712109,69.06,73.54,4.49
3,0.000993,0.001996,Ridge(max_iter=3),-16.593098,-25.025887,79.29,68.45,10.84
4,0.001996,0.000997,Ridge(max_iter=3),-36.228944,-16.715061,52.39,79.44,27.05
5,0.001031,0.000964,Ridge(max_iter=3),-25.340546,-20.719049,69.26,73.54,4.27
6,0.001996,0.001029,"Ridge(alpha=10, max_iter=3)",-15.199925,-25.343724,81.03,68.05,12.98
7,0.002,0.000992,"Ridge(alpha=10, max_iter=3)",-36.630302,-16.799524,51.87,79.34,27.47
8,0.000996,0.000998,"Ridge(alpha=10, max_iter=3)",-24.756121,-21.032221,69.97,73.14,3.16
9,0.001994,0.000997,"Ridge(alpha=100, max_iter=3)",-16.004238,-29.954733,80.02,62.24,17.78


In [59]:
# Hyperparameter tuning: for every alpha, run 3-fold CV and report the fold whose
# train and validation R2 are closest, together with that fold's fitted coefficients.
alpha_value = [0., 1., 10, 100]

for value in alpha_value:
    ridge_model = Ridge(alpha=value)

    # 3-fold CV scored with negated MSE and R2; keep train scores and fitted estimators
    result = cross_validate(
        ridge_model,
        X_train_scaled,
        y_train,
        cv=3,
        scoring=['neg_mean_squared_error', 'r2'],
        return_train_score=True,
        return_estimator=True,
    )

    # Per-fold R2 scores plus the absolute train/validation gap
    result_df = pd.DataFrame(result)[['test_r2', 'train_r2']].assign(
        diff=lambda scores: (scores['test_r2'] - scores['train_r2']).abs()
    )
    best_idx = result_df['diff'].idxmin()   # fold with the smallest gap
    closest_fold_model = result['estimator'][best_idx]

    print(closest_fold_model.coef_)
    print()
    print(f"[Ridge(alpha= {value})]")
    print(result_df.loc[best_idx])
    print()

[-0.48622674  0.48678462  0.55637027 -0.04260264 -2.71061933  2.35505526
  0.92792886 -2.68960025  2.90111968 -2.42614004 -2.83645925  0.70446804
 -3.86478203]

[Ridge(alpha= 0.0)]
test_r2     0.690570
train_r2    0.735447
diff        0.044877
Name: 2, dtype: float64

[-0.46624767  0.45406808  0.47303036 -0.03122467 -2.62365022  2.38032536
  0.89238014 -2.63847415  2.69740292 -2.22818112 -2.79293772  0.70782619
 -3.83101462]

[Ridge(alpha= 1.0)]
test_r2     0.692636
train_r2    0.735358
diff        0.042722
Name: 2, dtype: float64

[-0.36433854  0.2884344   0.04127924  0.02012814 -2.04943569  2.51999092
  0.67701941 -2.26129538  1.65912936 -1.2940466  -2.50248972  0.72510452
 -3.58324676]

[Ridge(alpha= 10)]
test_r2     0.699725
train_r2    0.731358
diff        0.031633
Name: 2, dtype: float64

[-0.34109857  0.15206636 -0.49736181  0.02653052 -0.71497762  2.40655631
  0.16502419 -0.97460068  0.19628207 -0.48197912 -1.61913857  0.64852658
 -2.47430413]

[Ridge(alpha= 100)]
test_r2     0

- Hyperparameter Tuning & Cross Validation

In [60]:
from sklearn.model_selection import GridSearchCV

In [65]:
# Ridge hyperparameter grid for the search.
# 4 alpha values x 2 max_iter values = 8 candidate models, one per combination.
params = {
    'alpha': [0., 0.1, 0.5, 1.0],
    'max_iter': [3, 5],
}


In [67]:
# Build the grid search: 3-fold CV over every combination in params.
# verbose prints progress; return_train_score also records the training scores.
r_model = Ridge()
search_cv = GridSearchCV(
    estimator=r_model,
    param_grid=params,
    cv=3,
    verbose=True,
    return_train_score=True,
)

In [68]:
# Run the search: fit every parameter combination with cross-validation
# on the scaled training data
search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [69]:
# Inspect the attributes that become available once the search has been fitted
(
    search_cv.best_estimator_,
    search_cv.best_index_,
    search_cv.best_params_,
    search_cv.multimetric_,
    search_cv.n_features_in_,
    search_cv.n_splits_,
)

(Ridge(max_iter=3),
 6,
 {'alpha': 1.0, 'max_iter': 3},
 {'alpha': 1.0, 'max_iter': 3},
 False,
 13,
 3)

In [73]:
# 등등 많은 속성들이 생김
# 한번에 다보는 방법

In [71]:
# View all search results at once: one row per parameter combination
result_df = pd.DataFrame(search_cv.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.001333,0.0004661134,0.000662,0.0004679041,0.0,3,"{'alpha': 0.0, 'max_iter': 3}",0.789884,0.524855,0.69057,0.668436,0.109324,7,0.684607,0.794461,0.735447,0.738172,0.044889
1,0.000998,3.371748e-07,0.000997,8.485379e-07,0.0,5,"{'alpha': 0.0, 'max_iter': 5}",0.789884,0.524855,0.69057,0.668436,0.109324,7,0.684607,0.794461,0.735447,0.738172,0.044889
2,0.000998,2.247832e-07,0.000666,0.0004706406,0.1,3,"{'alpha': 0.1, 'max_iter': 3}",0.790199,0.524759,0.6908,0.668586,0.109498,5,0.684606,0.794461,0.735446,0.738171,0.044889
3,0.000998,1.94668e-07,0.000332,0.000470134,0.1,5,"{'alpha': 0.1, 'max_iter': 5}",0.790199,0.524759,0.6908,0.668586,0.109498,5,0.684606,0.794461,0.735446,0.738171,0.044889
4,0.000997,1.123916e-07,0.0,0.0,0.5,3,"{'alpha': 0.5, 'max_iter': 3}",0.791426,0.524387,0.691664,0.669159,0.110174,3,0.684588,0.794457,0.735423,0.738156,0.044895
5,0.000998,1.94668e-07,0.0,0.0,0.5,5,"{'alpha': 0.5, 'max_iter': 5}",0.791426,0.524387,0.691664,0.669159,0.110174,3,0.684588,0.794457,0.735423,0.738156,0.044895
6,0.000997,5.61958e-07,0.000665,0.0004700779,1.0,3,"{'alpha': 1.0, 'max_iter': 3}",0.792891,0.523944,0.692636,0.669824,0.110976,1,0.684535,0.794444,0.735358,0.738112,0.044912
7,0.00133,0.000469348,0.000997,1.94668e-07,1.0,5,"{'alpha': 1.0, 'max_iter': 5}",0.792891,0.523944,0.692636,0.669824,0.110976,1,0.684535,0.794444,0.735358,0.738112,0.044912


In [74]:
# Best model: the estimator chosen by the grid search
best_model = search_cv.best_estimator_
best_model

In [75]:
# 몇개 없는 데이터라서 빨리끝남