# 특성 스케일링(feature scaling)
- 데이터 스케일링(data scaling)이라고도 함
- 하는 이유 : 데이터값의 범위가 다 제각각이기 때문에 범위 차이가 클 경우 데이터를 갖고 모델을 학습할 때 0으로 수렴하거나 무한으로 발산할 수 있음
- 특성들의 단위를 무시할 수 있도록, 특성들의 값의 범위를 비슷하게 만들어줌

정규화, 표준화 참고

# MinMaxScaling (정규화; normalization)
- column 간에 다른 min, max 값을 가지는 경우, 정규화를 통해 최소치 0 / 최대값 1의 척도로 맞추어 주는 것
X' = (X - Xmin) / (Xmax - Xmin)

- 이상치가 존재한다면, 이상치가 극값이 되어 데이터가 아주 좁은 범위에 분포하게 되기 때문에 스케일링 방법으로 적절하지 않음

- 넷플릭스 영화평점 (0점 ~ 10점): [2, 4, 6, 8, 10]
- CGV 영화평점 (0점 ~ 5점): [1, 2, 3, 4, 5]

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
movie = {'netflix': [2, 4, 6, 8, 10], 
         'cgv': [1, 2, 3, 4, 5]
         }

In [3]:
movie = pd.DataFrame(data=movie)
movie

Unnamed: 0,netflix,cgv
0,2,1
1,4,2
2,6,3
3,8,4
4,10,5


In [5]:
from sklearn.preprocessing import MinMaxScaler

In [6]:
mms = MinMaxScaler()
mms.fit(movie)

In [8]:
mmsed = mms.transform(movie)
mmsed

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [9]:
pd.DataFrame(mmsed, columns=['naver', 'netflix'])

Unnamed: 0,naver,netflix
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


# 보스턴 부동산 집값 예측문제


Features(X)

CRIM: 도시별 범죄발생률

ZN: 25,000평을 넘는 토지의 비율

INDUS: 도시별 비상업 지구의 비유

CHAS: 찰스 강의 더미 변수(1 = 강의 경계, 0 = 나머지)

NOX: 일산화질소 농도

RM: 주거할 수 있는 평균 방의개수

AGE: 1940년 이전에 지어진 주택의 비율

DIS: 5개의 고용지원센터까지의 가중치가 고려된 거리

RAD: 고속도로의 접근 용이성에 대한 지표

TAX: 10,000달러당 재산세 비율

PTRATIO: 도시별 교사와 학생의 비율

B: 도시의 흑인 거주 비유

LSTAT: 저소득층의 비율

traget(y)
MEDV : 본인 소유의 주택가격(중앙값) (단위: $1,000)

# 필요한 라이브러리 임폴트

In [10]:
# 불필요한 경고 출력을 방지
import warnings
warnings.filterwarnings('ignore')

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [12]:
# 방법2 캐글에서 직접 다운로드해서 로딩하기
columns = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'price']
hpd_df = pd.read_csv('./data/bostton_house_prices.csv', header=None, delimiter=r'\s+', names=columns)
hpd_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


# 데이터 정규화 - MinMaxScaler()
- 피처들의 데이터 수준을 맞춰주기 위해서 MinMaxScaler 수행

In [13]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# make extend boston dataset
# Refernce : https://github.com/amueller/mglearn/blob/master/mglearn/datasets.py#L30
def data_pre(df, y):
    X =  df.drop(y, axis=1).values

    X = MinMaxScaler().fit_transform(X)
    #Feature Generation 수행 degree = 2
    #X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)

    y = df[y]
    return X, y

In [14]:
# 특징이 확장된 보스턴 부동산 가격 데이터 불러오기
X, y = data_pre(hpd_df, 'price')
print(X[:2])
print()
print(y[:2])
print('Extended Feature Shape :', X.shape)

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 0.00000000e+00
  3.14814815e-01 5.77505269e-01 6.41606591e-01 2.69203139e-01
  0.00000000e+00 2.08015267e-01 2.87234043e-01 1.00000000e+00
  8.96799117e-02]
 [2.35922539e-04 0.00000000e+00 2.42302053e-01 0.00000000e+00
  1.72839506e-01 5.47997701e-01 7.82698249e-01 3.48961980e-01
  4.34782609e-02 1.04961832e-01 5.53191489e-01 1.00000000e+00
  2.04470199e-01]]

0    24.0
1    21.6
Name: price, dtype: float64
Extended Feature Shape : (506, 13)


In [15]:
from sklearn.model_selection import KFold

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 40 
kf = KFold(n_splits=num_split)  

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 선형회귀(Linear Regression) 모델 선언하기
    model_lr = LinearRegression()

    # 선형회귀(Linear Regression) 모델 학습하기
    model_lr.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = model_lr.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 37.13180746769891
Avergae RMSE : 6.0935874054368755


# KFold 교차검증 + L2 규제 알고리즘

In [16]:
from sklearn.linear_model import Ridge  # L2 규제

num_split = 5
# n_splits : validation split 갯수
# 매개변수 : shuffle = True, random_state = 40 
kf = KFold(n_splits=num_split)

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # 선형회귀(Linear Regression) 모델 선언하기
    #ridge_reg = Ridge(alpha=0.8)
    ridge_reg = Ridge(alpha=0.2)
    #ridge_reg = Ridge(alpha=1)

    # 선형회귀(Linear Regression) 모델 학습하기
    ridge_reg.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = ridge_reg.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', tot_MSE)
print('Avergae RMSE :', np.sqrt(tot_MSE))

Average MSE : 177.93446040638358
Avergae RMSE : 13.33920763787653


# KFold 교차검증 + L1 규제 알고리즘

In [19]:
from sklearn.linear_model import Lasso

num_split = 5
kf = KFold(n_splits = num_split)

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # 선형회귀(Linear Regression) 모델 선언하기
    #lasso_reg = Lasso(alpha=0.02)
    #lasso_reg = Lasso(alpha=0.02)
    lasso_reg = Lasso(alpha=0.03)

    # 선형회귀(Linear Regression) 모델 학습하기
    lasso_reg.fit(X_train, y_train)

    # 테스트 데이터에 대한 예측 수행하기
    y_pred = lasso_reg.predict(X_test)

    # MSE(Mean Squared Error) 측정 수행하기
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)

# 평균 에러 구하기    
avg_MSE = tot_MSE / num_split 
print('Average MSE :', avg_MSE)
print('Avergae RMSE :', np.sqrt(avg_MSE))

Average MSE : 35.357229644193765
Avergae RMSE : 5.946194551492052


# KFold 교차검증 + ElasticNet(L1+ L2) 규제 알고리즘

In [21]:
from sklearn.linear_model import ElasticNet

num_split = 5
kf = KFold(n_splits = num_split)

tot_MSE = 0.0
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    elasticnet_reg = ElasticNet(alpha=0.02)
    
    elasticnet_reg.fit(X_train, y_train)
    
    y_pred = elasticnet_reg.predict(X_test)
    
    tot_MSE = tot_MSE + mean_squared_error(y_test, y_pred)
    
avg_MSE = tot_MSE / num_split
print('Average MSE :', avg_MSE)
print('Average RMSE : ', np.sqrt(avg_MSE))

Average MSE : 32.420895381917
Average RMSE :  5.693934964672234


# 최적의 하이퍼 파라미터 찾는 방법

### GridSerachCV 활용
- 모델링시 필요한 하이퍼파라미터를 설정할 때 가장 최적의 파리미터값을 찾아주는 방법중 하나
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearchcv#sklearn.model_selection.GridSearchCV

#### GridSearchCV 주요 파라미터
: estimator, param_grid, scoring=None, n_jobs=None, cv=None, refit=True 등이 있다.

- estimator : 평가할 모델을 전달

- param_grid : 각 파라미터와 시험할 값들을 딕셔너리로 넣기
평가 방법은 scoring으로 측정하며 cv는 기본적으로 KFold의 횟수를 정하는 값

- https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter (평가 방식)

- refit=True : 생성된 GridSearchCV 객체를, 가장 좋은 파라미터를 전달한 estimator로 바꿔줌

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, make_scorer, mean_squared_error

### 모델 정의

In [30]:
model_lasso = Lasso()

## cv 정의

In [23]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2024)

### 모델의 하이퍼파라미터 탭 정의
 - dict로 정의 ==> 하이퍼파라미터 이름 : {값1 , 값2}

In [32]:
param_grid = {
    'alpha' :[0.01, 0.1, 1, 10, 100],
    'fit_intercept' : [True, False]
}

In [33]:
X, y = data_pre(hpd_df, 'price')

In [34]:
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2024)

## Grid_Search 객체 생성1

In [35]:
grid_search = GridSearchCV(model_lasso,
                           param_grid=param_grid, 
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True
                           ) 

In [37]:
grid_search.fit(X_train, y_train)

## grid_search 를 통해 학습한 결과의 베스트 값 확인

In [38]:
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Lasso(alpha=0.01)
2. 학습모델 best_params_ :  {'alpha': 0.01, 'fit_intercept': True}
3. 학습모델 best_score_MSE :  24.423432219329193


## Grid_Search 객체 생성2

In [40]:
grid_search = GridSearchCV(model_lasso,
                           param_grid=param_grid, 
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True
                           ) 

In [41]:
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Lasso(alpha=0.01)
2. 학습모델 best_params_ :  {'alpha': 0.01, 'fit_intercept': True}
3. 학습모델 best_score_MSE :  24.423432219329193


## 데이터 준비

In [42]:
# 방법2 캐글에서 직접 다운로드해서 로딩하기
columns = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'price']
hpd_df = pd.read_csv('./data/bostton_house_prices.csv', header=None, delimiter=r'\s+', names=columns)
hpd_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [44]:
X = hpd_df.drop('price', axis=1)
y = hpd_df['price']

In [45]:
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2024)

In [46]:
X_train.shape, y_train.shape

((404, 13), (404,))

In [47]:
X_test.shape, y_test.shape

((102, 13), (102,))

## Pipeline 객체 생성

In [59]:
model_lasso = Lasso()

In [60]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', model_lasso)
])

## Grid_Search 객체 생성3

In [63]:
param_grid = dict()
param_grid['lasso__alpha'] = [0.01, 0.1, 1, 10, 100]
param_grid['lasso__fit_intercept'] = [True, False]

In [64]:
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid, 
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True
                           ) 
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()), ('lasso', Lasso(alpha=0.01))])
2. 학습모델 best_params_ :  {'lasso__alpha': 0.01, 'lasso__fit_intercept': True}
3. 학습모델 best_score_MSE :  24.41811941921162


In [65]:
scores_df = pd.DataFrame(grid_search.cv_results_)
df_score = scores_df.sort_values(by='mean_test_score', ascending=False)
df_score.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__alpha,param_lasso__fit_intercept,params,split0_test_score,split1_test_score,split2_test_score,...,split22_train_score,split23_train_score,split24_train_score,split25_train_score,split26_train_score,split27_train_score,split28_train_score,split29_train_score,mean_train_score,std_train_score
0,0.002902,0.004874,0.002685,0.005154,0.01,True,"{'lasso__alpha': 0.01, 'lasso__fit_intercept':...",-21.321065,-23.300008,-20.655293,...,-21.476486,-22.464946,-23.128754,-22.763233,-23.317952,-21.747596,-23.415835,-23.539144,-22.452425,0.890741
2,0.003345,0.006255,0.000521,0.002804,0.1,True,"{'lasso__alpha': 0.1, 'lasso__fit_intercept': ...",-21.553126,-23.781988,-20.423276,...,-21.95127,-22.948956,-23.582339,-23.322176,-23.769204,-22.267111,-23.874189,-24.004757,-22.93029,0.894417
4,0.003167,0.005685,0.000962,0.003015,1.0,True,"{'lasso__alpha': 1, 'lasso__fit_intercept': True}",-30.498033,-28.881818,-28.43676,...,-27.41702,-28.837045,-29.41051,-28.729262,-29.49877,-27.783541,-29.819173,-29.767585,-28.593719,1.095093
8,0.002998,0.004063,0.001445,0.002744,100.0,True,"{'lasso__alpha': 100, 'lasso__fit_intercept': ...",-80.246132,-93.200892,-93.011964,...,-76.269824,-80.446539,-84.572115,-82.552136,-81.204964,-76.814938,-83.58512,-82.844446,-80.854833,2.324861
6,0.002329,0.004037,0.001369,0.003845,10.0,True,"{'lasso__alpha': 10, 'lasso__fit_intercept': T...",-80.246132,-93.200892,-93.011964,...,-76.269824,-80.446539,-84.572115,-82.552136,-81.204964,-76.814938,-83.58512,-82.844446,-80.854833,2.324861


In [68]:
df_score.loc[:, ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
0,"{'lasso__alpha': 0.01, 'lasso__fit_intercept':...",-24.418119,8.38825,1
2,"{'lasso__alpha': 0.1, 'lasso__fit_intercept': ...",-24.785233,8.789466,2
4,"{'lasso__alpha': 1, 'lasso__fit_intercept': True}",-29.762052,10.095209,3
8,"{'lasso__alpha': 100, 'lasso__fit_intercept': ...",-81.193168,20.892828,4
6,"{'lasso__alpha': 10, 'lasso__fit_intercept': T...",-81.193168,20.892828,4
3,"{'lasso__alpha': 0.1, 'lasso__fit_intercept': ...",-518.522903,36.842038,6
1,"{'lasso__alpha': 0.01, 'lasso__fit_intercept':...",-518.896044,38.29831,7
5,"{'lasso__alpha': 1, 'lasso__fit_intercept': Fa...",-523.781188,35.371559,8
7,"{'lasso__alpha': 10, 'lasso__fit_intercept': F...",-574.027425,68.846335,9
9,"{'lasso__alpha': 100, 'lasso__fit_intercept': ...",-574.027425,68.846335,9


In [69]:
# grid search linear regression model on the auto insurance dataset
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [70]:
model = Ridge()

In [76]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=2024)

param_grid = dict()
param_grid['ridge__solver'] = ['svd' , 'cholesky', 'lsqr', 'sag']
param_grid['ridge__alpha'] = [1e-5, 1e-4, 1e-3,1e-2, 1e-1, 1,10,100]
param_grid['ridge__fit_intercept'] = [True, False]

In [77]:
# 파이프라인 :StandardScaler -> Ridge ; 1.x 버전부터 변경
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', model)
])

In [78]:
# define search
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid, 
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True
                           ) 
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()),
                ('ridge', Ridge(alpha=1, solver='lsqr'))])
2. 학습모델 best_params_ :  {'ridge__alpha': 1, 'ridge__fit_intercept': True, 'ridge__solver': 'lsqr'}
3. 학습모델 best_score_MSE :  24.426323369058064


In [79]:
# grid search linear regression model on the auto insurance dataset
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [89]:
model = ElasticNet()

In [90]:
param_grid = dict()
param_grid['elasticnet__alpha'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
param_grid['elasticnet__l1_ratio'] = [0.1, 0.3, 0.5, 0.7, 0.9]
param_grid['elasticnet__fit_intercept'] = [True, False]

In [93]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('elasticnet' , model)
])

In [94]:
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid, 
                           cv=cv,
                           scoring='neg_mean_squared_error',
                           refit=True,
                           return_train_score=True
                           ) 
grid_search.fit(X_train, y_train)
print("1. 학습모델 best_estimator_ : ", grid_search.best_estimator_)
print("2. 학습모델 best_params_ : ", grid_search.best_params_)
print("3. 학습모델 best_score_MSE : ", -1 * grid_search.best_score_)

1. 학습모델 best_estimator_ :  Pipeline(steps=[('scaler', StandardScaler()),
                ('elasticnet', ElasticNet(alpha=0.01, l1_ratio=0.3))])
2. 학습모델 best_params_ :  {'elasticnet__alpha': 0.01, 'elasticnet__fit_intercept': True, 'elasticnet__l1_ratio': 0.3}
3. 학습모델 best_score_MSE :  24.411763841592595


In [95]:
y_grid_pred = grid_search.predict(X_test)

In [96]:
MSE = mean_squared_error(y_test, y_grid_pred)
RMSE = np.sqrt(MSE)
print('3. 테스트 데이터 : MSE : {0:.5f}, RMSE : {1:.5f}'.format(MSE, RMSE))

3. 테스트 데이터 : MSE : 20.22417, RMSE : 4.49713
