## Multiple Linear Regression

### 데이터 불러오기

In [20]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

In [22]:
# Load the diabetes dataset via sklearn's `datasets` module
diabetes_dataset = datasets.load_diabetes()

### 데이터 정보

In [23]:
# Print the description text that ships with the dataset object
print(diabetes_dataset.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

### 데이터 컬럼명 가져오기
* feature_names

In [24]:
# Names of the input feature columns
diabetes_dataset.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

### 데이터만 가져오기
* .data

In [25]:
# Raw input features as an unlabeled array (one row per sample)
diabetes_dataset.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

In [26]:
# Wrap the raw feature array in a pandas DataFrame, labelling the columns with the feature names
X = pd.DataFrame(
    data=diabetes_dataset.data,
    columns=diabetes_dataset.feature_names,
)

In [27]:
# Show the input-feature DataFrame as the cell output
X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930


### 목표 변수
#### ['diabetes']
* .target

In [28]:
# Raw target values as an array (one value per sample)
diabetes_dataset.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [29]:
# Wrap the target array in a single-column pandas DataFrame so it is convenient to work with
y = pd.DataFrame(
    data=diabetes_dataset.target,
    columns=['diabetes'],
)
y

Unnamed: 0,diabetes
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0
...,...
437,178.0
438,104.0
439,132.0
440,220.0


### 데이터 분리

In [30]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [31]:
# Sanity-check the split: print the shapes of train/test inputs and train/test targets on one line
print(*(split.shape for split in (x_train, x_test, y_train, y_test)))

(353, 10) (89, 10) (353, 1) (89, 1)


#### 입력 변수 (`x_train`, `x_test`)

In [33]:
# Show the input features of the training split, followed by those of the test split
display(x_train,x_test)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
331,0.081666,0.050680,-0.025607,-0.036656,-0.070367,-0.046407,-0.039719,-0.002592,-0.041180,-0.005220
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
60,-0.070900,-0.044642,-0.004050,-0.040099,-0.066239,-0.078662,0.052322,-0.076395,-0.051401,-0.034215
300,0.016281,-0.044642,0.073552,-0.041247,-0.004321,-0.013527,-0.013948,-0.001116,0.042896,0.044485
351,-0.085430,0.050680,-0.040696,-0.033214,-0.081374,-0.069580,-0.006584,-0.039493,-0.057800,-0.042499
...,...,...,...,...,...,...,...,...,...,...
400,-0.023677,-0.044642,0.045529,0.090730,-0.018080,-0.035447,0.070730,-0.039493,-0.034524,-0.009362
118,-0.056370,0.050680,-0.010517,0.025315,0.023198,0.040022,-0.039719,0.034309,0.020612,0.056912
189,-0.001882,-0.044642,-0.066563,0.001215,-0.002945,0.003070,0.011824,-0.002592,-0.020289,-0.025930
206,0.001751,0.050680,0.026128,-0.009113,0.024574,0.038456,-0.021311,0.034309,0.009436,0.003064


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
360,0.016281,-0.044642,0.020739,0.021872,-0.013953,-0.013214,-0.006584,-0.002592,0.013316,0.040343
290,0.059871,0.050680,0.076786,0.025315,0.001183,0.016849,-0.054446,0.034309,0.029936,0.044485
191,-0.005515,0.050680,-0.041774,-0.043542,-0.079998,-0.076156,-0.032356,-0.039493,0.010226,-0.009362
50,0.034443,-0.044642,-0.007284,0.014987,-0.044223,-0.037326,-0.002903,-0.039493,-0.021394,0.007207
186,-0.081798,0.050680,0.042296,-0.019442,0.039710,0.057558,-0.069172,0.108111,0.047186,-0.038357
...,...,...,...,...,...,...,...,...,...,...
309,-0.009147,0.050680,0.001339,-0.002228,0.079612,0.070084,0.033914,-0.002592,0.026714,0.081764
321,0.096197,-0.044642,0.051996,0.079254,0.054845,0.036577,-0.076536,0.141322,0.098646,0.061054
348,0.030811,-0.044642,-0.020218,-0.005671,-0.004321,-0.029497,0.078093,-0.039493,-0.010904,-0.001078
101,0.016281,0.050680,-0.045007,0.063187,0.010815,-0.000374,0.063367,-0.039493,-0.030751,0.036201


#### 목표 변수 (`y_train`, `y_test`)

In [34]:
# Show the targets of the training split, followed by those of the test split
display(y_train,y_test)

Unnamed: 0,diabetes
331,199.0
0,151.0
60,61.0
300,275.0
351,71.0
...,...
400,175.0
118,179.0
189,79.0
206,196.0


Unnamed: 0,diabetes
360,281.0
290,332.0
191,178.0
50,155.0
186,137.0
...,...
309,142.0
321,230.0
348,148.0
101,102.0


### 모델 적용 및 학습

In [36]:
# Create a linear regression model with default settings (not yet fitted)
model = LinearRegression()

In [37]:
# Fit the model on the training inputs and training targets
model.fit(x_train, y_train)

LinearRegression()

### theta

`coef_`에는 입력 변수(`feature_names`)의 순서대로 theta 1, 2, 3, 4, ... 가 들어 있다.

In [38]:
# Learned coefficients (theta 1, 2, ...): one weight per input feature
model.coef_

array([[   2.72195846, -255.94592688,  522.83461574,  353.10273364,
        -827.58494078,  543.32591808,  115.93459912,  214.68877404,
         694.94194778,   32.73088487]])

### theta 0 
#### `intercept_`: 절편(상수항). `coef_`와 함께 학습 데이터에서의 손실을 최대한 적게 하도록 구해진 값

In [39]:
# Learned intercept term (theta 0)
model.intercept_

array([152.22183645])

### 모델 평가

#### `test set`의 예측 값 구하기

In [42]:
# Predict targets for the held-out test inputs with the fitted model
y_test_prediction = model.predict(x_test)
# Preview only the first 10 predictions
y_test_prediction[:10]

array([[195.88049677],
       [219.9869973 ],
       [121.41731446],
       [158.096774  ],
       [199.41301643],
       [128.03683528],
       [100.64085969],
       [235.20267685],
       [178.49979907],
       [214.01527918]])

#### 예측 값(`y_test_prediction`)과 실제 값(output = `y_test`)을 비교

### RMSE
평균 제곱 오차의 루트를 통해서 테스트 데이터에서의 모델 성능 판단

In [46]:
# Mean squared error between the true test targets and the predictions
mse = mean_squared_error(y_test, y_test_prediction)

In [47]:
# RMSE: the square root of the MSE
rmse =  mse ** 0.5
rmse

54.603896119844435

---

### 코드 정리

In [48]:
# Import the required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

# Load the diabetes dataset
diabetes_dataset = datasets.load_diabetes()

# Convert the input features and the target into pandas DataFrames for convenience
X = pd.DataFrame(data=diabetes_dataset.data, columns=diabetes_dataset.feature_names)
y = pd.DataFrame(data=diabetes_dataset.target, columns=['diabetes'])

# Use train_test_split to divide the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

# Create the linear regression model and train it on the training data
# (fit() returns the estimator itself, so the two steps can be chained)
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Predict on the test inputs with the trained model
y_test_predict = linear_regression_model.predict(X_test)

# Judge model performance on the test data via the root of the mean squared error
mse = mean_squared_error(y_test, y_test_predict)

mse ** 0.5

54.603896119844435