## 練習時間
試著使用 sklearn datasets 的其他資料集 (boston, ...)，來訓練自己的線性迴歸模型，並加上適當的正則化來觀察訓練情形。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
plt.style.use("fivethirtyeight")
from sklearn.linear_model import Ridge, Lasso

In [2]:
# Load the Boston housing data and preview its features as a DataFrame.
#
# `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where accessing it raises ImportError. On those versions, rebuild the
# same arrays from the original StatLib file (the recipe given in
# scikit-learn's own removal notice) so that the cells visible in this
# notebook, which only read `boston.data`, `boston.target` and
# `boston.feature_names`, keep working. Older versions take the first branch
# and behave exactly as before.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    # Each record in the raw file is wrapped over two physical lines:
    # 11 values on the first, then 2 more features plus the target on the
    # second. The fallback needs network access.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(
            [
                "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
            ]
        ),
    )

df_boston = pd.DataFrame(boston.data, columns=boston.feature_names)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# Show the feature table's dimensions as (rows, columns).
df_boston.shape

(506, 13)

In [4]:
# Print per-column non-null counts and dtypes to check for missing values.
df_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [5]:
# Hold out 20% of the Boston samples for evaluation; the fixed random_state
# pins the split so the numbers below can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Baseline: ordinary least squares with no regularisation, used as the
# reference point for the Lasso runs that follow.
linear = linear_model.LinearRegression().fit(x_train, y_train)
y_pred = linear.predict(x_test)

# Report the fitted weights and the error on the held-out split.
print(linear.coef_)
baseline_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % baseline_mse)

[-1.12463481e-01  3.00810168e-02  4.07309919e-02  2.78676719e+00
 -1.72406347e+01  4.43248784e+00 -6.23998173e-03 -1.44848504e+00
  2.62113793e-01 -1.06390978e-02 -9.16398679e-01  1.24516469e-02
 -5.09349120e-01]
Mean squared error: 24.31


In [7]:
# Fit a Lasso model with penalty strength alpha=0.1 on the training split,
# then predict the held-out samples (scored in a later cell).
lasso = Lasso(alpha=0.1).fit(x_train, y_train)
y_pred = lasso.predict(x_test)

### LASSO

In [8]:
# Inspect the fitted Lasso weights; look for coefficients driven to zero.
lasso.coef_

array([-0.10328188,  0.03486297, -0.01659083,  0.92201803, -0.        ,
        4.30629662, -0.01510459, -1.15137625,  0.23870028, -0.01295797,
       -0.73270265,  0.01318656, -0.56548792])

In [9]:
# Score the most recent predictions (`y_pred`) against the held-out targets.
lasso_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % lasso_mse)

Mean squared error: 25.18


改變alpha值觀察模型是否有改變

In [10]:
# Refit Lasso with a stronger penalty (alpha=0.2) to compare against the
# alpha=0.1 run; `lasso` and `y_pred` are overwritten.
lasso = Lasso(alpha=0.2).fit(x_train, y_train)
y_pred = lasso.predict(x_test)

In [11]:
# Inspect the weights after raising alpha; compare with the alpha=0.1 weights
# to see which additional coefficients were driven to zero.
lasso.coef_

array([-0.10224582,  0.03482293, -0.00517342,  0.        , -0.        ,
        4.02157589, -0.010271  , -1.09482889,  0.24482477, -0.01331406,
       -0.74219737,  0.0131336 , -0.59131687])

In [12]:
# Score the alpha=0.2 Lasso predictions on the held-out targets.
lasso_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % lasso_mse)

Mean squared error: 25.06


### RIDGE

In [13]:
# Load scikit-learn's bundled wine data and preview its features as a table.
# NOTE(review): load_wine appears to be a classification dataset (target is
# presumably a class label) - confirm that fitting a regressor and reporting
# MSE on it is what this exercise intends.
wine = datasets.load_wine()
df_wine = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df_wine.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [14]:
# Show the wine feature table's dimensions as (rows, columns).
df_wine.shape

(178, 13)

In [15]:
# Print per-column non-null counts and dtypes to check for missing values.
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
alcohol                         178 non-null float64
malic_acid                      178 non-null float64
ash                             178 non-null float64
alcalinity_of_ash               178 non-null float64
magnesium                       178 non-null float64
total_phenols                   178 non-null float64
flavanoids                      178 non-null float64
nonflavanoid_phenols            178 non-null float64
proanthocyanins                 178 non-null float64
color_intensity                 178 non-null float64
hue                             178 non-null float64
od280/od315_of_diluted_wines    178 non-null float64
proline                         178 non-null float64
dtypes: float64(13)
memory usage: 18.2 KB


In [16]:
# Hold out 20% of the wine samples for evaluation, with a pinned random_state.
# This overwrites the Boston train/test variables created earlier.
x_train, x_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Fit a Ridge model with penalty strength alpha=0.1 on the wine training
# split, then predict the held-out samples (scored in a later cell).
ridge = Ridge(alpha=0.1).fit(x_train, y_train)
y_pred = ridge.predict(x_test)

In [18]:
# Inspect the fitted Ridge weights for alpha=0.1.
ridge.coef_

array([-1.09154529e-01,  4.31718988e-02, -2.05036909e-01,  4.38166801e-02,
        7.91335307e-05,  1.49640862e-01, -3.51209877e-01, -2.97833423e-01,
        1.71794662e-02,  7.69204582e-02, -9.18322850e-02, -2.79912046e-01,
       -6.98778612e-04])

In [19]:
# Score the alpha=0.1 Ridge predictions on the held-out targets.
ridge_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % ridge_mse)

Mean squared error: 0.07


改變alpha值觀察模型準確度的改變

In [20]:
# Refit Ridge with alpha=0.2 to compare against the alpha=0.1 run; `ridge`
# and `y_pred` are overwritten.
ridge = Ridge(alpha=0.2).fit(x_train, y_train)
y_pred = ridge.predict(x_test)

In [21]:
# Inspect the Ridge weights after raising alpha; compare with the alpha=0.1
# weights to see how much they moved.
ridge.coef_

array([-1.09041614e-01,  4.30236520e-02, -2.04680058e-01,  4.36112234e-02,
        1.14183599e-04,  1.47155359e-01, -3.49238084e-01, -2.78460972e-01,
        1.68054474e-02,  7.70326636e-02, -9.19976574e-02, -2.78399236e-01,
       -7.00335543e-04])

In [22]:
# Score the alpha=0.2 Ridge predictions on the held-out targets.
ridge_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % ridge_mse)

Mean squared error: 0.07
