In [1]:
# ライブラリのインポート
#%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the housing-price dataset: a whitespace-delimited text file with no
# header row, so column names are assigned explicitly below.
# The separator is written as a raw string because '\s' in a plain string
# literal is an invalid escape sequence and makes Python emit a warning.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
    header=None,
    sep=r'\s+',
)

df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Shape of the data frame
print('dfの形状', df.shape)

  df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+')


dfの形状 (506, 14)


In [3]:
# Feature matrix: the first 13 columns of the frame
X = df.iloc[:, :13].to_numpy()
# Target vector: the house-price column (MEDV)
y = df['MEDV'].to_numpy()

# Split features and target into training and test sets
# (20% held out for testing, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print(
    'X_trainの形状：', X_train.shape,
    ' y_trainの形状：', y_train.shape,
    ' X_testの形状：', X_test.shape,
    ' y_testの形状：', y_test.shape,
)

X_trainの形状： (404, 13)  y_trainの形状： (404,)  X_testの形状： (102, 13)  y_testの形状： (102,)


In [4]:
# Expand the features into degree-2 polynomial terms (no constant bias column)
POLY = PolynomialFeatures(degree=2, include_bias=False)

# Fit the expansion on the training split only, then apply it to both splits
X_train_pol = POLY.fit(X_train).transform(X_train)
X_test_pol = POLY.transform(X_test)

# Show both resulting shapes as the cell output
(X_train_pol.shape, X_test_pol.shape)

((404, 104), (102, 104))

In [5]:
# Standardize the polynomial features.
# The scaler is fitted on the training split alone; the same fitted scaler is
# then applied to both the training and the test split.
sc = StandardScaler().fit(X_train_pol)
X_train_std = sc.transform(X_train_pol)
X_test_std = sc.transform(X_test_pol)

In [6]:
# Build and train two models on the standardized training data:
# an unregularized linear regression and an L1-regularized one (Lasso).
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train_std, y_train)
model2 = Lasso(alpha=0.1).fit(X_train_std, y_train)

# Keep the fitted Lasso estimator as the cell's displayed value
model2

In [7]:
# Unregularized model: intercept, coefficient vector shape, and coefficients,
# each printed on its own line
print(model.intercept_, model.coef_.shape, model.coef_, sep='\n')

22.611881188119
(104,)
[-1.73238190e+01  5.57338146e+00 -4.13882888e+01  4.23848940e+00
  4.11601424e+00  5.43841210e+00  3.14362235e+01 -2.49746252e+01
  1.26131503e+01  8.46984514e+00  8.70501997e+00 -1.02385225e+01
 -1.00437829e+01  7.49340538e-01  6.03265356e-01  9.30886117e+01
  1.03782088e+00 -1.63308876e+00  4.51534131e+00 -3.15456518e+00
  1.17984392e-01  1.17567940e+02 -2.51251952e+02  5.39960257e+01
 -1.11244473e-01  2.58276185e+00 -2.32841294e+00 -5.18666792e-01
 -3.54721300e-01 -1.65608607e+01  2.81941405e+00  3.35383652e-02
 -5.81334892e-01 -2.87674801e-01  4.78801041e+00 -5.16168764e+00
  1.34093216e+01 -6.33107807e-01  9.44367683e+00  6.90675278e-01
  7.27080180e-02  1.60511042e+01  5.19002722e+00  2.56668287e+00
 -1.27338170e+01  7.29286601e+00 -5.56744148e+00  1.32188945e+01
 -3.06558318e+00  4.23848940e+00 -5.26449706e+00 -1.01815811e+01
  5.65967555e-01  1.81819433e-01  1.96816854e+00 -2.07646183e+00
 -4.55665998e+00  1.14434016e+01 -1.30255041e+00 -1.45182921e+00
  

In [8]:
# L1-regularized model: intercept, coefficient vector shape, and coefficients,
# each printed on its own line
print(model2.intercept_, model2.coef_.shape, model2.coef_, sep='\n')

22.611881188118844
(104,)
[-0.          0.          0.          0.         -0.          0.
  0.         -0.          0.          0.         -0.          0.
 -0.          0.          0.24633868 -0.          0.56721098 -0.26764504
 -0.71696315 -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.48958407 -0.46755873  0.11857147  0.
  0.00713714  0.          0.          0.         -0.          0.
  0.         -0.07437602  0.06232768 -0.         -0.         -0.
  0.         -0.11989191  0.43944794  0.          0.          0.
 -0.          0.         -0.          0.          0.          0.09828279
  0.          0.          0.          0.         -0.         -0.
 -1.21751701 -0.         -0.98619835  0.         -0.         -0.
  0.         -0.          5.87856207 -0.         -0.         -0.
 -0.         -1.92687309  0.         -5.22072694  0.         -0.
  0.          0.          0.          0.         -0.         -0.
  0.23376925 -0.66826934 -0.         -0.        

In [9]:
# Mean squared error of the unregularized model on both splits
y_train_pred = model.predict(X_train_std)
y_test_pred = model.predict(X_test_std)

# Pair each split's true values with its predictions and format both scores
print('MSE train: %.2f, test: %.2f' % tuple(
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
))

MSE train: 4.34, test: 31.28


In [10]:
# Mean squared error of the L1-regularized model on both splits
y_train_pred = model2.predict(X_train_std)
y_test_pred = model2.predict(X_test_std)

# Pair each split's true values with its predictions and format both scores
print('MSE train: %.2f, test: %.2f' % tuple(
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
))

MSE train: 11.93, test: 23.94


In [11]:
# Build and train a model with combined L1 + L2 regularization (ElasticNet).
# fit() returns the estimator itself, so construction and training are chained.
model3 = ElasticNet(alpha=0.1, l1_ratio=0.6).fit(X_train_std, y_train)

# Keep the fitted estimator as the cell's displayed value
model3

In [12]:
# L1 + L2 regularized model: intercept, coefficient vector shape, and
# coefficients, each printed on its own line
print(model3.intercept_, model3.coef_.shape, model3.coef_, sep='\n')

22.61188118811884
(104,)
[-0.          0.          0.          0.         -0.          0.96962931
  0.         -0.17794494  0.40611044  0.         -0.25150181  0.
 -0.          0.          0.28264552 -0.          0.66207521 -0.14156505
 -0.74045238 -0.         -0.         -0.         -0.         -0.
 -0.13177096 -0.          0.44487677 -0.47442068  0.13078156  0.
  0.65679043  0.          0.          0.         -0.          0.
  0.         -0.6322106   0.48869742 -0.          0.         -0.
  0.         -0.50407814  0.45846895  0.21648547  0.          0.
 -0.1530772   0.         -0.04160027  0.          0.          0.19218329
 -0.          0.          0.          0.         -0.04486905 -0.35021268
 -0.38826966 -0.         -0.69870717  0.         -0.         -0.56038388
  0.         -0.80426447  3.36988849 -0.         -0.         -0.
 -0.94551835 -1.56053816  1.08914961 -2.87403334  0.         -0.
  0.          0.         -0.          0.         -0.45243079 -0.
  0.2389787  -0.50433443 

In [13]:
# Mean squared error of the L1 + L2 regularized model on both splits
y_train_pred = model3.predict(X_train_std)
y_test_pred = model3.predict(X_test_std)

# Pair each split's true values with its predictions and format both scores
print('MSE train: %.2f, test: %.2f' % tuple(
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
))

MSE train: 12.16, test: 25.06
