In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Boston housing data set (506 samples, 13 input features).
#
# `sklearn.datasets.load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, where importing it raises ImportError. Use the bundled loader when it
# still exists; otherwise rebuild the same arrays from the original StatLib
# file, following the replacement snippet in scikit-learn's deprecation notice.
# NOTE: scikit-learn dropped this data set because of ethical concerns about
# the "B" feature; prefer another data set (e.g. California housing) for new work.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Return the Boston housing data as a Bunch, like the removed loader.

        Downloads the raw file from StatLib (network access required). Each
        record spans two physical lines in that file: the first line holds 11
        values and the second holds the last 2 features plus the target.

        Returns:
            Bunch with ``data`` (n_samples x 13 array), ``target``
            (n_samples array) and ``feature_names`` (array of 13 names).
        """
        raw = pd.read_csv(
            "http://lib.stat.cmu.edu/datasets/boston",
            sep=r"\s+",
            skiprows=22,  # skip the free-text description preceding the data
            header=None,
        ).values
        return Bunch(
            # Even rows carry the first 11 features, odd rows the remaining 2.
            data=np.hstack([raw[::2, :], raw[1::2, :2]]),
            # The third value on every odd row is the target.
            target=raw[1::2, 2],
            feature_names=np.array([
                'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
            ]),
        )

dataset = load_boston()
x, t = dataset.data, dataset.target
columns = dataset.feature_names

In [3]:
# Show the container type and the shape of the input array.
(type(x), x.shape)

(numpy.ndarray, (506, 13))

In [4]:
# Show the container type and the shape of the target array.
(type(t), t.shape)

(numpy.ndarray, (506,))

In [5]:
# Wrap the input array in a pandas DataFrame, labelling the columns with the
# feature names, and preview the first 10 rows.
df = pd.DataFrame(data=x, columns=columns)
df.head(n=10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1


In [6]:
# Append the target values as a "Target" column (re-running the cell simply
# overwrites the column with the same values) and preview the first 10 rows.
df = df.assign(Target=t)
df.head(n=10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9


In [7]:
# Separate the DataFrame back into input variables and the target variable:
# the "Target" column is the target, every other column is an input.
# drop(columns=[...]) returns a new frame without the listed columns
# (equivalent to labels=[...] with axis=1); df itself is left unchanged.

# Inputs: all columns except "Target"; target: the "Target" column alone.
x = df.drop(columns=['Target']).values
t = df['Target'].values

In [8]:
# Split the data into a training set and a test set (hold-out method).
# scikit-learn builds the training set by randomly drawing the requested
# fraction of samples from the data set; the remainder becomes the test set.
# Function used: sklearn.model_selection.train_test_split()

from sklearn.model_selection import train_test_split

In [9]:
# Split into two parts: 30% of the samples form the test set, the rest the
# training set. Passing the same integer as random_state on every run fixes
# the random seed.
x_train, x_test, t_train, t_test = train_test_split(
    x,
    t,
    test_size=0.3,
    random_state=0,
)

In [11]:
# Step 1: define the model.
# Multiple linear regression is done with the LinearRegression class.
from sklearn.linear_model import LinearRegression

model = LinearRegression()

In [12]:
# Step 2: train the model.
# fit() takes the input values and the target values; the training split is
# passed here. The fitted estimator is the cell's displayed output.
model.fit(X=x_train, y=t_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)