# 【問題1】train_test_splitのスクラッチ

スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。  
以下の雛形をベースとして関数を完成させてください。  
sklearn.model_selection.train_test_split — scikit-learn 0.21.3 documentation  
なお、作成した関数がscikit-learnのtrain_test_splitと同じ動作をしているか必ず確認をするようにしましょう。

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state

In [2]:
df = pd.read_csv('application_train.csv')
df_test = pd.read_csv('application_test.csv')
X = df_test

In [3]:
iris = datasets.load_iris()
X = iris.data[50:, :]
y = iris.target[50:]
print(X.shape)
print(y.shape)

(100, 4)
(100,)


In [4]:
def scratch_train_test_split(X, y, train_size=0.8, random_state=0):
    """
    検証データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      訓練データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    num_list = len(X)
    rgen = check_random_state(random_state)
    permutation = rgen.permutation(num_list)
    
    n_train = int(num_list * train_size)
    ind_train = permutation[:n_train]
    ind_test = permutation[n_train:]
    
    X_train = X[ind_train, :]
    y_train = y[ind_train]
    
    X_test = X[ind_test, :]
    y_test = y[ind_test]
    
    return X_train, X_test, y_train, y_test

In [5]:
# スクラッチで実装したscratch_train_test_splitによりデータ分割
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8, random_state=0)
print(X_train[:5])
print(X_test[:5])
print(y_train[:5])
print(y_test[:5])

[[6.8 2.8 4.8 1.4]
 [6.3 3.4 5.6 2.4]
 [6.9 3.1 4.9 1.5]
 [7.6 3.  6.6 2.1]
 [7.2 3.2 6.  1.8]]
[[6.6 3.  4.4 1.4]
 [6.3 2.3 4.4 1.3]
 [7.9 3.8 6.4 2. ]
 [5.7 2.9 4.2 1.3]
 [5.5 2.5 4.  1.3]]
[1 2 1 2 2]
[1 1 2 1 1]


In [6]:
# scikit-learnのクラスを用いてデータ分割
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)
print(X_train[:5])
print(X_test[:5])
print(y_train[:5])
print(y_test[:5])

[[5.  2.3 3.3 1. ]
 [6.8 3.  5.5 2.1]
 [5.5 2.3 4.  1.3]
 [5.6 2.8 4.9 2. ]
 [5.7 3.  4.2 1.2]]
[[6.8 2.8 4.8 1.4]
 [6.3 3.4 5.6 2.4]
 [6.9 3.1 4.9 1.5]
 [7.6 3.  6.6 2.1]
 [7.2 3.2 6.  1.8]]
[1 2 1 2 1]
[1 2 1 2 2]


# 【問題2】 分類問題を解くコードの作成

上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [7]:
from sklearn.datasets import load_iris

In [8]:
# データセット1を準備
data = load_iris()
X1 = data.data[data.target > 0]
y1 = data.target[data.target > 0]

In [9]:
# データセット2を準備
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))
X2 = np.concatenate((f0, f1))
y2 = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)
random_index = np.random.permutation(np.arange(n_samples))
X2 = X2[random_index]
y2 = y2[random_index]

In [10]:
# データセット3を準備
X3 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y3 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [12]:
# データセット1, 2, 3を訓練データ、テストデータに分割
X_train1, X_test1, y_train1, y_test1 = scratch_train_test_split(X1, y1, train_size=0.8)
X_train2, X_test2, y_train2, y_test2 = scratch_train_test_split(X2, y2, train_size=0.8)
X_train3, X_test3, y_train3, y_test3 = scratch_train_test_split(X3, y3, train_size=0.8)

- ロジスティック回帰

In [13]:
# データセット1
lr = SGDClassifier(loss="log", random_state=42)
lr.fit(X_train1, y_train1)
pred11 = lr.predict(X_test1)
print(accuracy_score(y_test1, pred11))

1.0


In [14]:
# データセット2
lr = SGDClassifier(loss="log", random_state=42)
lr.fit(X_train2, y_train2)
pred12 = lr.predict(X_test2)
print(accuracy_score(y_test2, pred12))

1.0


In [15]:
# データセット3
lr = SGDClassifier(loss="log", random_state=42)
lr.fit(X_train3, y_train3)
pred13 = lr.predict(X_test3)
print(accuracy_score(y_test3, pred13))

0.875


- SVM

In [16]:
# データセット1
svc = SVC(random_state=42)
svc.fit(X_train1, y_train1)
pred21 = svc.predict(X_test1)
print(accuracy_score(y_test1, pred21))

1.0




In [17]:
# データセット2
svc = SVC(random_state=42)
svc.fit(X_train2, y_train2)
pred22 = svc.predict(X_test2)
print(accuracy_score(y_test2, pred22))

1.0




In [18]:
# データセット3
svc = SVC(random_state=42)
svc.fit(X_train3, y_train3)
pred23 = svc.predict(X_test3)
print(accuracy_score(y_test3, pred23))

0.75




- 決定木

In [19]:
# データセット2
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train1, y_train1)
pred31 = tree.predict(X_test1)
print(accuracy_score(y_test1, pred31))

0.9


In [20]:
# データセット2
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train2, y_train2)
pred32 = tree.predict(X_test2)
print(accuracy_score(y_test2, pred32))

1.0


In [21]:
# データセット3
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train3, y_train3)
pred33 = tree.predict(X_test3)
print(accuracy_score(y_test3, pred33))

1.0


# 【問題3】 回帰問題を解くコードの作成

In [22]:
df = pd.read_csv('train.csv')

In [23]:
X4 = df.drop('SalePrice', axis=1)
X4 = X4[['GrLivArea', 'YearBuilt']].values
y4 = df['SalePrice'].values

In [24]:
X_train4, X_test4, y_train4, y_test4 = scratch_train_test_split(X4, y4, train_size=0.8)

In [25]:
from sklearn.linear_model import SGDRegressor

In [26]:
lr = SGDRegressor(random_state=42)
lr.fit(X_train4, y_train4)
pred4 = lr.predict(X_test4)
print(pred4)

[-5.11080879e+14 -1.24759199e+15 -1.56306537e+15  1.84504628e+14
 -1.69449599e+15 -1.52025464e+15 -3.87094201e+14 -1.45413452e+15
 -7.94556774e+14 -1.30884465e+15 -6.47378967e+14 -1.60847008e+15
 -1.56181911e+15 -1.20838498e+15 -6.59251630e+14 -1.55809714e+15
 -1.56380641e+15 -1.03905908e+14 -2.40978619e+14 -8.94964659e+14
 -4.91830744e+14 -1.52931501e+15 -9.73379394e+14 -5.19906132e+14
 -1.34540660e+15 -1.68713672e+15 -1.19737020e+15 -8.77449907e+14
 -1.00605074e+15 -7.93124382e+14 -1.35386099e+15 -7.65083283e+14
 -1.61469475e+14 -1.61630168e+15 -1.40702942e+15 -5.99127674e+14
 -2.42408905e+14 -1.53707944e+15 -9.22618797e+14 -9.06804437e+14
  3.01586870e+14 -1.03353694e+15 -6.65854478e+14 -4.30645967e+14
 -1.31435098e+15 -7.88425558e+14 -1.39434785e+15 -1.38932854e+15
 -1.39385942e+15 -9.10879783e+14 -1.24509878e+15 -1.65926356e+15
 -9.41093607e+14  7.01907085e+14 -5.93182595e+14 -1.39500421e+15
 -6.78855840e+14 -1.47688761e+15 -1.48160323e+15 -7.52216974e+14
 -1.62810717e+15 -6.65449