# 【問題1】train_test_splitのスクラッチ

In [1]:
def scratch_train_test_split(X, y, train_size=0.8, random_seed=None, shuffle=False):
    """
    検証データを分割する。
    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定
      浮動小数点数は四捨五入
    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      訓練データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      訓練データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    import numpy as np
    
    if X is not None or y is not None:

        if shuffle is True:
            np.random.seed(random_seed)
            np.random.shuffle(X)
            np.random.shuffle(y)
        else:
            pass
        
        X_train, X_test = np.split(X, [round(len(X)* train_size)])
        y_train, y_test = np.split(y, [round(len(y)*train_size)])

    else:
        raise ValueError("At least one array required as input")
         
    return X_train, X_test, y_train, y_test

In [2]:
import numpy as np

X = np.arange(21).reshape((-1, 3))
y = np.arange(7)
train_size=0.7
random_seed=1
shuffle=True

X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=train_size, random_seed=1, shuffle=shuffle)
X_train, X_test, y_train, y_test

(array([[18, 19, 20],
        [ 6,  7,  8],
        [ 3,  4,  5],
        [ 0,  1,  2],
        [12, 13, 14]]),
 array([[ 9, 10, 11],
        [15, 16, 17]]),
 array([2, 3, 4, 1, 6]),
 array([0, 5]))

# 【問題2】 分類問題を解くコードの作成

## データセット

### 1. iris (virgicolorとvirginica)

In [3]:
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()

df = pd.DataFrame(iris.data, 
                             columns=iris.feature_names,
                            )

df["target"] = iris.target_names[iris.target]
df = df[df["target"]!="setosa"]

X_iris = df.iloc[:, :4].values
y_iris = df.iloc[:, 4].values

In [4]:
X_iris.shape, y_iris.shape

((100, 4), (100,))

### 2. シンプルデータセット1

In [5]:
import numpy as np

np.random.seed(seed=0)

n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X_simple1 = np.concatenate((f0, f1))
y_simple1 = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))

X_simple1 = X_simple1[random_index]
y_simple1 = y_simple1[random_index]

In [6]:
X_simple1.shape, y_simple1.shape

((500, 2), (500,))

### 3. シンプルデータセット2

In [7]:
X_simple2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y_simple2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [8]:
X_simple2.shape, y_simple2.shape

((40, 2), (40,))

## 2-1. ロジスティック回帰 (SGDClassifier)

In [9]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

sgd_clf = SGDClassifier(loss="log")

### 2-1-1. iris

In [10]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_iris, y_iris)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： ['virginica' 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica'
 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解： ['virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解率： 0.9


### 2-1-2. シンプルデータセット1

In [11]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple1, y_simple1)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解率： 1.0


### 2-1-3. シンプルデータセット2

In [12]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple2, y_simple2)
sgd_clf.fit(X_train, y_train)
y_pred = sgd_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [0 0 1 0 1 1 1 1]
正解： [1 1 1 1 1 1 1 1]
正解率： 0.625


## 2-2. SVM (SVC)

In [13]:
from sklearn.svm import SVC

svm_clf = SVC()

### 2-2-1. iris

In [14]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_iris, y_iris)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： ['virginica' 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica'
 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解： ['virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解率： 0.9


### 2-2-2. シンプルデータセット1

In [15]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple1, y_simple1)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解率： 1.0


### 2-2-3. シンプルデータセット2

In [16]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple2, y_simple2)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [0 0 0 0 0 1 1 0]
正解： [1 1 1 1 1 1 1 1]
正解率： 0.25


## 2-3. 決定木

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()

### 2-3-1. iris

In [18]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_iris, y_iris)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： ['virginica' 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解： ['virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica']
正解率： 0.95


### 2-3-2. シンプルデータセット1

In [19]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple1, y_simple1)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解： [-1 -1  1 -1  1  1 -1  1 -1  1 -1  1  1 -1  1  1  1  1 -1  1 -1 -1 -1 -1
  1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1
  1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1 -1  1  1
 -1 -1  1  1 -1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1
  1  1  1 -1]
正解率： 1.0


### 2-3-3. シンプルデータセット2

In [20]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X_simple2, y_simple2)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
print("予測：", y_pred)
print("正解：", y_test)
print("正解率：", accuracy_score(y_test, y_pred))

予測： [1 0 0 1 0 1 0 1]
正解： [1 1 1 1 1 1 1 1]
正解率： 0.5


# 【問題3】 回帰問題を解くコードの作成

## データセット

In [21]:
df = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
X = df.loc[:, ["GrLivArea", "YearBuilt"]].values
y = df.iloc[:, -1].values

In [22]:
X.shape, y.shape

((1460, 2), (1460,))

## 線形回帰 (SGDRegressor)

In [23]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

sgd_reg = SGDRegressor()

X_train, X_test, y_train, y_test = scratch_train_test_split(X, y)
sgd_reg.fit(X_train, y_train)
y_pred = sgd_reg.predict(X_test)

print("平均二乗誤差：", mean_squared_error(y_test, y_pred))

平均二乗誤差： 2.633985270825375e+30
