## Hold-out

In [16]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the 'tips' example dataset through seaborn; `df` is the source of the
# features and target used in the sections below.
df = sns.load_dataset('tips')

In [17]:
# Target column; every other column of df is used as a feature.
y_col = 'tip'
y = df[y_col]
features = df.drop(columns = y_col)
# Remember the numeric feature names before one-hot encoding so that only
# these columns get standardized later.
numeric_cols = list(features.select_dtypes(include = np.number).columns)
# One-hot encode the non-numeric features, dropping the first level of each.
X = pd.get_dummies(features, drop_first = True)
# Hold-out split: 30% of the rows go to the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.3)

In [28]:
# Standardization
# Standardize after the train/test split: the scaler is fit on the training
# rows only and then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])
# Work on copies and overwrite only the numeric columns; the remaining
# (dummy) columns are carried over unchanged.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [31]:
# Train the linear model on the standardized training split
from sklearn.linear_model import LinearRegression
model = LinearRegression().fit(X_train_scaled, y_train)
# Predict the target for the standardized test split
y_pred = model.predict(X_test_scaled)

In [36]:
# Model evaluation (MSE)
from sklearn.metrics import mean_squared_error
# Same quantity as np.mean(np.square(y_test - y_pred))
mean_squared_error(y_true = y_test, y_pred = y_pred)

0.955080898861715

In [33]:
# Inspect the held-out target values (the test split of the `tip` column).
y_test

64     2.64
63     3.76
55     3.51
111    1.00
225    2.50
       ... 
90     3.00
101    3.00
75     1.25
4      3.61
109    4.00
Name: tip, Length: 74, dtype: float64

## LOOCV (Leave One Out Cross Validation)

In [38]:
from sklearn.model_selection import LeaveOneOut
# Data preparation: a single feature (total_bill) as an (n_samples, 1) NumPy
# array, and the tip column as the target Series.
y = df["tip"]
X = df["total_bill"].to_numpy().reshape(-1, 1)

In [39]:
# Leave-one-out splitter; its split(X) is iterated in the manual loop below.
loo = LeaveOneOut()

In [43]:
# Manual LOOCV: fit on the training indices of each split, predict the
# held-out part, and collect the per-split MSE.
model = LinearRegression()
mse_list = []
for train_index, test_index in loo.split(X):
    # split() yields positional indices. X is a NumPy array, so plain indexing
    # is positional. y is a pandas Series, where y[...] with integers on an
    # integer index is label-based, so use .iloc to select by position
    # regardless of what the Series index looks like.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training part
    model.fit(X_train, y_train)
    # Predict the held-out part
    y_pred = model.predict(X_test)
    # Evaluate this split
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [45]:
# Report the mean and standard deviation of the per-split MSE values.
print(f"MSE(LOOCV):{np.mean(mse_list)}", f"std:{np.std(mse_list)}", sep = "\n")

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


In [47]:
from sklearn.model_selection import cross_val_score
# Same leave-one-out evaluation, delegated to cross_val_score instead of a
# hand-written loop.
cv = LeaveOneOut()
scores = cross_val_score(
    model,
    X,
    y,
    scoring = 'neg_mean_squared_error',
    cv = cv,
)
# The scores are negated MSE values, so flip the sign of the mean to report MSE.
print(f"MSE(LOOCV):{-np.mean(scores)}", f"std:{np.std(scores)}", sep = "\n")

MSE(LOOCV):1.0675673489857438
std:2.0997944551776313


## k-Fold CV

In [72]:
from sklearn.model_selection import KFold, RepeatedKFold
k = 5
n_repeats = 3
# Alternative: a single shuffled k-fold pass instead of repeated k-fold
# cv = KFold(n_splits = k, shuffle = True, random_state = 0)
# Repeated k-fold: k folds, repeated n_repeats times, with a fixed seed
cv = RepeatedKFold(n_splits = k, n_repeats = n_repeats, random_state = 0)
model = LinearRegression()
mse_list = []
for train_index, test_index in cv.split(X):
    # split() yields positional indices. X is a NumPy array, so plain indexing
    # is positional. y is a pandas Series, where y[...] with integers on an
    # integer index is label-based, so use .iloc to select by position
    # regardless of what the Series index looks like.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # If standardization is needed, do it here (inside the loop, after the split)

    # Fit the model on the training folds
    model.fit(X_train, y_train)
    # Predict the held-out fold
    y_pred = model.predict(X_test)
    # Evaluate this split
    mse = mean_squared_error(y_test, y_pred)
    mse_list.append(mse)

In [73]:
# Report the mean and standard deviation of the per-split MSE values.
print(f"MSE({k}FoldCV):{np.mean(mse_list)}", f"std:{np.std(mse_list)}", sep = "\n")

MSE(5FoldCV):1.0746387233165984
std:0.26517178540898434


In [74]:
# Same evaluation as the manual loop, delegated to cross_val_score with the
# current `cv` splitter; the scores are negated MSE values.
scores = cross_val_score(
    model,
    X,
    y,
    scoring = 'neg_mean_squared_error',
    cv = cv,
)

In [75]:
# The scores are negated MSE values, so flip the sign of the mean to report MSE.
print(f"MSE({k}FoldCV):{-np.mean(scores)}", f"std:{np.std(scores)}", sep = "\n")

MSE(5FoldCV):1.0746387233165984
std:0.26517178540898434


In [76]:
# Raw per-split scores from cross_val_score (negated MSE, one entry per split).
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ,
       -1.15878391, -1.6042084 , -1.03070862, -0.71202907, -0.84729854,
       -0.88561033, -1.52485216, -0.6332659 , -1.2003542 , -1.12141427])

## Pipeline

In [77]:
from sklearn.pipeline import Pipeline
# Chain standardization and linear regression into a single estimator.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

In [78]:
# A single shuffled 5-fold pass with a fixed seed.
cv = KFold(n_splits = 5, random_state = 0, shuffle = True)
# Cross-validate the whole pipeline (scaler + model); scores are negated MSE.
scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring = 'neg_mean_squared_error',
    cv = cv,
)

In [79]:
# Per-fold scores for the pipeline (negated MSE, one entry per fold).
scores

array([-0.82130906, -1.07458421, -1.08801239, -1.33238677, -1.084763  ])

In [83]:
## Without Pipeline
# Standardization + linear regression, wired together by hand
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.3)
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Fit on the scaled training data and predict the scaled test data
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [87]:
# Predictions from the manual scale-then-fit version, shown for comparison
# with the Pipeline version.
y_pred

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])

In [85]:
## With Pipeline
# Same split parameters (test_size, random_state) as the version without a Pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.3)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])
# The pipeline is fit on, and predicts from, the raw (unscaled) splits
y_pred_p = pipeline.fit(X_train, y_train).predict(X_test)

In [86]:
# Predictions from the Pipeline version, shown for comparison with `y_pred`
# from the version without a Pipeline.
y_pred_p

array([2.71486884, 2.78639251, 2.90900452, 1.65836207, 2.57999564,
       1.50509707, 2.74858715, 3.30136293, 2.77208778, 4.45800284,
       3.50060744, 3.49345507, 2.35520697, 2.24587793, 2.28879213,
       4.02375199, 1.77075641, 2.3480546 , 2.83645908, 3.2778623 ,
       3.98901192, 3.05511716, 2.55240794, 2.45431834, 2.29798803,
       2.59327861, 2.16004953, 3.96244599, 3.50162921, 2.5289073 ,
       2.42264357, 2.19274606, 2.49314547, 1.99963215, 2.78639251,
       2.28572683, 2.64743224, 1.97306622, 5.85577969, 2.55036441,
       1.79425705, 2.18763723, 2.52073317, 3.96755482, 2.22135553,
       2.65151931, 2.78128368, 3.12255376, 2.66173698, 3.66409011,
       4.2567148 , 2.74552185, 3.01118119, 5.83943142, 1.89847725,
       2.14676656, 3.97572896, 3.03161652, 2.37462053, 2.21113786,
       3.70496078, 2.53299437, 3.07963956, 3.47199797, 3.99718606,
       2.5043849 , 2.60043097, 4.2720413 , 1.97306622, 3.87763935,
       2.4890584 , 1.99145802, 3.43010554, 2.37972937])