## ホ16.3 k分割交差検証の前半

### データを読み込む

In [None]:
import polars as pl

# Read the renamed housing data and order its rows in one chained expression.
# The original note says the sort is there "to adjust the dummy variables";
# presumably it keeps the categorical encoding used by later cells stable
# -- TODO confirm.
housing = (
    pl
    .read_csv("../data/housing_renamed.csv")
    .sort(by = ["units", "sq_ft", "boro"])
)

# Preview the first rows, then list the column names.
display(housing.head(), housing.columns)

### 訓練データと検証データに分割する

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from patsy import dmatrices

# Response vector (y) and design matrix (X) that the folds are drawn from.
y, X = dmatrices("value_per_sq_ft ~ units + sq_ft + boro", data = housing)

# Five-fold splitter. KFold does not shuffle unless asked to, so each fold is a
# contiguous range of rows of `housing` in its current order.
kf = KFold(n_splits = 5)

### 学習する

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one linear regression per fold, keeping the fitted coefficients and the
# score obtained on the held-out rows of that fold.
coefs = []
scores = []
# `kf.split(X)` yields a pair of row-index arrays per fold. The held-out indices
# are deliberately not named `eval`: in a notebook that name would rebind the
# builtin `eval` for every later cell.
for train_idx, valid_idx in kf.split(X):
    X_train, X_eval = X[train_idx], X[valid_idx]
    y_train, y_eval = y[train_idx], y[valid_idx]
    lr = LinearRegression().fit(X_train, y_train)
    # One frame per fold; presumably a single row of coefficients because `y`
    # is a column matrix -- TODO confirm the shape of `lr.coef_`.
    coefs.append(pl.DataFrame(lr.coef_))
    # LinearRegression.score reports R^2 on the held-out fold.
    scores.append(lr.score(X_eval, y_eval))

### 学習結果を確認する

In [None]:
# Coefficients: stack the per-fold frames into one table and label its columns
# with the column names recorded in the design matrix.
coefs_df = pl.concat(coefs, how = "vertical")
coefs_df.columns = X.design_info.column_names
display(coefs_df)

# Evaluation scores collected for each fold
display(scores)

### 交差検証のスコアを計算する短いコード

In [None]:
# Same five-fold evaluation in a single call: cross_val_score fits and scores
# the estimator once per fold and returns the per-fold scores.
# NOTE: this rebinds `scores`, replacing the list built by the manual loop.
model = LinearRegression()
scores = cross_val_score(estimator = model, X = X, y = y, cv = 5)
display(scores)

### モデル別のスコアを比較する

In [None]:
# Model formulas to compare (patsy formula strings -- these are not link functions)
f1 = "value_per_sq_ft ~ units + sq_ft + boro"
f2 = "value_per_sq_ft ~ units * sq_ft + boro"
f3 = "value_per_sq_ft ~ units + sq_ft * boro + type"
f4 = "value_per_sq_ft ~ units + sq_ft * boro + sq_ft * type"
f5 = "value_per_sq_ft ~ boro + type"

# Response / design matrices for each formula
y1, X1 = dmatrices(f1, housing)
y2, X2 = dmatrices(f2, housing)
y3, X3 = dmatrices(f3, housing)
y4, X4 = dmatrices(f4, housing)
y5, X5 = dmatrices(f5, housing)

# Five-fold cross-validation scores for each model
model = LinearRegression()
scores1 = cross_val_score(model, X1, y1, cv = 5)
scores2 = cross_val_score(model, X2, y2, cv = 5)
scores3 = cross_val_score(model, X3, y3, cv = 5)
scores4 = cross_val_score(model, X4, y4, cv = 5)
scores5 = cross_val_score(model, X5, y5, cv = 5)

# Mean score per model.
# The means are computed directly in Python and stored as a plain float Series,
# rather than building a nested Series and averaging each element through
# `map_elements`, which runs a Python UDF without a declared return dtype.
import statistics
scores_series = pl.Series(
    "Scores",
    [
        statistics.mean(fold_scores.tolist())
        for fold_scores in (scores1, scores2, scores3, scores4, scores5)
    ],
)
display(scores_series)