## ホ16.3 k分割交差検証の前半

### データを読み込む

In [10]:
import polars as pl

# Load the housing data and sort the rows by units, sq_ft and boro in one chain.
# (Original author's note: the sort is "for adjusting the dummy variables".)
housing = (
    pl
    .read_csv("../data/housing_renamed.csv")
    .sort(by = ["units", "sq_ft", "boro"])
)

# Preview the first rows, then list the available column names
display(housing.head())
display(housing.columns)

neighborhood,type,units,year_built,sq_ft,income,income_per_sq_ft,expense,expense_per_sq_ft,net_income,value,value_per_sq_ft,boro
str,str,i64,f64,i64,i64,f64,i64,f64,i64,i64,f64,str
"""UPPER WEST SID…","""R4-CONDOMINIUM…",1,1991.0,945,40000,42.33,14000,14.81,26000,184000,194.71,"""Manhattan"""
"""FLUSHING-NORTH…","""R4-CONDOMINIUM…",1,1967.0,1075,17200,16.0,8288,7.71,8912,59000,54.88,"""Queens"""
"""SOHO""","""R2-CONDOMINIUM…",1,1941.0,1158,50327,43.46,10156,8.77,40171,303000,261.66,"""Manhattan"""
"""FLATIRON""","""R4-CONDOMINIUM…",1,1920.0,1800,65178,36.21,17928,9.96,47250,328000,182.22,"""Manhattan"""
"""ALPHABET CITY""","""R4-CONDOMINIUM…",1,1920.0,1996,64870,32.5,23772,11.91,41098,306000,153.31,"""Manhattan"""


['neighborhood',
 'type',
 'units',
 'year_built',
 'sq_ft',
 'income',
 'income_per_sq_ft',
 'expense',
 'expense_per_sq_ft',
 'net_income',
 'value',
 'value_per_sq_ft',
 'boro']

### 訓練データと検証データに分割する

In [11]:
from sklearn.model_selection import KFold, cross_val_score
from patsy import dmatrices

# Response vector (y) and design matrix (X) that the folds will index into
y, X = dmatrices("value_per_sq_ft ~ units + sq_ft + boro", data = housing)

# Splitter for 5-fold cross-validation.
# KFold is created without shuffle, so it follows the current row order of X.
kf = KFold(n_splits = 5)

### 学習する

In [12]:
from sklearn.linear_model import LinearRegression

# Per-fold results: one coefficient frame and one held-out score per split
coefs = []
scores = []

# Fit a separate linear regression on each training fold.
# The index arrays are called train_idx / eval_idx (not train / eval) so that
# the loop does not rebind Python's built-in eval() in the notebook namespace.
for train_idx, eval_idx in kf.split(X):
    X_train, X_eval = X[train_idx], X[eval_idx]
    y_train, y_eval = y[train_idx], y[eval_idx]
    lr = LinearRegression().fit(X_train, y_train)
    # NOTE(review): X already carries patsy's "Intercept" column while
    # LinearRegression() also fits its own intercept by default, which is why
    # the Intercept coefficient is reported as 0.0 for every fold.
    coefs.append(pl.DataFrame(lr.coef_))
    # Score of the fitted model on the fold that was held out from training
    scores.append(lr.score(X_eval, y_eval))

### 学習結果を確認する

In [13]:
# Coefficients: stack the per-fold frames (one per split) and label the
# columns with the term names of the design matrix
coefs_df = pl.concat(coefs)
coefs_df.columns = list(X.design_info.column_names)
display(coefs_df)

# Evaluation scores: the values collected from lr.score for each split
display(scores)

Intercept,boro[T.Brooklyn],boro[T.Manhattan],boro[T.Queens],boro[T.Staten Island],units,sq_ft
f64,f64,f64,f64,f64,f64,f64
0.0,35.314215,133.885329,32.587985,-2.528174,-0.180871,0.0002
0.0,36.549522,133.770284,33.932723,-1.5758,-0.184037,0.000206
0.0,36.018835,134.952098,31.043325,-2.474525,-0.171061,0.000191
0.0,33.87511,130.337231,35.58652,-3.960741,-0.168459,0.000191
0.0,30.352287,119.734957,32.794791,-7.41829,-0.433453,0.000435


[0.5536214314317605,
 0.6095985679095276,
 0.49865235670438945,
 0.58304423894473,
 0.4529896844633885]

### 交差検証のスコアを計算する短いコード

In [14]:
# Short form: let cross_val_score run the 5-fold fit/score cycle in one call.
# Note that `scores` is rebound here from the earlier list to a NumPy array.
model = LinearRegression()
scores = cross_val_score(estimator = model, X = X, y = y, cv = 5)
display(scores)

array([0.55362143, 0.60959857, 0.49865236, 0.58304424, 0.45298968])

### モデル別のスコアを比較する

In [24]:
# Model formulas (patsy syntax) for the candidate models being compared.
# These are formulas, not link functions.
f1 = "value_per_sq_ft ~ units + sq_ft + boro"
f2 = "value_per_sq_ft ~ units * sq_ft + boro"
f3 = "value_per_sq_ft ~ units + sq_ft * boro + type"
f4 = "value_per_sq_ft ~ units + sq_ft * boro + sq_ft * type"
f5 = "value_per_sq_ft ~ boro + type"
formulas = [f1, f2, f3, f4, f5]

# Response / design matrices for each formula.
# The numbered names are kept so that other cells can still refer to them.
(y1, X1), (y2, X2), (y3, X3), (y4, X4), (y5, X5) = [
    dmatrices(formula, housing) for formula in formulas
]

# 5-fold cross-validation scores for each model (same estimator settings)
model = LinearRegression()
scores1, scores2, scores3, scores4, scores5 = [
    cross_val_score(model, X_k, y_k, cv = 5)
    for X_k, y_k in [(X1, y1), (X2, y2), (X3, y3), (X4, y4), (X5, y5)]
]

# Mean score per model.
# The means are computed directly and stored as a plain float Series, instead
# of building a list-typed Series and mapping a Python lambda over it
# (map_elements without return_dtype).
import statistics
scores_series = pl.Series(
    "Scores",
    [
        statistics.mean(fold_scores.tolist())
        for fold_scores in (scores1, scores2, scores3, scores4, scores5)
    ],
)
display(scores_series)

Scores
f64
0.539581
-11.104634
0.412657
0.418591
0.546768
