## 16.2 複数のモデルを比較する
### 16.2.1 線形モデルの比較

### データを読み込む

In [None]:
import polars as pl

# Load the housing data set; the path is relative to the working directory.
housing_csv = "../data/housing_renamed.csv"
housing = pl.read_csv(housing_csv)

# Preview the first rows of the loaded table.
housing.head()

### 各モデルの学習

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Candidate model specifications, all with value_per_sq_ft as the response.
# In the formula syntax, `a * b` is shorthand for `a + b + a:b`
# (both main effects plus their interaction).
f1 = "value_per_sq_ft ~ units + sq_ft + boro"
f2 = "value_per_sq_ft ~ units * sq_ft + boro"
f3 = "value_per_sq_ft ~ units + sq_ft * boro + type"
f4 = "value_per_sq_ft ~ units + sq_ft * boro + sq_ft * type"
f5 = "value_per_sq_ft ~ boro + type"

# Fit every specification by ordinary least squares on the same data,
# in formula order, and bind each result to its own name.
house1, house2, house3, house4, house5 = (
    smf.ols(formula, data = housing).fit()
    for formula in (f1, f2, f3, f4, f5)
)

In [None]:
# Show the estimated parameters of the first fitted model.
house1.params

In [None]:
# List the labels of the parameter index of the first fitted model.
house1.params.index.tolist()

In [None]:
# The models have different numbers of parameters. pandas aligns Series of
# different lengths on their index when concatenating column-wise, which makes
# this step simpler with pandas.
import pandas as pd

mod_results = (
    pd.concat(
        [
            house1.params,
            house2.params,
            house3.params,
            house4.params,
            house5.params,
        ],
        axis = 1,
        # Label each column with its model name.
        keys = ["house1", "house2", "house3", "house4", "house5"],
    )
    # Turn the parameter labels into a regular column called "param".
    .rename_axis("param")
    .reset_index()
    # Reshape to long form: one row per (param, model) pair.
    .melt(id_vars = "param", var_name = "model", value_name = "estimate")
)
mod_results

### モデル毎の係数の推定値を可視化する

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One fixed color per model, keyed by the values of the "model" column.
color_dict = {
    "house1": "#d7191c",
    "house2": "#fdae61",
    "house3": "#ffffbf",
    "house4": "#abdda4",
    "house5": "#2b83ba",
}

fig, ax = plt.subplots()

# Plot every estimate as a point: parameters on the y axis, estimates on the
# x axis, colored by model. The axes is passed explicitly so the plot does not
# depend on matplotlib's implicit "current axes" state.
sns.pointplot(
    x = "estimate",
    y = "param",
    hue = "model",
    data = mod_results,
    dodge = True,  # offset the models so points for one parameter do not overlap
    linestyle = "none",  # draw the points only, without connecting lines
    palette = color_dict,
    ax = ax,
)

fig.tight_layout()
plt.show()

### ANOVAを用いて、モデルを評価する

In [None]:
model_names = ["house1", "house2", "house3", "house4", "house5"]

# Call anova_lm through the public `sm.stats` namespace. A bare
# `import statsmodels` does not by itself load the `statsmodels.stats.anova`
# submodule, so the fully qualified path only resolves when some other import
# has already loaded it.
# NOTE(review): with several models the table compares each model with the one
# before it, which presumes nested models. This sequence is not a strictly
# nested chain (e.g. f3 drops the units:sq_ft interaction that f2 has), so
# confirm the comparison is intended.
house_anova = sm.stats.anova_lm(house1, house2, house3, house4, house5)

# Rows follow the order of the arguments; label them with the model names.
house_anova.index = model_names

house_anova

### AIC・BICを用いてモデルを評価する

In [None]:
house_models = [house1, house2, house3, house4, house5]

# One row per model: its name followed by both information criteria.
abic = pd.DataFrame(
    [(name, fit.aic, fit.bic) for name, fit in zip(model_names, house_models)],
    columns = ["model", "aic", "bic"],
)

# Order the models by ascending AIC, using BIC to break ties.
abic.sort_values(by = ["aic", "bic"])