### 16.2.2 GLMモデルの比較

### データを読み込む

In [5]:
import polars as pl

housing = pl.read_csv("../data/housing_renamed.csv")
housing.head()

neighborhood,type,units,year_built,sq_ft,income,income_per_sq_ft,expense,expense_per_sq_ft,net_income,value,value_per_sq_ft,boro
str,str,i64,f64,i64,i64,f64,i64,f64,i64,i64,f64,str
"""FINANCIAL""","""R9-CONDOMINIUM…",42,1920.0,36500,1332615,36.51,342005,9.37,990610,7300000,200.0,"""Manhattan"""
"""FINANCIAL""","""R4-CONDOMINIUM…",78,1985.0,126420,6633257,52.47,1762295,13.94,4870962,30690000,242.76,"""Manhattan"""
"""FINANCIAL""","""RR-CONDOMINIUM…",500,,554174,17310000,31.24,3543000,6.39,13767000,90970000,164.15,"""Manhattan"""
"""FINANCIAL""","""R4-CONDOMINIUM…",282,1930.0,249076,11776313,47.28,2784670,11.18,8991643,67556006,271.23,"""Manhattan"""
"""TRIBECA""","""R4-CONDOMINIUM…",239,1985.0,219495,10004582,45.58,2783197,12.68,7221385,54320996,247.48,"""Manhattan"""


### 各モデルの学習

In [6]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [7]:
def deviance_table(*models):
    return pl.DataFrame({
        "df_residuals": [mod.df_resid for mod in models],
        "resid_stddev": [mod.deviance for mod in models],
        "df": [mod.df_model for mod in models],
        "deviance": [mod.deviance for mod in models]
    })


f1 = "value_per_sq_ft ~ units + sq_ft + boro"
f2 = "value_per_sq_ft ~ units * sq_ft + boro"
f3 = "value_per_sq_ft ~ units + sq_ft * boro + type"
f4 = "value_per_sq_ft ~ units + sq_ft * boro + sq_ft * type"
f5 = "value_per_sq_ft ~ boro + type"

glm1 = smf.glm(f1, data = housing).fit()
glm2 = smf.glm(f2, data = housing).fit()
glm3 = smf.glm(f3, data = housing).fit()
glm4 = smf.glm(f4, data = housing).fit()
glm5 = smf.glm(f5, data = housing).fit()

glm_anova = deviance_table(glm1, glm2, glm3, glm4, glm5)
glm_anova

df_residuals,resid_stddev,df,deviance
i64,f64,i64,f64
2619,4922400.0,6,4922400.0
2618,4884900.0,7,4884900.0
2612,4619900.0,13,4619900.0
2609,4576700.0,16,4576700.0
2618,4901500.0,7,4901500.0


### ロジスティック回帰におけるモデル比較

In [10]:
# 二値化
housing = (
    housing.with_columns([
        housing["value_per_sq_ft"].map_elements(lambda x: 1 if x >= 150 else 0).alias("high")
    ])
)
display(housing["high"].value_counts())

f1 = "high ~ units + sq_ft + boro"
f2 = "high ~ units * sq_ft + boro"
f3 = "high ~ units + sq_ft * boro + type"
f4 = "high ~ units + sq_ft * boro + sq_ft * type"
f5 = "high ~ boro + type"

logistic = statsmodels.genmod.families.family.Binomial(
    link = statsmodels.genmod.families.links.Logit()
)

glm1 = smf.glm(f1, data = housing, family = logistic).fit()
glm2 = smf.glm(f2, data = housing, family = logistic).fit()
glm3 = smf.glm(f3, data = housing, family = logistic).fit()
glm4 = smf.glm(f4, data = housing, family = logistic).fit()
glm5 = smf.glm(f5, data = housing, family = logistic).fit()

display(deviance_table(glm1, glm2, glm3, glm4, glm5))

high,counts
i64,u32
0,1619
1,1007


df_residuals,resid_stddev,df,deviance
i64,f64,i64,f64
2619,1695.631547,6,1695.631547
2618,1686.12674,7,1686.12674
2612,1636.49283,13,1636.49283
2609,1619.431515,16,1619.431515
2618,1666.615696,7,1666.615696


In [11]:
model_names = ["house1", "house2", "house3", "house4", "house5"]
mods = [glm1, glm2, glm3, glm4, glm5]

abic_glm = pl.DataFrame({
    "model": model_names,
    "aic": [mod.aic for mod in mods],
    "bic": [mod.bic for mod in mods]
})

abic_glm.sort(by = ["aic", "bic"])



model,aic,bic
str,f64,f64
"""house4""",1653.431515,-18921.791781
"""house3""",1664.49283,-18928.350118
"""house5""",1682.615696,-18945.466554
"""house2""",1702.12674,-18925.95551
"""house1""",1709.631547,-18924.32392
