# 第14章 一般化線形モデル

## 14.1 ロジスティック回帰

In [1]:
import seaborn as sns
titanic = sns.load_dataset("titanic")

In [2]:
titanic_sub = titanic[["survived", "sex", "age", "embarked"]].copy().dropna()
print(titanic_sub)

     survived     sex   age embarked
0           0    male  22.0        S
1           1  female  38.0        C
2           1  female  26.0        S
3           1  female  35.0        S
4           0    male  35.0        S
..        ...     ...   ...      ...
885         0  female  39.0        Q
886         0    male  27.0        S
887         1  female  19.0        S
889         1    male  26.0        C
890         0    male  32.0        Q

[712 rows x 4 columns]


In [3]:
print(titanic_sub["survived"].value_counts())

survived
0    424
1    288
Name: count, dtype: int64


In [4]:
print(titanic_sub["embarked"].value_counts())

embarked
S    554
C    130
Q     28
Name: count, dtype: int64


### 14.1.1 statsmodelsを使う

In [5]:
import statsmodels.formula.api as smf

form = "survived ~ sex + age + embarked"

py_logistic_smf = smf.logit(formula = form, data = titanic_sub).fit()

print(py_logistic_smf.summary())

Optimization terminated successfully.
         Current function value: 0.509889
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               survived   No. Observations:                  712
Model:                          Logit   Df Residuals:                      707
Method:                           MLE   Df Model:                            4
Date:                Sat, 16 Dec 2023   Pseudo R-squ.:                  0.2444
Time:                        22:41:47   Log-Likelihood:                -363.04
converged:                       True   LL-Null:                       -480.45
Covariance Type:            nonrobust   LLR p-value:                 1.209e-49
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         2.2046      0.322      6.851      0.000       1.574       2.835
sex[T.male]      -2.

In [6]:
import numpy as np
import pandas as pd

res_sm = pd.DataFrame(py_logistic_smf.params, columns = ["coefs_sm"])

res_sm["odds_sm"] = np.exp( res_sm["coefs_sm"] )

print(res_sm.round(3))

               coefs_sm  odds_sm
Intercept         2.205    9.066
sex[T.male]      -2.476    0.084
embarked[T.Q]    -1.816    0.163
embarked[T.S]    -1.007    0.365
age              -0.008    0.992


### 14.1.2 sklearnを使う

In [7]:
titanic_dummy = pd.get_dummies(
    titanic_sub[["survived", "sex", "age", "embarked"]],
    drop_first = True
)

print(titanic_dummy)

     survived   age  sex_male  embarked_Q  embarked_S
0           0  22.0      True       False        True
1           1  38.0     False       False       False
2           1  26.0     False       False        True
3           1  35.0     False       False        True
4           0  35.0      True       False        True
..        ...   ...       ...         ...         ...
885         0  39.0     False        True       False
886         0  27.0      True       False        True
887         1  19.0     False       False        True
889         1  26.0      True       False       False
890         0  32.0      True        True       False

[712 rows x 5 columns]


In [8]:
from sklearn import linear_model

py_logistic_sklearn = linear_model.LogisticRegression().fit(
    X = titanic_dummy.iloc[:, 1:],
    y = titanic_dummy.iloc[:, 0]
)

In [10]:
dummy_names = titanic_dummy.columns.to_list()

sk1_res1 = pd.DataFrame(
    py_logistic_sklearn.intercept_,
    index = ["Intercept"],
    columns = ["coef_sk1"],
)
sk1_res2 = pd.DataFrame(
    py_logistic_sklearn.coef_.T,
    index = dummy_names[1:],
    columns = ["coef_sk1"],
)

res_sklearn_pd_1 = pd.concat([sk1_res1, sk1_res2])

res_sklearn_pd_1["odds_sk1"] = np.exp(res_sklearn_pd_1["coef_sk1"])

print(res_sklearn_pd_1.round(3))


            coef_sk1  odds_sk1
Intercept      2.024     7.571
age           -0.008     0.992
sex_male      -2.372     0.093
embarked_Q    -1.369     0.254
embarked_S    -0.887     0.412


### 14.1.3 sklearnのデフォルトに注意

In [12]:
py_logistic_sklearn2 = linear_model.LogisticRegression(
    penalty = None
    )\
        .fit(
            X = titanic_dummy.iloc[:, 1:],
            y = titanic_dummy.iloc[:, 0]
            )

sk2_res1 = pd.DataFrame(
    py_logistic_sklearn2.intercept_,
    index = ["Intercept"],
    columns = ["coef_sk1"],
)
sk2_res2 = pd.DataFrame(
    py_logistic_sklearn2.coef_.T,
    index = dummy_names[1:],
    columns = ["coef_sk1"],
)

res_sklearn_pd_2 = pd.concat([sk2_res1, sk1_res2])

res_sklearn_pd_2["odds_sk1"] = np.exp(res_sklearn_pd_2["coef_sk1"])

print(res_sklearn_pd_2.round(3))

            coef_sk1  odds_sk1
Intercept      2.205     9.066
age           -0.008     0.992
sex_male      -2.372     0.093
embarked_Q    -1.369     0.254
embarked_S    -0.887     0.412


In [13]:
sm_results = res_sm.round(3)

sm_results = sm_results.sort_index()

print(sm_results)

               coefs_sm  odds_sm
Intercept         2.205    9.066
age              -0.008    0.992
embarked[T.Q]    -1.816    0.163
embarked[T.S]    -1.007    0.365
sex[T.male]      -2.476    0.084


In [14]:
sk_results = pd.concat(
    [res_sklearn_pd_1.round(3), res_sklearn_pd_2.round(3)],
    axis = "columns",
)

sk_results = sk_results[sk_results.columns.sort_values()]
sk_results = sk_results.sort_index()

print(sk_results)

            coef_sk1  coef_sk1  coef_sk1  coef_sk1  odds_sk1  odds_sk1  \
Intercept      2.024     2.205     2.024     2.205     7.571     9.066   
age           -0.008    -0.008    -0.008    -0.008     0.992     0.992   
embarked_Q    -1.369    -1.369    -1.369    -1.369     0.254     0.254   
embarked_S    -0.887    -0.887    -0.887    -0.887     0.412     0.412   
sex_male      -2.372    -2.372    -2.372    -2.372     0.093     0.093   

            odds_sk1  odds_sk1  
Intercept      7.571     9.066  
age            0.992     0.992  
embarked_Q     0.254     0.254  
embarked_S     0.412     0.412  
sex_male       0.093     0.093  
