## 13.3 カテゴリ変数を使うモデル
カテゴリ変数を用いた```statsmodels```と```sklearn```による重回帰分析のスニペット。

```polars```の```to_dummies()```メソッドは、列を指定しないと数値型の列もダミー変数に変換してしまうので注意(カテゴリ変数の列だけを選択してから適用する)。

### (共通)データの読み込み
```seaborn```の```tips```データセットを用いる。

In [1]:
import polars as pl
import seaborn as sns

# Load seaborn's "tips" dataset and wrap it in a polars DataFrame.
tips = pl.DataFrame(sns.load_dataset("tips"))
display(tips.head())

# Show the column names followed by their dtypes.
display(tips.columns, tips.dtypes)

# Show the distinct levels of each categorical column.
display(
    tips["sex"].unique().to_numpy(),
    tips["smoker"].unique().to_numpy(),
    tips["day"].unique().to_numpy(),
    tips["time"].unique().to_numpy(),
)

total_bill,tip,sex,smoker,day,time,size
f64,f64,cat,cat,cat,cat,i64
16.99,1.01,"""Female""","""No""","""Sun""","""Dinner""",2
10.34,1.66,"""Male""","""No""","""Sun""","""Dinner""",3
21.01,3.5,"""Male""","""No""","""Sun""","""Dinner""",3
23.68,3.31,"""Male""","""No""","""Sun""","""Dinner""",2
24.59,3.61,"""Female""","""No""","""Sun""","""Dinner""",4


['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

[Float64, Float64, Categorical, Categorical, Categorical, Categorical, Int64]

array(['Male', 'Female'], dtype=object)

array(['Yes', 'No'], dtype=object)

array(['Thur', 'Fri', 'Sat', 'Sun'], dtype=object)

array(['Lunch', 'Dinner'], dtype=object)

### 13.3.1 statsmodels

In [2]:
import statsmodels.formula.api as smf

# Fit an OLS model of `tip` from the formula below.
# `sex * smoker` contributes both main effects and their interaction term.
model = smf.ols(
    formula="tip ~ total_bill + size + sex * smoker + day + time",
    data=tips,
).fit()

# Show, in order: the fit summary, the estimated coefficients,
# and the confidence interval of each coefficient.
display(
    model.summary(),
    model.params,
    model.conf_int(),
)

0,1,2,3
Dep. Variable:,tip,R-squared:,0.477
Model:,OLS,Adj. R-squared:,0.457
Method:,Least Squares,F-statistic:,23.7
Date:,"Mon, 01 Jan 2024",Prob (F-statistic):,1.49e-28
Time:,16:09:48,Log-Likelihood:,-345.92
No. Observations:,244,AIC:,711.8
Df Residuals:,234,BIC:,746.8
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6463,0.363,1.782,0.076,-0.068,1.361
sex[T.Male],0.1573,0.178,0.882,0.379,-0.194,0.509
smoker[T.Yes],0.2350,0.236,0.997,0.320,-0.229,0.699
day[T.Sat],-0.1135,0.308,-0.368,0.713,-0.721,0.494
day[T.Sun],-0.0151,0.320,-0.047,0.962,-0.646,0.615
day[T.Thur],-0.1095,0.393,-0.279,0.781,-0.884,0.665
time[T.Lunch],0.0565,0.443,0.128,0.899,-0.816,0.929
sex[T.Male]:smoker[T.Yes],-0.4929,0.284,-1.737,0.084,-1.052,0.066
total_bill,0.0956,0.010,9.977,0.000,0.077,0.114

0,1,2,3
Omnibus:,29.502,Durbin-Watson:,2.106
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59.303
Skew:,0.617,Prob(JB):,1.33e-13
Kurtosis:,5.076,Cond. No.,206.0


Intercept                    0.646304
sex[T.Male]                  0.157285
smoker[T.Yes]                0.234998
day[T.Sat]                  -0.113541
day[T.Sun]                  -0.015137
day[T.Thur]                 -0.109541
time[T.Lunch]                0.056520
sex[T.Male]:smoker[T.Yes]   -0.492921
total_bill                   0.095595
size                         0.174441
dtype: float64

Unnamed: 0,0,1
Intercept,-0.068314,1.360922
sex[T.Male],-0.194146,0.508715
smoker[T.Yes],-0.229375,0.699371
day[T.Sat],-0.721246,0.494165
day[T.Sun],-0.645554,0.615281
day[T.Thur],-0.88362,0.664538
time[T.Lunch],-0.81581,0.928849
sex[T.Male]:smoker[T.Yes],-1.052115,0.066273
total_bill,0.076718,0.114473
size,-0.0012,0.350081


### 13.3.2 sklearn

In [3]:
import polars.selectors as cs
from sklearn import linear_model

# Build the design matrix.
# Numeric predictors: every numeric column except the target `tip`.
df_numeric = tips.drop("tip").select(cs.numeric())

# Dummy-encode only the categorical columns; calling to_dummies() on the
# whole frame would also encode the numeric columns.
df_categorical = tips.select(cs.categorical())
# NOTE: drop_first=True removes one level per column. In the output below the
# dropped `day` level is "Sun", whereas the statsmodels fit uses "Fri" as its
# baseline, so the `day` coefficients are not directly comparable.
df_dummy = df_categorical.to_dummies(drop_first=True)

# Place the numeric and dummy columns side by side as the training frame.
# NOTE: there is no sex-by-smoker interaction column here, so this is not the
# same model as the statsmodels formula with `sex * smoker`.
df_study = pl.concat([df_numeric, df_dummy], how="horizontal")
display(df_study.head())

# Fit ordinary least squares with `tip` as the target.
lr = linear_model.LinearRegression()
model = lr.fit(df_study, tips["tip"])

# Show the fitted coefficients (in df_study column order), then the intercept.
display(model.coef_, model.intercept_)

total_bill,size,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Thur,time_Lunch
f64,i64,u8,u8,u8,u8,u8,u8
16.99,2,0,0,0,0,0,0
10.34,3,1,0,0,0,0,0
21.01,3,1,0,0,0,0,0
23.68,2,1,0,0,0,0,0
24.59,4,0,0,0,0,0,0


array([ 0.09448701,  0.175992  , -0.03244094, -0.08640832,  0.02548066,
       -0.09597772, -0.13677854,  0.0681286 ])

0.7783366231954139