# 第13章 線形回帰モデル
seaborn の tips データセットを pandas で扱い、statsmodels と scikit-learn で線形回帰モデルを当てはめる。

## 13.1 単純な線形回帰

In [2]:
import pandas as pd
import seaborn as sns

# Load the "tips" example dataset through seaborn's dataset loader.
tips = sns.load_dataset("tips")
# A bare last expression makes the notebook display the frame.
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


### 13.1.1 statsmodels(Pythonの統計ライブラリ)を使う

In [3]:
import statsmodels.formula.api as smf

In [4]:
# Specify (but do not yet fit) an OLS model: tip as the response,
# total_bill as the single predictor.
model = smf.ols(formula="tip ~ total_bill", data=tips)

In [5]:
# Fit the OLS model specified above and keep the fitted results object.
results = model.fit()

In [6]:
# Print the regression summary table for the fitted model.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.454
Method:                 Least Squares   F-statistic:                     203.4
Date:                Sat, 16 Dec 2023   Prob (F-statistic):           6.69e-34
Time:                        21:58:35   Log-Likelihood:                -350.54
No. Observations:                 244   AIC:                             705.1
Df Residuals:                     242   BIC:                             712.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9203      0.160      5.761      0.0

In [7]:
# Print only the estimated coefficients (intercept and total_bill slope).
print(results.params)

Intercept     0.920270
total_bill    0.105025
dtype: float64


### 13.1.2 sklearn(Pythonの機械学習ライブラリ)を使う

In [8]:
from sklearn import linear_model

In [9]:
# Create an unfitted scikit-learn linear regression estimator.
lr = linear_model.LinearRegression()

In [10]:
# Regress tip on total_bill. The single feature column is reshaped to a
# 2-D (n_samples, 1) array before being passed as X.
# Note: `predicted` is the fitted estimator (its coef_/intercept_ are read
# below), not a vector of predictions.
predicted = lr.fit(
    X=tips["total_bill"].values.reshape(-1, 1),
    y=tips["tip"],
)
print(predicted.coef_)       # slope for total_bill
print(predicted.intercept_)  # intercept

[0.10502452]
0.9202696135546731


## 13.2 重回帰

### 13.2.1 statsmodelsを使う

In [11]:
# Multiple regression: tip explained by total_bill and party size.
# Specification and fitting are chained here, so `model` holds the fitted
# results object rather than the unfitted model.
model = smf.ols(
    formula="tip ~ total_bill + size",
    data=tips,
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.468
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     105.9
Date:                Sat, 16 Dec 2023   Prob (F-statistic):           9.67e-34
Time:                        21:58:36   Log-Likelihood:                -347.99
No. Observations:                 244   AIC:                             702.0
Df Residuals:                     241   BIC:                             712.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6689      0.194      3.455      0.0

### 13.2.2 sklearnを使う

In [12]:
# Same two-predictor regression with scikit-learn; a two-column DataFrame
# is passed directly as the feature matrix.
lr = linear_model.LinearRegression()
predicted = lr.fit(
    X=tips[["total_bill", "size"]],
    y=tips["tip"],
)
print(predicted.coef_)       # coefficients in column order: total_bill, size
print(predicted.intercept_)

[0.09271334 0.19259779]
0.6689447408125022


## 13.3 カテゴリ変数を使うモデル

In [13]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
None


In [14]:
# Show the distinct values of the categorical `sex` column along with its
# category ordering.
print(tips["sex"].unique())

['Female', 'Male']
Categories (2, object): ['Male', 'Female']


### 13.3.1 statsmodelsでカテゴリ変数を使う

In [15]:
# Categorical columns (sex, smoker, day, time) go straight into the formula
# alongside the numeric predictors.
model = smf.ols(
    formula="tip ~ total_bill + size + sex + smoker + day + time",
    data=tips,
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.470
Model:                            OLS   Adj. R-squared:                  0.452
Method:                 Least Squares   F-statistic:                     26.06
Date:                Sat, 16 Dec 2023   Prob (F-statistic):           1.20e-28
Time:                        21:58:36   Log-Likelihood:                -347.48
No. Observations:                 244   AIC:                             713.0
Df Residuals:                     235   BIC:                             744.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.5908      0.256      2.

In [16]:
# Distinct values of `day`, shown together with the category ordering.
tips["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

### 13.3.2 sklearnでカテゴリ変数を使う

#### 13.3.2.1 pandasにおけるダミー変数

In [17]:
# One-hot encode the categorical predictors. Numeric columns (total_bill,
# size) are carried over as-is, and every level of each categorical column
# gets its own indicator column (e.g. both sex_Male and sex_Female).
tips_dummy = pd.get_dummies(
    data=tips[["total_bill", "size", "sex", "smoker", "day", "time"]],
)
print(tips_dummy)

     total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
0         16.99     2     False        True       False       True     False   
1         10.34     3      True       False       False       True     False   
2         21.01     3      True       False       False       True     False   
3         23.68     2      True       False       False       True     False   
4         24.59     4     False        True       False       True     False   
..          ...   ...       ...         ...         ...        ...       ...   
239       29.03     3      True       False       False       True     False   
240       27.18     2     False        True        True      False     False   
241       22.67     2      True       False        True      False     False   
242       17.82     2      True       False       False       True     False   
243       18.78     2     False        True       False       True      True   

     day_Fri  day_Sat  day_Sun  time_Lu

In [18]:
# Reference (dummy) coding: drop_first=True removes the first level of each
# categorical column so that level acts as the baseline and the indicator
# columns are not redundant with the intercept.
x_tips_dummy_ref = pd.get_dummies(
    data=tips[["total_bill", "size", "sex", "smoker", "day", "time"]],
    drop_first=True,
)
print(x_tips_dummy_ref)

     total_bill  size  sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  \
0         16.99     2        True       True    False    False     True   
1         10.34     3       False       True    False    False     True   
2         21.01     3       False       True    False    False     True   
3         23.68     2       False       True    False    False     True   
4         24.59     4        True       True    False    False     True   
..          ...   ...         ...        ...      ...      ...      ...   
239       29.03     3       False       True    False     True    False   
240       27.18     2        True      False    False     True    False   
241       22.67     2       False      False    False     True    False   
242       17.82     2       False       True    False     True    False   
243       18.78     2        True       True    False    False    False   

     time_Dinner  
0           True  
1           True  
2           True  
3           True  
4   

In [19]:
# Fit the regression on the reference-coded dummy matrix.
lr = linear_model.LinearRegression()
predicted = lr.fit(
    X=x_tips_dummy_ref,
    y=tips["tip"],
)
print(predicted.coef_)       # one coefficient per column of x_tips_dummy_ref
print(predicted.intercept_)

[ 0.09448701  0.175992    0.03244094  0.08640832  0.1622592   0.04080082
  0.13677854 -0.0681286 ]
0.5908374259513769


#### 13.3.2.2 sklearnでインデックスラベルを残す

In [20]:
import numpy as np

# Refit on the reference-coded dummies, then pair each coefficient with the
# name of the column it belongs to (the coef_ array itself carries no labels).
lr = linear_model.LinearRegression()
predicted = lr.fit(
    X=x_tips_dummy_ref,
    y=tips["tip"],
)

# Intercept first, followed by the coefficients in column order.
values = np.append(predicted.intercept_, predicted.coef_)

# Matching labels: "intercept" followed by the feature column names.
names = np.append("intercept", x_tips_dummy_ref.columns)

results = pd.DataFrame(
    {
        "variables": names,
        "coef": values,
    }
)

print(results)

     variables      coef
0    intercept  0.590837
1   total_bill  0.094487
2         size  0.175992
3   sex_Female  0.032441
4    smoker_No  0.086408
5      day_Fri  0.162259
6      day_Sat  0.040801
7      day_Sun  0.136779
8  time_Dinner -0.068129


## 13.4 sklearnのone-hotエンコーディングと、トランスフォーマーのパイプライン

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [23]:
# Columns to one-hot encode; everything else is passed through untouched.
categorical_features = ["sex", "smoker", "day", "time"]

# NOTE: drop="first" removes the first level in the encoder's own category
# order (alphabetical here: Female, No, Fri, Dinner), whereas
# pd.get_dummies(drop_first=True) follows the pandas categorical order and
# removed Male, Yes, Thur, Lunch. It is the same model with different
# baseline levels, so the intercept and dummy coefficients are shifted /
# sign-flipped relative to the pandas-based fit, while total_bill and size
# are unchanged.
categorical_transformer = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    [("cat", categorical_transformer, categorical_features)],
    remainder="passthrough",
)

In [24]:
# Chain the preprocessing step and the regression into one estimator so the
# encoding and the fit are applied together.
pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("lr", linear_model.LinearRegression()),
    ]
)

In [25]:
# Fit the whole pipeline on the raw columns; the categorical columns are
# encoded by the preprocessor step before reaching the regression.
pipe.fit(
    tips[["total_bill", "size", "sex", "smoker", "day", "time"]],
    tips["tip"],
)

In [26]:
# Confirm the class of the pipeline object.
print(type(pipe))

<class 'sklearn.pipeline.Pipeline'>


In [28]:
# Read the fitted regression step out of the pipeline by its step name:
# intercept first, then the coefficients.
coefficients = np.append(pipe["lr"].intercept_, pipe["lr"].coef_)

# pipe[:-1] is the pipeline without its final (regression) step; its output
# feature names label the columns the regression was fitted on.
labels = np.append(["intercept"], pipe[:-1].get_feature_names_out())

coefs = pd.DataFrame(
    {
        "variables": labels,
        "coef": coefficients,
    }
)

print(coefs)

               variables      coef
0              intercept  0.803817
1          cat__sex_Male -0.032441
2        cat__smoker_Yes -0.086408
3           cat__day_Sat -0.121458
4           cat__day_Sun -0.025481
5          cat__day_Thur -0.162259
6        cat__time_Lunch  0.068129
7  remainder__total_bill  0.094487
8        remainder__size  0.175992
