### Mushroom Classificationデータセットを使用して、キノコが有毒か無毒かをロジスティック回帰で予測

In [1]:
# ロジスティック回帰

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.preprocessing import LabelEncoder

In [3]:
# pandas display settings.
# Use the fully qualified 'display.*' keys: set_option treats its argument as a
# regex, and the bare names 'max_columns' / 'max_rows' also match the
# 'styler.render.*' options on newer pandas, which raises OptionError.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500000)

In [4]:
# Load the mushroom dataset; the relative path means 'mushrooms.csv' must be in
# the notebook's working directory.
mushroom_data = pd.read_csv('mushrooms.csv')

In [5]:
# Narrow the frame to the target ('class') and the two features used by the
# models below ('gill-size', 'bruises').
kept_columns = ['class', 'gill-size', 'bruises']
mushroom_data = mushroom_data[kept_columns]

In [6]:
# Build mushroom_data2: a copy of mushroom_data whose feature columns are
# replaced by dummy (one-hot) variables named '<column>_<level>'.
# Feature columns = every column after 'class', i.e. 'gill-size' and 'bruises'.
mushroomcol = mushroom_data.columns[1:]
# drop_first=True drops the first level of each feature, so a reference level
# is left out and the intercept added later is not collinear with the dummies.
# dtype=int keeps the dummies numeric: pandas >= 2.0 emits bool dummies by
# default, and selecting them together with the float 'const' column yields an
# object-dtype design matrix that statsmodels refuses to fit.
mushroom_data2 = pd.get_dummies(
    mushroom_data,
    columns=mushroomcol,
    drop_first=True,
    dtype=int,
)

In [7]:
# Sanity check: (rows, columns) after dummy encoding.
mushroom_data2.shape

(8124, 3)

In [8]:
# Convert the string labels in 'class' to integer codes with LabelEncoder.
# NOTE(review): LabelEncoder presumably assigns codes in sorted label order, so
# which label becomes 1 (the outcome the Logit models predict) should be
# confirmed via labelEncoder.classes_.
labelEncoder = LabelEncoder()
raw_class_labels = mushroom_data2['class']
# Overwrite the 'class' column with its encoded version.
mushroom_data2['class'] = labelEncoder.fit_transform(raw_class_labels)

In [14]:
# Prepare the data for the statsmodels logistic-regression models.
# The models use an intercept, so add a 'const' column; it is passed to
# sm.Logit explicitly as part of the design matrix.
mushroom_data2 = sm.add_constant(mushroom_data2)

In [18]:
# Model 1: logistic regression predicting the mushroom's 'class' (poisonous vs.
# edible) from the presence of bruises only, plus the intercept.
class_codes = mushroom_data2['class']
bruises_design = mushroom_data2[['const', 'bruises_t']]
logit = sm.Logit(class_codes, bruises_design)
# fit() estimates the coefficients by maximum likelihood and prints the
# optimizer's convergence report.
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.559154
         Iterations 5


In [20]:
# Show the fit report for the bruises-only model (coefficients, standard
# errors, z statistics, p-values, confidence intervals).
result.summary()

0,1,2,3
Dep. Variable:,class,No. Observations:,8124.0
Model:,Logit,Df Residuals:,8122.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 30 Dec 2019",Pseudo R-squ.:,0.1926
Time:,11:28:09,Log-Likelihood:,-4542.6
converged:,True,LL-Null:,-5625.9
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8158,0.031,25.920,0.000,0.754,0.877
bruises_t,-2.2997,0.054,-42.295,0.000,-2.406,-2.193


In [25]:
# Hand-computed prediction from the bruises-only model for a mushroom WITH
# bruises (bruises_t = 1): probability = 1 / (1 + exp(-(const + coef * x))).
# The coefficients are the rounded values copied from the summary table.
# NOTE(review): this is the probability of class == 1; confirm which label the
# encoder mapped to 1.
intercept_m1 = 0.8158
bruises_coef_m1 = -2.2997
log_odds_m1 = intercept_m1 + (bruises_coef_m1 * 1)
test1 = 1 / (1 + np.exp(-log_odds_m1))

In [26]:
# Express the predicted probability as a percentage with two decimals.
round(test1 * 100, 2)

18.48

In [30]:
# Model 2: this time use both the presence of bruises and the gill size as
# features (plus the intercept).
class_codes_m2 = mushroom_data2['class']
bruises_gill_design = mushroom_data2[['const', 'bruises_t', 'gill-size_n']]
logit2 = sm.Logit(class_codes_m2, bruises_gill_design)
# fit() estimates the coefficients by maximum likelihood and prints the
# optimizer's convergence report.
result2 = logit2.fit()

Optimization terminated successfully.
         Current function value: 0.463890
         Iterations 6


In [31]:
# Show the fit report for the two-feature model; the 'coef' column of
# summary() holds the estimated coefficients used in the manual prediction.
result2.summary()

0,1,2,3
Dep. Variable:,class,No. Observations:,8124.0
Model:,Logit,Df Residuals:,8121.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 30 Dec 2019",Pseudo R-squ.:,0.3301
Time:,11:43:57,Log-Likelihood:,-3768.6
converged:,True,LL-Null:,-5625.9
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0046,0.038,0.121,0.904,-0.070,0.079
bruises_t,-1.9086,0.060,-31.721,0.000,-2.027,-1.791
gill-size_n,2.5106,0.073,34.363,0.000,2.367,2.654


In [41]:
# Hand-computed prediction from the two-feature model for a mushroom with
# bruises (bruises_t = 1) and gill-size_n = 0:
# probability = 1 / (1 + exp(-(const + b1 * bruises_t + b2 * gill_size_n))).
# The coefficients are the rounded values copied from the summary table.
# NOTE(review): this is the probability of class == 1; confirm which label the
# encoder mapped to 1.
intercept_m2 = .0046
bruises_coef_m2 = -1.9086
gill_size_coef_m2 = 2.5106
log_odds_m2 = intercept_m2 + (bruises_coef_m2 * 1) + (gill_size_coef_m2 * 0)
test2 = 1 / (1 + np.exp(-log_odds_m2))

In [42]:
# Express the predicted probability as a percentage with two decimals.
round(test2 * 100, 2)

12.97