### RFE（再帰的特徴消去）で有効な特徴量を選択し、ロジスティック回帰でキノコを有毒／無毒に分類

In [1]:
%matplotlib inline

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

In [2]:
# Scikit-learn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
# Plot style and pandas display options.
# A single sns.set call is enough: every call re-applies the whole theme, so a
# preceding style="dark" call would be overwritten by this one anyway.
sns.set(style="darkgrid", color_codes=True)
# Show up to 50 rows / 50 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

In [4]:
# Load the raw mushroom table from the CSV file in the working directory.
mushroom_data = pd.read_csv(filepath_or_buffer='mushrooms.csv')

In [5]:
# Feature columns are every column after the first one (the first column is
# the 'class' target).
mushroom_col = mushroom_data.columns[1:]

# One-hot encode the feature columns; drop_first=True omits the first dummy
# level of each encoded column.
mushroom_data2 = pd.get_dummies(
    mushroom_data,
    columns=mushroom_col,
    drop_first=True,
)

In [6]:
# Display the names of the feature columns (all columns after the first).
mushroom_col

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [7]:
# Summary statistics of the raw (not yet encoded) table.
mushroom_data.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [8]:
# Convert the values of the target column 'class' to integer codes.
# NOTE(review): presumably 'e' -> 0 and 'p' -> 1; confirm with
# labelEncoder.classes_.
labelEncoder = LabelEncoder()
labelEncoder.fit(mushroom_data2['class'])
mushroom_data2['class'] = labelEncoder.transform(mushroom_data2['class'])

In [9]:
# Split into training and test data: 20% of the rows are held out for testing,
# and the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(
    mushroom_data2,
    random_state=42,
    test_size=0.2,
)

In [10]:
# Split each set into features (every column after the first) and the target
# (the first column, 'class').
# The target is taken with iloc[:, 0] so it is a 1-D Series rather than a
# one-column DataFrame: scikit-learn expects y with shape (n_samples,) and
# emits a DataConversionWarning when it is handed a column vector.
X_train = train_set.iloc[:, 1:]
y_train = train_set.iloc[:, 0]

X_test = test_set.iloc[:, 1:]
y_test = test_set.iloc[:, 0]

In [11]:
# Feature selection with RFE (recursive feature elimination): a logistic
# regression is refit repeatedly, removing one feature per step (the default
# step), until 5 features remain.
logreg = LogisticRegression()
# n_features_to_select is passed by keyword: newer scikit-learn releases accept
# it as a keyword-only argument, so the positional form RFE(logreg, 5) fails.
rfe = RFE(logreg, n_features_to_select=5, verbose=1)
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning
# raised when y is a column vector (one-column DataFrame).
rfe = rfe.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


Fitting estimator with 95 features.
Fitting estimator with 94 features.
Fitting estimator with 93 features.
Fitting estimator with 92 features.
Fitting estimator with 91 features.
Fitting estimator with 90 features.
Fitting estimator with 89 features.
Fitting estimator with 88 features.
Fitting estimator with 87 features.
Fitting estimator with 86 features.
Fitting estimator with 85 features.
Fitting estimator with 84 features.
Fitting estimator with 83 features.
Fitting estimator with 82 features.




Fitting estimator with 81 features.
Fitting estimator with 80 features.
Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.




Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.




Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.




Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.




Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.




Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.




In [12]:
# Display the fitted RFE object and its parameters.
rfe

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
  n_features_to_select=5, step=1, verbose=1)

In [13]:
# Boolean mask over the columns of X_train: True marks a feature kept by RFE.
rfe.support_

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [14]:
# Keep only the columns that RFE selected, in both the training and test
# features (rfe.support_ is a boolean mask over the columns).
X_train_sel = X_train.loc[:, rfe.support_]
X_test_sel = X_test.loc[:, rfe.support_]

In [15]:
# Display the names of the features selected by RFE.
X_train.columns[rfe.support_]

Index(['odor_c', 'odor_n', 'odor_p', 'spore-print-color_k',
       'spore-print-color_n'],
      dtype='object')

In [16]:
# odor_c = odor (creosote)
# odor_n = odor (none)
# odor_p = odor (pungent)
# spore-print-color_k = spore print color (black)
# spore-print-color_n = spore print color (brown)

In [17]:
# Train the model on the training data, using only the selected features.
logclassifier = LogisticRegression()
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning
# raised when y is a column vector (one-column DataFrame).
logclassifier.fit(X_train_sel, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Predict on the training data (selected features only).
y_pred = logclassifier.predict(X=X_train_sel)

In [19]:
# Evaluate the training predictions with a confusion matrix
# (rows: true class, columns: predicted class).
cnf_matrix = confusion_matrix(y_true=y_train, y_pred=y_pred)

In [20]:
# Display the confusion matrix for the training data.
cnf_matrix

array([[3331,   34],
       [  95, 3039]], dtype=int64)

In [21]:
# Accuracy on the training data.
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9801507924296046

In [22]:
# Predict on the held-out test data (selected features only).
y_pred_test = logclassifier.predict(X=X_test_sel)

In [23]:
# Evaluate the test predictions with a confusion matrix
# (rows: true class, columns: predicted class).
cnf_matrix_test = confusion_matrix(y_true=y_test, y_pred=y_pred_test)

In [24]:
# Display the confusion matrix for the test data.
cnf_matrix_test

array([[829,  14],
       [ 25, 757]], dtype=int64)

In [25]:
# Accuracy on the test data.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

0.976