In [2]:
# ライブラリのロード
import pandas as pd
from io import StringIO
import numpy as np

In [3]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score



In [5]:
class SBS():
    
    def __init__(self, estimator, k_features, scoring=accuracy_score, test_size=0.25, random_state=1):
        self.scoring = scoring # 特徴量を評価する指標
        self.estimator = clone(estimator) # 推定器
        self.k_features = k_features # 選択する特徴量の個数
        self.test_size = test_size # テストデータの割合
        self.random_state = random_state # 乱数種を固定するrandom_state
        
    def fit(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)
        # 全ての特徴量の個数, 列インデックス
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        # 全ての特徴量を用いてスコアを算出
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        # スコアを格納
        self.scores_ = [score]
        # 指定した特徴量の個数になるまで処理を反復
        while dim > self.k_features:
            # 空のリストの生成（スコア, 列インデックス）
            scores = []
            subsets = []
            
            # 特徴量の部分集合を表す列インデックスの組み合わせごとに処理を反復
            for p in combinations(self.indices_, r=dim-1):
                # スコアを算出して格納
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                # 特徴量の部分集合を表す列インデックスのリストを格納
                subsets.append(p)
                
            # 最良のスコアのインデックスを抽出
            best = np.argmax(scores)
            # 最良のスコアとなる列インデックスを抽出して格納
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            # 特徴量の個数を１つだけ減らして次のステップへ
            dim -= 1
            
            # スコアを格納
            self.scores_.append(scores[best])
            
        # 最後に格納したスコア
        self.k_score_ = self.scores_[-1]
        
        return self
    
    def transform(self, X):
        # 抽出した特徴量を返す
        return X[:, self.indices_]
    
    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        # 指定された列番号indicesの特徴量を抽出してモデルに適合
        self.estimator.fit(X_train[:, indices], y_train)
        # テストデータを用いてクラスラベルを予測
        y_pred = self.estimator.predict(X_test[:, indices])
        # 真のクラスラベルと予測値を用いてスコアを算出
        score = self.scoring(y_test, y_pred)
        return score

In [6]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [7]:
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [9]:
X_train.shape

(124, 13)

In [10]:
dim = X_train.shape[1]

In [11]:
dim

13

In [12]:
indices_ = tuple(range(dim))

In [13]:
indices_

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)

In [14]:
subsets_ = [indices_]

In [15]:
subsets_

[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]

In [16]:
# k近傍分類器のインスタンスを生成（近傍点=2）
knn = KNeighborsClassifier(n_neighbors=2)

def _calc_score(X_train, y_train, X_test, y_test, indices, estimator):
        # 指定された列番号indicesの特徴量を抽出してモデルに適合
        estimator.fit(X_train[:, indices], y_train)
        # テストデータを用いてクラスラベルを予測
        y_pred = estimator.predict(X_test[:, indices])
        # 真のクラスラベルと予測値を用いてスコアを算出
        score = scoring(y_test, y_pred)
        return score

In [17]:
arr = np.array([[1,2,3], [4,5,6]])

In [18]:
dim = arr.shape[1]

In [19]:
indices = tuple(range(dim))

In [20]:
indices

(0, 1, 2)

In [21]:
for p in combinations(indices, dim-1):
    
    print(p)

(0, 1)
(0, 2)
(1, 2)


In [22]:
dim = X_train.shape[1]
indices = tuple(range(dim))
subsets = [indices]
print(dim)
print(indices)
print(subsets)

13
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]


In [23]:
from sklearn.metrics import accuracy_score
scoring = accuracy_score

scores = []

# 特徴量の部分集合を表す列インデックスの組み合わせごとに処理を反復
for p in combinations(indices, r=dim-1):
    
    print(p)
    # スコアを算出して格納
    score = _calc_score(X_train, y_train, X_test, y_test, p, knn)
    scores.append(score)
    # 特徴量の部分集合を表す列インデックスのリストを格納
    subsets.append(p)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12)
(0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12)
(0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12)
(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12)
(0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12)
(0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12)
(0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12)
(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)
(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)


In [24]:
scores

[0.79629629629629628,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663,
 0.7407407407407407,
 0.68518518518518523,
 0.66666666666666663,
 0.66666666666666663,
 0.66666666666666663]

In [25]:
# 最良のスコアのインデックスを抽出
best = np.argmax(scores)
best

0

In [26]:
indices = subsets[best]
indices

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)

In [27]:
subsets

[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11),
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]

In [28]:
for t in subsets:
    
    print(len(t))

13
12
12
12
12
12
12
12
12
12
12
12
12
12


## <font color='blue'>SBSのプログラムの流れを追う</font>

変数の説明

dim : 特徴量
indices_ : 特徴量（並び替える）
subsets_ : 特徴量（結果が一番よくなる特徴量）

score : 固定された特徴量の次元での結果
scores : 固定された特徴量の次元での結果を全て入れたもの
subsets : 固定された特徴量の次元において, 特徴量の並び順

In [29]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/wine/wine.data',
                      header=None)

df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
                   'Proline']

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

Class labels [1 2 3]


In [30]:
# k近傍分類器のインスタンスを生成（近傍点=2）
knn = KNeighborsClassifier(n_neighbors=2)

def _calc_score(X_train, y_train, X_test, y_test, indices, estimator):
        # 指定された列番号indicesの特徴量を抽出してモデルに適合
        estimator.fit(X_train[:, indices], y_train)
        # テストデータを用いてクラスラベルを予測
        y_pred = estimator.predict(X_test[:, indices])
        # 真のクラスラベルと予測値を用いてスコアを算出
        score = scoring(y_test, y_pred)
        return score

### <font color='blue'>まずは全ての特徴量を用いてスコアを算出する</font>

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [32]:
dim = X_train.shape[1] # 次元
indices_ = tuple(range(dim)) # combinationsで用いる
subsets_ = [indices_] # indices_をリストに変換

In [33]:
dim, indices_, subsets_

(13,
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 [(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)])

In [34]:
# 全ての特徴量を用いてスコアを算出
score = _calc_score(X_train, y_train, X_test, y_test, indices_, knn)

In [35]:
# 全ての特徴量を用いてスコアを算出した
score

0.66666666666666663

In [36]:
scores_ = [score]

In [37]:
scores_

[0.66666666666666663]

### <font color='blue'>以下からは次元を削除しながらスコアを算出する</font>

In [38]:
scores = []
subsets = [] # 特徴量が入る

In [39]:
i = 0
# まずは特徴量を１つ削除する
for p in combinations(indices_, r=dim-1):
    
    print(i)
    print(p)
    # スコアを算出
    score = _calc_score(X_train, y_train, X_test, y_test, p, knn)
    scores.append(score)
    # 特徴量の部分集合を保存
    subsets.append(p)
    i += 1

0
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
1
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12)
2
(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12)
3
(0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12)
4
(0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12)
5
(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12)
6
(0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12)
7
(0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12)
8
(0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12)
9
(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12)
10
(0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
11
(0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
12
(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)


In [40]:
scores, len(scores)

([0.85185185185185186,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.7407407407407407,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663,
  0.66666666666666663],
 13)

In [41]:
best = np.argmax(scores)
best

0

In [42]:
indices_ = subsets[best]

In [43]:
# １２番目の特徴量が削除されている
indices_

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)

In [44]:
subsets_.append(indices_)

In [45]:
subsets_

[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)]

In [46]:
scores_.append(scores[best])

In [47]:
scores_

[0.66666666666666663, 0.85185185185185186]

In [49]:
dim -= 1