In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# scikit-learn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# XGBoost
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Load the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

In [3]:
# Inspect the type of the object returned by load_iris()
type(iris)

sklearn.utils.Bunch

In [4]:
# Class names
print(iris.target_names)

# Distinct values that appear in the target array
print(set(iris.target))

['setosa' 'versicolor' 'virginica']
{0, 1, 2}


In [5]:
# Build a DataFrame with one column per feature and append the class label
# as a 'target' column
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)

iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Split into training and test sets: 20% is held out for testing, and the
# fixed random_state makes the split reproducible
train_set, test_set = train_test_split(iris_df, test_size=0.2, random_state=42)

In [7]:
# Confirm the number of rows and columns in each split
train_set.shape, test_set.shape

((120, 5), (30, 5))

In [8]:
# Separate each split into features (every column except 'target') and
# labels (a copy of the 'target' column)
X_train, y_train = train_set.drop(columns='target'), train_set['target'].copy()
X_test, y_test = test_set.drop(columns='target'), test_set['target'].copy()

In [9]:
# Wrap the pandas features and labels in XGBoost DMatrix objects
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

  if getattr(data, 'base', None) is not None and \


In [10]:
# Confirm both objects are DMatrix instances
type(d_train), type(d_test)

(xgboost.core.DMatrix, xgboost.core.DMatrix)

In [11]:
# Check the feature names held by the training DMatrix
d_train.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [12]:
# Compare the feature types recorded for the training and test DMatrix objects
d_train.feature_types, d_test.feature_types

(['float', 'float', 'float', 'float'], ['float', 'float', 'float', 'float'])

In [13]:
# XGBoost hyperparameters
param = dict(
    max_depth=3,
    eta=0.3,
    # NOTE(review): newer XGBoost releases use 'verbosity' instead of
    # 'silent' -- confirm against the installed version
    silent=0,
    # multi-class softmax objective; with 'softprob', predict() returns one
    # probability per class for every sample
    objective='multi:softprob',
    num_class=3,
)

In [14]:
# Number of boosting rounds (the default is 10)
num_boost_round = 20

In [15]:
# Train the XGBoost model
xgboost_model = xgb.train(param, d_train, num_boost_round)

In [16]:
# Predict on the training data (one row of class probabilities per sample)
y_prob_train = xgboost_model.predict(d_train)

In [17]:
# Predicted class probabilities for the first training sample
y_prob_train[0]

array([0.990855  , 0.00562044, 0.00352454], dtype=float32)

In [18]:
# First row of the training data
train_set.iloc[0]

sepal length (cm)    4.6
sepal width (cm)     3.6
petal length (cm)    1.0
petal width (cm)     0.2
target               0.0
Name: 22, dtype: float64

In [19]:
# Third row of the training data
train_set.iloc[2]

sepal length (cm)    6.7
sepal width (cm)     3.1
petal length (cm)    4.4
petal width (cm)     1.4
target               1.0
Name: 65, dtype: float64

In [20]:
# Predicted class probabilities for the third training sample
y_prob_train[2]

array([0.00569893, 0.98810893, 0.00619213], dtype=float32)

In [21]:
# Most probable class for the first and third training samples
np.argmax(y_prob_train[0]), np.argmax(y_prob_train[2])

(0, 1)

In [22]:
# Convert each row of probabilities to a class label by taking its argmax
y_pred = np.array([np.argmax(line) for line in y_prob_train])

In [23]:
# First five predicted labels
y_pred[:5]

array([0, 0, 1, 0, 0], dtype=int64)

In [30]:
# Vectorised form: argmax along axis 1 of the probability matrix
y_pred_ar = np.argmax(y_prob_train, axis=1)

In [31]:
# First five labels from the vectorised conversion
y_pred_ar[:5]

array([0, 0, 1, 0, 0], dtype=int64)

In [32]:
# Display the confusion matrix for the training data
confusion_matrix(y_train, y_pred, labels=[0, 1, 2])

array([[40,  0,  0],
       [ 0, 41,  0],
       [ 0,  0, 39]], dtype=int64)

In [34]:
# Confusion matrix for the training data using the vectorised predictions
confusion_matrix(y_train, y_pred_ar, labels=[0, 1, 2])

array([[40,  0,  0],
       [ 0, 41,  0],
       [ 0,  0, 39]], dtype=int64)

In [None]:
# Predict on the test data
y_prob_test = xgboost_model.predict(d_test)

In [None]:
# Convert probabilities to class labels: the index of the largest
# probability in each row, computed in one vectorised call
y_pred_test = np.argmax(y_prob_test, axis=1)

In [None]:
# Confusion matrix for the test data. The labels are fixed so the matrix is
# always 3x3 in class order 0, 1, 2, even if a class is absent from the
# test split.
confusion_matrix(y_test, y_pred_test, labels=[0, 1, 2])

In [None]:
# Accuracy on the test data
accuracy_score(y_test, y_pred_test)