In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
pd.__version__, sklearn.__version__

('0.23.4', '0.20.1')

In [3]:
# アイリスデータセットの読み込み
iris_data = datasets.load_iris()

In [4]:
type(iris_data)

sklearn.utils.Bunch

In [5]:
# データフレームに変換
iris_df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
iris_df['target'] = iris_data.target

In [6]:
iris_train, iris_test = train_test_split(iris_df, test_size=0.2, random_state=42)

In [7]:
# 特徴量と正解の分割
X_train = iris_train.iloc[:, :4]
y_train = iris_train.iloc[:, 4:]

X_test = iris_test.iloc[:, :4]
y_test = iris_test.iloc[:, 4:]

In [8]:
# Scikit-learnで特徴量の正規化
standardScaler = StandardScaler()
X_train_normal = standardScaler.fit_transform(X_train)
X_test_normal = standardScaler.fit_transform(X_test)

In [9]:
X_train_normal[:5]

array([[-1.47393679,  1.20365799, -1.56253475, -1.31260282],
       [-0.13307079,  2.99237573, -1.27600637, -1.04563275],
       [ 1.08589829,  0.08570939,  0.38585821,  0.28921757],
       [-1.23014297,  0.75647855, -1.2187007 , -1.31260282],
       [-1.7177306 ,  0.30929911, -1.39061772, -1.31260282]])

In [10]:
X_train_normal[:, 0].mean(), X_train_normal[:, 1].mean(), X_train_normal[:, 2].mean(), X_train_normal[:, 3].mean()

(1.7541523789077474e-15,
 -1.6949404842610724e-15,
 -2.294460917558657e-16,
 -2.960594732333751e-17)

In [11]:
X_train_normal[:, 0].std(), X_train_normal[:, 1].std(), X_train_normal[:, 2].std(), X_train_normal[:, 3].std()

(1.0, 0.9999999999999997, 0.9999999999999999, 1.0)

In [13]:
X_train_normal[:, :].mean(axis=0)

array([ 1.71344420e-15, -1.66579713e-15, -2.23894977e-16, -5.73615229e-17])

In [14]:
X_train_normal[:, :].std(axis=0)

array([1., 1., 1., 1.])

In [15]:
# SVM訓練
clf = svm.SVC()
clf.fit(X_train_normal, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
# 訓練データで予測
y_pred_train = clf.predict(X_train_normal)

# テストデータで予測
y_pred_test = clf.predict(X_test_normal)

In [17]:
# 訓練データの混合行列
confusion_matrix(y_train, y_pred_train)

array([[40,  0,  0],
       [ 0, 38,  3],
       [ 0,  1, 38]], dtype=int64)

In [18]:
# 訓練データの正解率
accuracy_score(y_train, y_pred_train)

0.9666666666666667

In [21]:
# 訓練データの適合率
precision_score(y_train, y_pred_train, average=None)

array([1.        , 0.97435897, 0.92682927])

In [22]:
precision_score(y_train, y_pred_train, average='macro')

0.9670627475505524

In [23]:
# 訓練データの適合率、再現率、F1値
print(classification_report(y_train, y_pred_train, target_names=['Setosa', 'Versicolour', 'Virginica']))

              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        40
 Versicolour       0.97      0.93      0.95        41
   Virginica       0.93      0.97      0.95        39

   micro avg       0.97      0.97      0.97       120
   macro avg       0.97      0.97      0.97       120
weighted avg       0.97      0.97      0.97       120



In [24]:
# テストデータでの評価
confusion_matrix(y_test, y_pred_test)

array([[10,  0,  0],
       [ 0,  9,  0],
       [ 0,  1, 10]], dtype=int64)

In [25]:
accuracy_score(y_test, y_pred_test)

0.9666666666666667

In [26]:
precision_score(y_test, y_pred_test, average=None)

array([1. , 0.9, 1. ])

In [27]:
precision_score(y_test, y_pred_test, average='macro')

0.9666666666666667

In [28]:
# テストデータの適合率、再現率、F1値
print(classification_report(y_test, y_pred_test, target_names=['Setosa', 'Versicolour', 'Virginica']))

              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        10
 Versicolour       0.90      1.00      0.95         9
   Virginica       1.00      0.91      0.95        11

   micro avg       0.97      0.97      0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [31]:
# テストデータの正解ラベルと予測値を確認
print(y_test.values.reshape(-1))
print(y_pred_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 1 0 2 2 2 2 2 0 0]
