### One Class Support Vector Machine

公式ドキュメント：https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html<br>

まず，OneClassSupportVectorMachineは異常検知の手法のうち教師データなしのものである．<br>
教師なし学習とは,データそのものが持つ構造を浮かび上がらせることで,似たデータ同士にグループ化して分類する手法である．<br>

詳細については以下のリンクをみてみるといいかもしれない<br>
[異常検知のための One Class SVM](https://qiita.com/kznx/items/434d98bf1a0e39327542)<br>
[One Class SVMを用いた異常検知](https://www.slideshare.net/YutoMori2/one-class-svm)<br>
[One Class Support Vector Machine(One Class SVM入門)](https://recruit.cct-inc.co.jp/tecblog/machine-learning/one-class-svm/)

#### はじめに

wineデータセットに対してOneClassSupportVectorMachineを使用する<br>
スケーリング:標準化(平均0, 分散１)<br>
今回は学習用データとテスト用データは以下の通りにする<br>
学習用：正常データ38個<br>
テスト用：正常データ10個, 異常データ10個<br>

目的<br>
正常データをclass_0としてOCSVMで学習し, FARとFRRで評価を行う．<br>

version.など<br>
python 3.7.7<br>
scikit-learn==0.23.2<br>
pandas==1.1.5<br>

In [37]:
# 必要なパッケージを用意
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# スケーリング
from sklearn import preprocessing

# モデル
from sklearn.svm import OneClassSVM

# warning ignore code
import warnings
warnings.filterwarnings('ignore')

# 表示範囲設定
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 100)

In [2]:
# データの用意: 詳細についてはdataset.ipynbを参照
from sklearn.datasets import load_wine
data_wine = load_wine()

df = pd.DataFrame(data_wine["data"],columns=data_wine["feature_names"])
df['target'] = data_wine['target']
df['target'] = df['target'].replace({0:'class_0', 1:'class_1', 2:'class_2'})

df_all = df.copy()

In [3]:
# target_nameに指定したラベルを正常データ
# それ以外を異常データとして分ける
# 今回はclass_2にあわせて, 学習用に正常データ38個, テスト用に正常データ10個と異常データ10個を使用する

def select_data(df_target, target_name):
    # target_nameで指定したものをdf1
    df1 = df_target[df_target['target'] == target_name]
    # target_nameで指定したもの以外をdf2
    df2 = df_target[df_target['target'] != target_name]
    
    # 正常データ(本人)
    x_normal = df1.drop('target', axis=1)
    y_normal = df1['target']
    
    # 異常データ(他人)
    x_anomaly = df2.drop('target', axis=1)
    y_anomaly = df2['target']
    
    # 
    # 今回は学習用データの数をそろえる為, train_seizeはclass_2の数からテスト用のデータ数１０個を引いたもの
    # train_size, test_seize:0.0~1.0の間で割合 or 個数
    x_train_no, x_test_no, y_train_no, y_test_no = train_test_split(x_normal, y_normal, train_size=38, test_size=10, random_state=0, shuffle=True)
    _x_train_ano, x_test_ano, _y_train_ano, y_test_ano = train_test_split(x_anomaly, y_anomaly, test_size=10, random_state=0, shuffle=True)
    
    return x_train_no, x_test_no, y_train_no, y_test_no, x_test_ano, y_test_ano

In [4]:
# class_0の部分はclass_1やclass_2でもOK
X_train_no, X_test_no, Y_train_no, Y_test_no, X_test_ano, Y_test_ano = select_data(df_all, 'class_0')

In [5]:
# 各データの確認

In [6]:
X_train_no.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0
10,14.1,2.16,2.3,18.0,105.0,2.95,3.32,0.22,2.38,5.75,1.25,3.17,1510.0
30,13.73,1.5,2.7,22.5,101.0,3.0,3.25,0.29,2.38,5.7,1.19,2.71,1285.0
41,13.41,3.84,2.12,18.8,90.0,2.45,2.68,0.27,1.48,4.28,0.91,3.0,1035.0
33,13.76,1.53,2.7,19.5,132.0,2.95,2.74,0.5,1.35,5.4,1.25,3.0,1235.0


In [7]:
X_test_no

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
26,13.39,1.77,2.62,16.1,93.0,2.85,2.94,0.34,1.45,4.8,0.92,3.22,1195.0
35,13.48,1.81,2.41,20.5,100.0,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920.0
43,13.24,3.98,2.29,17.5,103.0,2.64,2.63,0.32,1.66,4.36,0.82,3.0,680.0
28,13.87,1.9,2.8,19.4,107.0,2.95,2.97,0.37,1.76,4.5,1.25,3.4,915.0
11,14.12,1.48,2.32,16.8,95.0,2.2,2.43,0.26,1.57,5.0,1.17,2.82,1280.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
34,13.51,1.8,2.65,19.0,110.0,2.35,2.53,0.29,1.54,4.2,1.1,2.87,1095.0
46,14.38,3.59,2.28,16.0,102.0,3.25,3.17,0.27,2.19,4.9,1.04,3.44,1065.0
40,13.56,1.71,2.31,16.2,117.0,3.15,3.29,0.34,2.34,6.13,0.95,3.38,795.0
22,13.71,1.86,2.36,16.6,101.0,2.61,2.88,0.27,1.69,3.8,1.11,4.0,1035.0


In [8]:
Y_train_no

4     class_0
10    class_0
30    class_0
41    class_0
33    class_0
42    class_0
48    class_0
7     class_0
14    class_0
32    class_0
49    class_0
29    class_0
37    class_0
56    class_0
18    class_0
55    class_0
27    class_0
15    class_0
5     class_0
31    class_0
16    class_0
50    class_0
20    class_0
51    class_0
8     class_0
13    class_0
25    class_0
17    class_0
58    class_0
57    class_0
52    class_0
38    class_0
1     class_0
12    class_0
45    class_0
24    class_0
6     class_0
23    class_0
Name: target, dtype: object

In [9]:
Y_test_no

26    class_0
35    class_0
43    class_0
28    class_0
11    class_0
2     class_0
34    class_0
46    class_0
40    class_0
22    class_0
Name: target, dtype: object

In [10]:
X_test_ano

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
107,12.72,1.75,2.28,22.5,84.0,1.38,1.76,0.48,1.63,3.3,0.88,2.42,488.0
153,13.23,3.3,2.28,18.5,98.0,1.8,0.83,0.61,1.87,10.52,0.56,1.51,675.0
154,12.58,1.29,2.1,20.0,103.0,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640.0
67,12.37,1.17,1.92,19.6,78.0,2.11,2.0,0.27,1.04,4.68,1.12,3.48,510.0
156,13.84,4.12,2.38,19.5,89.0,1.8,0.83,0.48,1.56,9.01,0.57,1.64,480.0
81,12.72,1.81,2.2,18.8,86.0,2.2,2.53,0.26,1.77,3.9,1.16,3.14,714.0
66,13.11,1.01,1.7,15.0,78.0,2.98,3.18,0.26,2.28,5.3,1.12,3.18,502.0
69,12.21,1.19,1.75,16.8,151.0,1.85,1.28,0.14,2.5,2.85,1.28,3.07,718.0
104,12.51,1.73,1.98,20.5,85.0,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672.0
148,13.32,3.24,2.38,21.5,92.0,1.93,0.76,0.45,1.25,8.42,0.55,1.62,650.0


In [11]:
Y_test_ano

107    class_1
153    class_2
154    class_2
67     class_1
156    class_2
81     class_1
66     class_1
69     class_1
104    class_1
148    class_2
Name: target, dtype: object

#### 前処理：標準化（平均０, 分散１）

In [12]:
ss = preprocessing.StandardScaler()
ss.fit(X_train_no)
x_train_ss = ss.transform(X_train_no)
x_test_no_ss = ss.transform(X_test_no)
x_test_ano_ss = ss.transform(X_test_ano)

#### ここから学習していく

In [13]:
# モデル作成
clf = OneClassSVM(nu=0.1, kernel="rbf")
# 学習
clf.fit(x_train_ss)
# 予測
pred_train = clf.predict(x_train_ss)
pred_test_no = clf.predict(x_test_no_ss)
pred_test_ano = clf.predict(x_test_ano_ss)

正常データは１， 異常データは-１として返される

In [14]:
pred_train

array([-1,  1, -1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,
        1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
       -1, -1,  1, -1])

In [15]:
# 細かい数値がほしいならこちら： 上記の結果と比較するとわかるが0より大きい場合は正常，小さい場合は異常と判断される
clf.decision_function(x_train_ss)

array([-1.91096706e-04,  1.19073425e-02, -1.03343274e-04,  2.46490272e-04,
       -1.29095892e-04,  4.27166049e-04,  1.45282705e-01, -1.64782223e-04,
        2.95095258e-04,  1.83475811e-02,  6.81258521e-02,  6.54041757e-02,
        1.29338774e-04,  9.28463951e-02, -1.74766202e-04,  3.24728609e-03,
       -2.18984962e-04,  8.93715758e-03,  1.72645023e-01,  8.46077095e-02,
        1.75416235e-02, -3.34936853e-04, -4.98878401e-06,  2.02209614e-03,
       -9.94730713e-05,  1.72508348e-04, -5.95616636e-05,  5.58880015e-02,
       -3.95919791e-04,  1.71174364e-01, -4.23342166e-05, -2.52081104e-04,
        5.14124531e-04,  1.52212224e-01, -4.99215539e-06, -5.09578181e-05,
        4.60093715e-04, -1.74929393e-05])

In [16]:
pred_test_no

array([ 1,  1, -1,  1, -1, -1,  1, -1,  1,  1])

In [17]:
clf.decision_function(x_test_no_ss)

array([ 0.06613869,  0.17975882, -0.15624108,  0.07691753, -0.05164406,
       -0.01967554,  0.11144721, -0.11273372,  0.01040537,  0.03348188])

In [18]:
pred_test_ano

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [19]:
clf.decision_function(x_test_ano_ss)

array([-0.50410585, -0.52897753, -0.52879085, -0.43458496, -0.52798063,
       -0.15340394, -0.37411381, -0.52321153, -0.35162655, -0.5270271 ])

In [20]:
Y_test_ano

107    class_1
153    class_2
154    class_2
67     class_1
156    class_2
81     class_1
66     class_1
69     class_1
104    class_1
148    class_2
Name: target, dtype: object

In [22]:
# もし標準化しなかった場合
# モデル作成
clf2 = OneClassSVM(nu=0.1, kernel="rbf")
# 学習
clf2.fit(X_train_no)
# 予測
pred_train2 = clf.predict(X_train_no)
pred_test_no2 = clf.predict(X_test_no)
pred_test_no2  = clf.predict(X_test_ano)

In [23]:
pred_train2

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1])

In [24]:
pred_test_no2 

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [25]:
pred_test_no2 

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

In [None]:
# 全部偽物判定されてしまいアウト

#### 評価について

[混同行列(Confusion Matrix) とは 〜 2 値分類の機械学習のクラス分類について](https://qiita.com/TsutomuNakamura/items/a1a6a02cb9bb0dcbb37f)<br>

|  | 予測(Positive) | 予測（Negative） |
| :--- | :---: | :---: |
| 実際(Positive) | TP(True Positive) | FN(False Negative) |
| 実際(Negative) | FP(False Positive) | TN(True Negative) |<br>
<br>

・真陽性（TP: True Positive）: 実際のクラスがPositiveで予測もPositive（正解）<br>
・真陰性（TN: True Negative）: 実際のクラスがNegativeで予測もNegative（正解）<br>
・偽陽性（FP: False Positive）: 実際のクラスはNegativeで予測がPositive（不正解）<br>
・偽陰性（FN: False Negative）: 実際のクラスはPositiveで予測がNegative（不正解）<br>

他人受入率;誤受理率（False Acceptance Rate; FAR） ＝ FP / (TN + FP) =  FPR<br>
他人と判断すべきなのに本人と判断した<br>

本人拒否率;誤棄却率（False Rejection Rate; FRR） ＝ FN / (FN + TP) = FNR<br>
本人と判断すべきなのに他人と判断した<br>

In [26]:
# 評価用関数
def far_frr(normal_result, anomaly_result):
    tp = np.count_nonzero(normal_result == 1)
    fn = np.count_nonzero(normal_result == -1)
    fp = np.count_nonzero(anomaly_result == 1)
    tn = np.count_nonzero(anomaly_result == -1)
    re_accuracy = (tp + tn) / (tp + fn + fp + tn)
    re_far = fp / (tn + fp)
    re_frr = fn / (fn + tp)

    # accuracy = ((TP+TN)/(TP+FN+FP+TN))
    # print(accuracy)
    return re_far, re_frr, re_accuracy

In [27]:
far, frr, accuracy= far_frr(pred_test_no, pred_test_ano)

In [28]:
far

0.0

In [29]:
frr

0.4

In [30]:
accuracy

0.8

In [None]:
# 異常データはきちんと弾いたが一部正常データも弾いた

In [31]:
# プログレスバーの表示
from tqdm import tqdm

In [None]:
# ハイパーパラメータいじるならこんな感じ(ほんとはもっといい方法があったが気がするけどかんたんに試すならこれでいい気がする)

In [32]:
from sklearn.metrics import accuracy_score
def test(nu, gamma):
    # モデル作成
    clf = OneClassSVM(nu=nu, kernel="rbf", gamma=gamma)
    # 学習
    clf.fit(x_train_ss)
    # 予測
    pred_train = clf.predict(x_train_ss)
    pred_test_no = clf.predict(x_test_no_ss)
    pred_test_ano = clf.predict(x_test_ano_ss)

    far, frr, accuracy = far_frr(pred_test_no, pred_test_ano)
#     print(f'contamination={nu} and gamma={gamma}')
#     print(f'FAR:{far}\nFRR{frr}\nAccuracy{accuracy}\n')
    return far, frr, accuracy

In [33]:
contamination=0.1
lst=[]
for nu in tqdm(np.linspace(0.01, 1, 100)):
    for gamma in np.linspace(0.01, 1, 50):
        far, frr, accuracy = test(nu, gamma)
        lst.append([nu, gamma, far, frr, accuracy])

100%|██████████| 100/100 [00:02<00:00, 33.93it/s]


In [34]:
result = pd.DataFrame(lst)

In [35]:
result.columns = ['nu', 'gamma', 'far', 'frr', 'accuracy']

In [38]:
result

Unnamed: 0,nu,gamma,far,frr,accuracy
0,0.01,0.01,0.0,0.1,0.95
1,0.01,0.030204,0.0,0.1,0.95
2,0.01,0.050408,0.0,0.3,0.85
3,0.01,0.070612,0.0,0.3,0.85
4,0.01,0.090816,0.0,0.5,0.75
5,0.01,0.11102,0.0,0.5,0.75
6,0.01,0.131224,0.0,0.7,0.65
7,0.01,0.151429,0.0,0.8,0.6
8,0.01,0.171633,0.0,0.8,0.6
9,0.01,0.191837,0.0,0.8,0.6


In [45]:
# 結果を書き出したいとき
# result.to_csv('result.csv')