In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
def get_std_scaled(df=None):
    df_copy = df.copy()
    std_scaler = StandardScaler()
    df_copy = std_scaler.fit_transform(df_copy)
    return df_copy

def get_minmax_scaled(df=None):
    df_copy = df.copy()
    std_scaler = MinMaxScaler()
    df_copy = std_scaler.fit_transform(df_copy)
    return df_copy

In [5]:
def get_tsne(df=None, ncomponents=2):
    df_copy = np.copy(df)
    tsne = TSNE(n_components = ncomponents, learning_rate=100, n_iter=2000)
    df_copy = tsne.fit_transform(df_copy)
    return df_copy

In [6]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, f1_score: {3:.4f}'.format(accuracy, precision, recall, f1))

In [7]:
train_df = pd.read_csv('../EDA&FE/train.csv')

In [8]:
train_df.drop(['ID'], axis=1, inplace=True)

In [9]:
valid_df = pd.read_csv('../EDA&FE/val.csv')

In [10]:
valid_df.drop(['ID'], axis=1, inplace=True)

In [11]:
valid_df[valid_df['Class'] == 1].shape[0] / valid_df.shape[0]

0.0010540369615627855

## 기본 parameter를 사용한 LOF

In [12]:
lof_clf = LocalOutlierFactor(n_neighbors=30, novelty=True, contamination='auto')
lof_clf.fit(train_df)

LocalOutlierFactor(n_jobs=-1, n_neighbors=30, novelty=True)

In [13]:
valid_df.iloc[:, :-1].head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,-0.255991,-0.994878
1,0.962496,0.328461,-0.171479,2.109204,1.129566,1.696038,0.107712,0.521502,-1.191311,0.724396,...,0.143997,0.402492,-0.048508,-1.371866,0.390814,0.199964,0.016371,-0.014605,0.168937,-0.994784
2,1.145524,0.575068,0.194008,2.598192,-0.09221,-1.04443,0.531588,-0.241888,-0.896287,0.757952,...,0.011106,-0.119703,-0.07651,0.69132,0.633984,0.048741,-0.053192,0.016251,0.169496,-0.994502
3,0.92706,-0.323684,0.387585,0.544474,0.246787,1.650358,-0.427576,0.615371,0.226278,-0.225495,...,-0.040513,0.079359,0.096632,-0.992569,0.085096,0.377447,0.036096,-0.00596,0.331307,-0.994467
4,-3.005237,2.600138,1.483691,-2.418473,0.306326,-0.824575,2.065426,-1.829347,4.009259,6.051521,...,-0.852309,-0.181268,-0.163747,0.515821,0.136318,0.460054,-0.251259,-1.105751,-0.287012,-0.994373


In [14]:
valid_df.values

array([[-0.33826175,  1.11959338,  1.04436655, ..., -0.25599106,
        -0.99487776,  0.        ],
       [ 0.96249607,  0.32846103, -0.17147905, ...,  0.16893733,
        -0.99478377,  0.        ],
       [ 1.14552439,  0.57506797,  0.19400846, ...,  0.16949626,
        -0.99450182,  0.        ],
       ...,
       [-0.4469509 ,  1.30221237, -0.16858299, ...,  0.53797247,
         1.0347631 ,  0.        ],
       [ 2.03955977, -0.1752331 , -1.19682531, ..., -0.26996437,
         1.03492757,  0.        ],
       [-0.24044005,  0.53048251,  0.70251023, ..., -0.16767973,
         1.03497457,  0.        ]])

In [15]:
pred_val = lof_clf.predict(valid_df.iloc[:, :-1])
pred_val[:30]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [16]:
valid_df['Class'].head(30)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
Name: Class, dtype: int64

In [18]:
pred_val_edit = np.where(pred_val==1, 0, 1)

In [19]:
pred_val_edit[:30]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
print(accuracy_score(valid_df['Class'], pred_val_edit))
print(precision_score(valid_df['Class'], pred_val_edit))
print(recall_score(valid_df['Class'], pred_val_edit))
print(confusion_matrix(valid_df['Class'], pred_val_edit))

0.9552736982643525
0.005568814638027049
0.23333333333333334
[[27182  1250]
 [   23     7]]


In [21]:
f1 = f1_score(valid_df['Class'], pred_val_edit)
print('Default parameter를 사용한 f1 score: {}'.format(f1))

Default parameter를 사용한 f1 score: 0.010878010878010878


In [22]:
get_clf_eval(valid_df['Class'], pred_val_edit)

오차 행렬
[[27182  1250]
 [   23     7]]
정확도: 0.9553, 정밀도: 0.0056, 재현율: 0.2333, f1_score: 0.0109


## Scaling을 거친 뒤 PCA 를 통한 차원 축소 후 LOF 적용

In [10]:
train_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972
2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,-0.256131,-0.99496
3,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901
4,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.41043,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.9949,-0.994901


In [11]:
valid_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,V29,V30,Class
0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,-0.255991,-0.994878,0
1,0.962496,0.328461,-0.171479,2.109204,1.129566,1.696038,0.107712,0.521502,-1.191311,0.724396,...,0.402492,-0.048508,-1.371866,0.390814,0.199964,0.016371,-0.014605,0.168937,-0.994784,0
2,1.145524,0.575068,0.194008,2.598192,-0.09221,-1.04443,0.531588,-0.241888,-0.896287,0.757952,...,-0.119703,-0.07651,0.69132,0.633984,0.048741,-0.053192,0.016251,0.169496,-0.994502,0
3,0.92706,-0.323684,0.387585,0.544474,0.246787,1.650358,-0.427576,0.615371,0.226278,-0.225495,...,0.079359,0.096632,-0.992569,0.085096,0.377447,0.036096,-0.00596,0.331307,-0.994467,0
4,-3.005237,2.600138,1.483691,-2.418473,0.306326,-0.824575,2.065426,-1.829347,4.009259,6.051521,...,-0.181268,-0.163747,0.515821,0.136318,0.460054,-0.251259,-1.105751,-0.287012,-0.994373,0


In [12]:
std_scaler = StandardScaler()
train_std_scaled = std_scaler.fit_transform(train_df)

In [13]:
pca = PCA(n_components=6)
pca.fit(train_std_scaled)

PCA(n_components=6)

In [14]:
print(pca.explained_variance_ratio_)

[0.06513832 0.05646574 0.03714042 0.03564491 0.03501752 0.03454549]


In [17]:
train_std_scaled[:5, :]

array([[-0.69631739, -0.81248098,  1.17808885,  0.27179773, -0.36830933,
         1.3493081 ,  0.65251086,  0.21098825, -1.38080424,  0.18895482,
         0.61712589,  0.06359992,  0.71891324, -0.18242933,  2.56248084,
        -3.34488404,  1.36859077, -0.14742379, -2.77722481,  0.68841026,
         0.34520952,  1.06611136,  1.43183424, -1.13624638, -0.63322973,
        -0.28858626, -0.13796888, -0.16635528,  1.18856304, -1.99083856],
       [-0.49535792, -0.11296695,  1.19130541, -0.60817334, -0.00788567,
         0.93347629,  0.19254119,  0.32044434, -1.26429059, -0.05615561,
        -0.22154762,  0.17838038,  0.50830405, -0.31306164, -0.68902932,
        -1.2285273 , -0.85027688,  2.36489698, -1.51364446, -0.27188785,
        -0.14828014,  0.00728516, -0.29714757, -1.93890877,  1.2415625 ,
        -0.46080316,  0.15758692,  0.17399831,  0.14387631, -1.99083856],
       [-0.21842739,  0.58098215,  0.75581873, -0.11615442,  0.30749794,
        -0.02620571,  0.39068995,  0.22164921, -0

In [18]:
lof_clf = LocalOutlierFactor(n_neighbors=20, novelty=True)
lof_clf.fit(train_std_scaled)

LocalOutlierFactor(n_jobs=-1, novelty=True)

In [23]:
pre_val = lof_clf.predict(valid_df.iloc[:, :-1])

In [30]:
pre_val[:20]

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0])

In [26]:
pre_val = np.where(pre_val==1, 0, 1)

In [28]:
f1 = f1_score(valid_df['Class'], pre_val)
print('Default parameter를 사용한 f1 score: {}'.format(f1))

Default parameter를 사용한 f1 score: 0.0017052115528082704


## Scaling, TSNE 차원 축소 후 LOF 진행

In [13]:
train_minmax_scaled = get_minmax_scaled(df=train_df)

In [36]:
train_tsne = get_tsne(df=train_minmax_scaled, ncomponents=2)

KeyboardInterrupt: 

In [None]:
train_tsne.embedding_

In [None]:
train_tsne.feature_names_in

## GridSearchCV를 활용하여 최적 파라미터 찾기

In [14]:
params = {'n_neighbors' : [20, 30, 40, 50], 'p' : [2, 3, 4], 'contamination' : ['auto', 0.00001, 0.00005, 0.0001, 0.0005, 0.001]}

In [16]:
lof_clf = LocalOutlierFactor(novelty=True)
grid_cv = GridSearchCV(lof_clf, param_grid=params, scoring='f1_macro', cv=3, verbose=1)
grid_cv.fit(train_minmax_scaled)
print('최고 평균 정확도 수치: {0:.4f}'.format(grid_cv.best_score_))
print('최적 하이퍼 파라미터:', grid_cv.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


KeyboardInterrupt: 

In [None]:
scores_df = pd.DataFrame(gird_cv.cv_results_)
socres_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]

In [None]:
best_clf = grid_cv.best_estimator_
best_pred = best_clf.predict(valid_df.iloc[:, :-1])
best_pred_edit = np.where(best_pred == -1, 1, 0)
f1 = f1_score(valid_df['Class'], best_pred_edit)
print('최적의 파라미터를 사용한 f1 score: {0:.4f}'.format(f1))