In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Switch to the project folder on the mounted Drive; the relative './data/...' paths
# used in later cells resolve against this directory.
os.chdir('/content/drive/MyDrive/dacon_anomaly')

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [None]:
# Training split: drop the ID column and keep every other column as model input.
train_df = pd.read_csv('./data/train.csv')
train_x = train_df.drop(columns='ID')

# Validation split: 'Class' is the label; ID and label are removed from the inputs.
val_df = pd.read_csv('./data/val.csv')
val_y = val_df['Class']
val_x = val_df.drop(columns=['ID', 'Class'])

# Class balance of the validation labels. Counts are taken per explicit label
# (0 = normal, 1 = fraud) instead of unpacking value_counts(), whose order
# depends on which class happens to be more frequent.
val_class = val_df['Class']
val_normal = int((val_class == 0).sum())
val_fraud = int((val_class == 1).sum())

# scikit-learn defines `contamination` as the proportion of outliers in the
# data set, so divide by the total row count (not by the normal count).
val_contamination = val_fraud / len(val_class)

In [None]:
# Singular Value Decomposition

from sklearn.decomposition import TruncatedSVD

# Settings for the truncated SVD used to reduce the feature matrix.
n_components = 20
algorithm = 'randomized'
random_state = 42

svd = TruncatedSVD(
    n_components=n_components,
    algorithm=algorithm,
    random_state=random_state,
)

# Fit the decomposition on the training features only; the validation features
# are projected with the already-fitted components.
X_train_svd = pd.DataFrame(svd.fit_transform(train_x))
X_valid_svd = pd.DataFrame(svd.transform(val_x))

In [None]:
# LocalOutlierFactor 

from sklearn.neighbors import LocalOutlierFactor

# Hypothesis: fraud occurs in the training set at the same rate as in the
# validation set, so the validation-derived rate (roughly 0.001) is passed
# as the model's `contamination` parameter.
# novelty=True makes predict() available for scoring data not seen during fit.
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=val_contamination,
    novelty=True,
)
lof.fit(X_train_svd)

LocalOutlierFactor(contamination=0.0010551491277433877, novelty=True)

In [None]:
### evaluation : validation set

def get_pred_label(model_pred):
    """Convert 1 / -1 outlier-detector predictions into 0 / 1 class labels.

    Args:
        model_pred: Array-like of predictions as produced by the detector's
            ``predict`` in this notebook (1 or -1). NumPy arrays, pandas
            Series, and plain lists/tuples are all accepted.

    Returns:
        np.ndarray in which every 1 is replaced by 0 and every -1 is replaced
        by 1; any other value is passed through unchanged.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single bool
    # rather than an element-wise mask, which would leave every 1 unmapped.
    preds = np.asarray(model_pred)
    return np.where(preds == -1, 1, np.where(preds == 1, 0, preds))

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Evaluate the SVD + LOF model on the labelled validation split.
# predict() output (1 / -1) is remapped to class labels (0 / 1) before scoring.
val_pred = get_pred_label(lof.predict(X_valid_svd))
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.49950763170851803]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.00      0.00      0.00        30

    accuracy                           1.00     28462
   macro avg       0.50      0.50      0.50     28462
weighted avg       1.00      1.00      1.00     28462



In [None]:
# Test split: rows to be scored for the submission; only the ID column is dropped.
test_df = pd.read_csv('./data/test.csv')
test_x = test_df.drop(columns='ID')

In [None]:
# Project the test features with the SVD that was fitted on the training data.
X_test_svd = pd.DataFrame(svd.transform(test_x))

In [None]:
# Predict on the SVD-projected test rows and remap 1 / -1 to 0 / 1 labels.
# NOTE: `lof` is rebound further down to a model fitted on the raw features,
# so this cell must run before that one to use the SVD-based model.
test_pred = get_pred_label(lof.predict(X_test_svd))

In [None]:
# Load the sample submission file; its 'Class' column is filled with the
# test predictions before the file is saved.
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
# Store the 0/1 test predictions in the 'Class' column, then write the
# submission CSV without the DataFrame index.
submit['Class'] = test_pred
submit.to_csv('submit_SVDLocalOutlierFactor.csv', index=False)

In [None]:
# Second experiment: fit LOF directly on the un-reduced training features with
# a fixed contamination of 0.003.
# NOTE: this rebinds `lof`, replacing the SVD-based model used by earlier cells.
lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.003,
    novelty=True,
)
lof.fit(train_x)

LocalOutlierFactor(contamination=0.003, novelty=True)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Evaluate the LOF fitted on the raw (non-SVD) features against the validation labels.
# predict() output (1 / -1) is remapped to class labels (0 / 1) before scoring.
val_pred = get_pred_label(lof.predict(val_x))
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

  "X does not have valid feature names, but"


Validation F1 Score : [0.4989701973348355]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.00      0.00      0.00        30

    accuracy                           1.00     28462
   macro avg       0.50      0.50      0.50     28462
weighted avg       1.00      1.00      1.00     28462

