In [27]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd

import graphviz
import pydotplus

import joblib

from sklearn import linear_model, tree, ensemble, model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, confusion_matrix

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("device:{}".format(device))

device:cuda


In [114]:
data_path = '../data/creditcard.csv'


eda_df = pd.read_csv(data_path) # 데이터 불러오기
eda_df.head(10) # 개수만큼 상위 데이터 출력

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [115]:
# 데이터 확인
print("데이터 정보:")
print(eda_df.info())

데이터 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V2

In [116]:
# 결측치 확인
print("결측치")
print(eda_df.isnull().sum()) # 결측이 있는 행이 true니까 결측치가 있는 행이 몇개인지
# 결측치 없음

결측치
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [117]:
# 중복 데이터 확인
print(eda_df.duplicated().sum())

# 중복 데이터 삭제
eda_df = eda_df.drop_duplicates()

print(eda_df.duplicated().sum())

1081
0


In [118]:
# 라벨 분리
label_df = eda_df['Class']
eda_df = eda_df.drop(columns=['Class', 'Time']) # 시간은 관계 없어 보이니 지운다

# 라벨 분포 확인
label_dict = label_df.value_counts()

print("클래스 분포:")
print(label_df.value_counts())

print('label shape: {}'.format(label_df.shape))

print('\nnum of negative label: {} / num of positive label: {}'.format(label_dict[0], label_dict[1])) # 0의 발생 횟수와 1의 발생횟수 count
print('% of negative label: {} / % of positive label: {}'.format(label_dict[0] / label_df.shape[0] * 100, label_dict[1] / label_df.shape[0] * 100)) # 퍼센테이지로 나타낸다.
print('sum:{}\n'.format(sum(label_df.to_numpy())))

클래스 분포:
Class
0    283253
1       473
Name: count, dtype: int64
label shape: (283726,)

num of negative label: 283253 / num of positive label: 473
% of negative label: 99.83328986416473 / % of positive label: 0.1667101358352777
sum:473



In [154]:
test_data_ratio = 0.1   # split 한 뒤에 비정상 데이터와 합칠 것이라서 일단은 적게 놓는다.
# 원래 0.2 -> 0.05 로 하니 Precision가 상승하고 ReCall이 많이 떨어짐 0.005로 하니 더 떨어짐 0.1이 그 나마 괜찮은것 같음 
random_state = 0

# anomaly detection 세팅을 위해 정상 데이터와 사기 데이터를 분리
eda_normal = eda_df[label_df == 0]
eda_unnormal = eda_df[label_df == 1]

# 학습 데이터는 정상 데이터로만 구성
train_data, test_data, train_label, test_label = train_test_split(eda_normal, label_df[label_df == 0], test_size=test_data_ratio, random_state=random_state)

# 테스트 데이터는 정상 + 사기 데이터로 구성
test_data_concat = pd.concat([test_data, eda_unnormal])
test_label_concat = pd.concat([test_label, label_df[label_df == 1]])

# 데이터 스케일링
# StandardScaler와 MinMaxScaler를 시도해보았는데, StandardScaler가 더 좋은 성능을 보여 사용.
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(train_data)
eda_unnormal_sacled = scaler.transform(test_data)
test_data_scaled = scaler.transform(test_data_concat)

scaler_min_max = MinMaxScaler()
train_data_scaled_minmax = scaler.fit_transform(train_data)
eda_unnormal_sacled_minmax = scaler.transform(test_data)
test_data_scaled_minmax = scaler.transform(test_data_concat)



print(f"Test data shape: {test_data_scaled.shape}")
print(f"Test label shape: {test_label_concat.values.shape}")

Test data shape: (28799, 29)
Test label shape: (28799,)


In [176]:
learning_rate = 5e-3    # 한번 학습을 시키는데 적용하는 정도

# 모델 정의
input_size = 29   # feature의 수
# 레이어 수를 변경해 보기, 큰 변화는 보이지 않음
encoder_hidden_size1 = 64
encoder_hidden_size2 = 32
encoder_hidden_size3 = 16
decoder_hidden_size1 = 16
decoder_hidden_size2 = 32
decoder_hidden_size3 = 64
# 드랍아웃 설정해 봤지만 변화 없음
dropout_p = 0.2
class AutoEncoder(nn.Module):
    def __init__(self, input_size, encoder_hidden_size1, encoder_hidden_size2, encoder_hidden_size3, decoder_hidden_size1, decoder_hidden_size2, decoder_hidden_size3):
        super(AutoEncoder, self).__init__()
        self.input_size = input_size
        self.encoder_hidden_size1 = encoder_hidden_size1
        self.encoder_hidden_size2 = encoder_hidden_size2
        self.encoder_hidden_size3 = encoder_hidden_size3
        self.decoder_hidden_size1 = decoder_hidden_size1
        self.decoder_hidden_size2 = decoder_hidden_size2
        self.decoder_hidden_size3 = decoder_hidden_size3

        # 인코더
        self.encoder = nn.Sequential(
            nn.Linear(input_size, encoder_hidden_size1),
            nn.ReLU(),
            nn.Linear(encoder_hidden_size1, encoder_hidden_size2),
            nn.ReLU(),
            nn.Linear(encoder_hidden_size2, encoder_hidden_size3),
            nn.ReLU(),
            
        )
        # 디코더
        self.decoder = nn.Sequential(
            nn.Linear(decoder_hidden_size1, decoder_hidden_size2),
            nn.ReLU(),
            nn.Linear(decoder_hidden_size2, decoder_hidden_size3),
            nn.ReLU(),
            nn.Linear(decoder_hidden_size3, input_size)
        )
    
    def forward(self, data):
        # data = self.prepare_input(data)
        encoder_result = self.encoder(data)
        decoder_result = self.decoder(encoder_result)

        return decoder_result
    
    def prepare_input(self, data):
        flattened_data = data.view(data.size(0), -1)

        return flattened_data


auto_encoder = AutoEncoder(input_size, encoder_hidden_size1, encoder_hidden_size2, encoder_hidden_size3, decoder_hidden_size1, decoder_hidden_size2, decoder_hidden_size3).to(device)
#  weight_decay=1e-5 -> 가중치 크기 제한, 큰 변화 없음
optimizer = torch.optim.Adam(auto_encoder.parameters(), lr=learning_rate, weight_decay=1e-5)

In [179]:

epochs = 5          # 딥러닝은 시간이 오래걸리니 일단 10, 10과 5가 별 차이가 없어서 5로 설정
batch_size = 32
criterion = nn.MSELoss(reduction='none')

class Trainer:
  def __init__(self, model, data_loader, optimizer, criterion, epochs, device):
    self.model = model
    self.data_loader = data_loader
    self.optimizer = optimizer
    self.criterion = criterion
    self.epochs = epochs
    self.device = device

  def train(self):
    self.model.train()
    
    for epoch in range(self.epochs):
      for data,_ in self.data_loader: 
        data = data[0].to(self.device)
        
        self.optimizer.zero_grad()  # 옵티마이저 초기화
        result = self.model(data)

        result, label = result.reshape(data.size(0), -1), data.reshape(data.size(0), -1)
        loss = self.criterion(result, label)

        loss = loss.mean()
        
        loss.backward()
        self.optimizer.step()
        

train_dataset = TensorDataset(torch.tensor(train_data_scaled, dtype=torch.float32), torch.tensor(train_label.values, dtype=torch.long))
test_dataset = TensorDataset(torch.tensor(test_data_scaled, dtype=torch.float32), torch.tensor(test_label_concat.values, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

auto_encoder_trainer = Trainer(auto_encoder, train_loader, optimizer, criterion, epochs, device)
auto_encoder_trainer.train()

In [182]:
num_visualization = 5
from sklearn.metrics import roc_curve, auc
class Tester:
  def __init__(self, model, data_loader, criterion, num_visualization, device, threshold=None):
    self.model = model
    self.data_loader = data_loader
    self.criterion = criterion
    self.num_visualization = num_visualization
    self.device = device
    self.reconstructed_data = []
    self.threshold = threshold
    
  def calculate_reconstruction_error(self, data):
    # 입력 데이터에 대한 재구성 오류를 계산.
    self.model.eval()
    with torch.no_grad():
        data = data.to(self.device)
        output = self.model(data)
        output = output.view(data.size(0), -1)
        data = data.view(data.size(0), -1)
        loss = self.criterion(output, data)
        # 형태 출력
        # 배치별로 평균 손실 반환
        return loss.mean(dim=1).cpu().numpy()
      
  def determine_threshold(self, errors):
      # 재구성 오류를 기반으로 임계값을 결정.
      mean = errors.mean()
      std = errors.std()
      threshold = mean + 3 * std  # 평균 + 3 * 표준편차
      return threshold     
    
  def test(self):
    # 테스트 데이터를 사용하여 모델의 성능을 평가.
    all_labels = []
    all_errors = []

    self.model.eval()
    with torch.no_grad():
        for data, labels in self.data_loader:
            data = data.to(self.device)
            labels = labels.numpy()
            errors = self.calculate_reconstruction_error(data)
            all_labels.extend(labels)
            all_errors.extend(errors)
            if len(labels) != len(errors):
              print("data is diff")

    all_labels = np.array(all_labels)
    all_errors = np.array(all_errors)
    # 임계값을 95 백분위수 기반으로 하게 되면 ReCall은 오르지만 Precision이 지나치게 떨어짐
    # self.threshold = np.percentile(all_errors, 98.6)
    
    # ROC Curve 계산
    fpr, tpr, thresholds = roc_curve(all_labels, all_errors)
    roc_auc = auc(fpr, tpr)

    # Youden's J statistic 계산
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    threshold = thresholds[optimal_idx]
    print(f"Optimal Threshold (ROC-based): {threshold}")
    
    # 임계값이 설정되지 않은 경우 계산
    if self.threshold is None:
        self.threshold = self.determine_threshold(all_errors)
        print(f'Calculated threshold: {self.threshold:.4f}')
    
    # 재구성 오류를 기반으로 이상치 예측
    predictions = (all_errors > self.threshold).astype(int)

    # 성능 지표 계산
    accuracy = accuracy_score(all_labels, predictions)
    precision = precision_score(all_labels, predictions, zero_division=0)
    recall = recall_score(all_labels, predictions, zero_division=0)
    f1 = f1_score(all_labels, predictions, zero_division=0)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')


auto_encoder_tester = Tester(auto_encoder, test_loader, criterion, num_visualization, device)
auto_encoder_tester.test()

# 전부 다 해봤지만 ReCall이 오르면 Precision이 떨어지고 Precision이 오르면 ReCall이 오릅니다.

Optimal Threshold (ROC-based): 2.3415164947509766
Calculated threshold: 17.9104
Accuracy: 0.9866
Precision: 0.7000
Recall: 0.3256
F1 Score: 0.4444
