In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
# NOTE: `plot_importance` is imported from both xgboost and lightgbm. The
# lightgbm import runs last, so the bare name refers to lightgbm's function
# and xgboost's version is shadowed.
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from lightgbm import plot_importance

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def get_clf_eval(y_test, pred):
    """Print the confusion matrix and headline classification metrics.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. Everything is written to stdout.

    Precision and recall are computed with scikit-learn's default
    averaging, while the F1 score is computed with ``average='macro'``.
    """
    matrix = confusion_matrix(y_test, pred)
    # Order matters: it must match the positional slots of the format string below.
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred, average='macro'),
    )
    print('오차 행렬')
    print(matrix)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, macro f1_score: {3:.4f}'.format(*scores))

In [None]:
# Load the training, validation and test splits from Drive.
# The training CSV uses its first column as the index; the 'ID' column is
# removed from the validation and test frames right after reading.
train_df = pd.read_csv('/content/drive/MyDrive/신용카드AI/ADASYN/train_adasyn1.csv', index_col=0)
valid_df = pd.read_csv('/content/drive/MyDrive/신용카드AI/val.csv').drop(columns=['ID'])
test_df = pd.read_csv('/content/drive/MyDrive/신용카드AI/test.csv').drop(columns=['ID'])
print(train_df.shape, valid_df.shape, test_df.shape)

# Fraction of rows with Class == 1 in each labelled split.
# NOTE(review): the first label says 'train_smote' although the file read above
# is train_adasyn1.csv — confirm which name is intended.
print('train_smote data 이상치 비율:', len(train_df[train_df['Class'] == 1]) / len(train_df))
print('valid data 이상치 비율:', len(valid_df[valid_df['Class'] == 1]) / len(valid_df))

(227503, 31) (28462, 31) (142503, 30)
train_smote data 이상치 비율: 0.5000065933196486
valid data 이상치 비율: 0.0010540369615627855


In [None]:
# Separate the features from the 'Class' label by column name rather than by
# position, so the label cannot end up among the features (and a real feature
# be dropped) if the CSV column order ever changes.
X_train, y_train = train_df.drop(columns=['Class']), train_df['Class']
X_valid, y_valid = valid_df.drop(columns=['Class']), valid_df['Class']

## 단일 모델(adaboost, xgboost, lgbm) 최적 파라미터 사용

In [None]:
# Base learners that are later combined by the voting ensembles.
base_model = DecisionTreeClassifier(max_depth=5, random_state=0)
# The weak learner is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
# (`base_estimator` was removed in 1.4), but it is the first positional
# parameter in both, so this call works across versions.
ada_clf = AdaBoostClassifier(base_model, n_estimators=100, random_state=42)
lgbm_clf = LGBMClassifier(learning_rate=0.01)
# NOTE(review): tree_method='gpu_hist' and gpu_id require a GPU runtime and are
# deprecated in XGBoost >= 2.0 (device='cuda' with tree_method='hist' is the
# replacement) — confirm against the installed XGBoost version before changing.
xgb_clf = XGBClassifier(
    n_estimators=2000,
    max_depth=6,
    subsample=0.5,
    learning_rate=0.1,
    objective='binary:logistic',
    tree_method='gpu_hist',
    gpu_id=0,
    random_state=0,
)

## Hard Voting

In [None]:
# Hard voting: the ensemble combines the three models with voting='hard'.
vo_clf = VotingClassifier(
    estimators=[
        ('ADA', ada_clf),
        ('LGBM', lgbm_clf),
        ('XGB', xgb_clf),
    ],
    voting='hard',
)
# Kept as the last expression so the fitted estimator's repr is displayed.
vo_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('ADA',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                                                       random_state=0),
                                                 n_estimators=100,
                                                 random_state=42)),
                             ('LGBM', LGBMClassifier(learning_rate=0.01)),
                             ('XGB',
                              XGBClassifier(gpu_id=0, max_depth=6,
                                            n_estimators=2000, subsample=0.5,
                                            tree_method='gpu_hist'))])

In [None]:
# Score the fitted ensemble on the validation split.
vo_pred_valid = vo_clf.predict(X_valid)
get_clf_eval(y_test=y_valid, pred=vo_pred_valid)

오차 행렬
[[28430     2]
 [    9    21]]
정확도: 0.9996, 정밀도: 0.9130, 재현율: 0.7000, macro f1_score: 0.8961


In [None]:
# Predict labels for the test split with the fitted ensemble.
# NOTE(review): assumes test_df holds only feature columns (no 'ID') at this
# point — confirm the cell execution order.
vo_pred_test = vo_clf.predict(test_df)

In [None]:
# Build the submission frame (ID + predicted Class).
# Only the 'ID' column is read back from disk, so `test_df` is no longer
# overwritten with a frame that contains the non-feature 'ID' column; cells
# that call vo_clf.predict(test_df) therefore stay re-runnable.
test_id = pd.read_csv('/content/drive/MyDrive/신용카드AI/test.csv', usecols=['ID'])['ID'].values.reshape(-1, 1)
# Assemble from named columns instead of np.concatenate, which forces the IDs
# and the predictions into one array with a single common dtype.
final_test_pred = pd.DataFrame({'ID': test_id.ravel(), 'Class': np.asarray(vo_pred_test).ravel()})
final_test_pred.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [None]:
# Write the submission file to Drive; index=False keeps the row index out of the CSV.
final_test_pred.to_csv('/content/drive/MyDrive/신용카드AI/ADASYN/test_pred_hard_voting.csv', index = False)

## Soft Voting

In [None]:
# Soft voting: same three models as before, combined with voting='soft'.
# Note that this rebinds `vo_clf`.
vo_clf = VotingClassifier(
    estimators=[
        ('ADA', ada_clf),
        ('LGBM', lgbm_clf),
        ('XGB', xgb_clf),
    ],
    voting='soft',
)
# Kept as the last expression so the fitted estimator's repr is displayed.
vo_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('ADA',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
                                                                                       random_state=0),
                                                 n_estimators=100,
                                                 random_state=42)),
                             ('LGBM', LGBMClassifier(learning_rate=0.01)),
                             ('XGB',
                              XGBClassifier(gpu_id=0, max_depth=6,
                                            n_estimators=2000, subsample=0.5,
                                            tree_method='gpu_hist'))],
                 voting='soft')

In [None]:
# Score the fitted ensemble on the validation split.
vo_pred_valid = vo_clf.predict(X_valid)
get_clf_eval(y_test=y_valid, pred=vo_pred_valid)

오차 행렬
[[28430     2]
 [    9    21]]
정확도: 0.9996, 정밀도: 0.9130, 재현율: 0.7000, macro f1_score: 0.8961


In [None]:
# Soft-voting test inference appears to have been disabled on purpose, but it
# was "commented out" with an opening ''' that is never closed, which makes the
# cell a SyntaxError. Plain # comments keep it disabled and the cell runnable.
# The 'ID' column is dropped defensively in case test_df was re-read with it.
# vo_pred_test = vo_clf.predict(test_df.drop(columns=['ID'], errors='ignore'))

In [None]:
# Disabled: builds the soft-voting submission frame (ID + predicted Class).
# The original cell started with an unclosed ''' (a SyntaxError); it is kept
# here as # comments so the cell stays disabled but valid. Any table shown as
# this cell's output comes from an earlier run.
# test_df = pd.read_csv('/content/drive/MyDrive/신용카드AI/test.csv')
# test_id = test_df['ID'].values.reshape(-1, 1)
# final_test_pred = pd.DataFrame(np.concatenate((test_id, vo_pred_test.reshape(-1, 1)), axis=1), columns=['ID', 'Class'])
# final_test_pred.head()

Unnamed: 0,ID,Class
0,AAAA0x1,0
1,AAAA0x2,0
2,AAAA0x5,0
3,AAAA0x7,0
4,AAAA0xc,0


In [None]:
# Disabled: writes the soft-voting submission file. The original cell started
# with an unclosed ''' (a SyntaxError); it is kept as a # comment instead.
# The original path read '.../ADASYNtest_pred_soft_voting.csv'; the missing '/'
# after 'ADASYN' is presumably a typo and is corrected below — confirm before
# re-enabling.
# final_test_pred.to_csv('/content/drive/MyDrive/신용카드AI/ADASYN/test_pred_soft_voting.csv', index = False)