In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_selection import *
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, accuracy_score,roc_auc_score


In [2]:
from google.colab import drive
drive.mount('/content/drive')

bankruptcy = '/content/drive/MyDrive/data.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# 데이터로드
df = pd.read_csv(bankruptcy)
df
x=df.drop(['budo'],axis=1)
y=df['budo']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True, stratify=y, random_state=42 )

In [4]:
# 클래스 불균형 확인
y_train.value_counts()

0    43250
1     2623
Name: budo, dtype: int64

# UnderSampling

In [5]:
from imblearn.under_sampling import NearMiss
# 2 는 모든 소수 클래스 샘플까지의 평균 거리 활용

NM_model = NearMiss(version = 2)

x_train_u, y_train_u = NM_model.fit_resample(x_train, y_train)
x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
y_train_u = pd.Series(y_train_u)


# NM_model = NearMiss(version = 2, sampling_strategy={1:y_train_u.value_counts().iloc[-1], 0:y_train_u.value_counts().iloc[-1]*5})

# x_train_u, y_train_u = NM_model.fit_resample(x_train, y_train)
# x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
# y_train_u = pd.Series(y_train_u)


In [6]:
y_train_u.value_counts()

0    2623
1    2623
Name: budo, dtype: int64

# 스케일링 없이

In [7]:
lgb = LGBMClassifier()
lgb.fit(x_train_u, y_train_u)
lgb.predict(x_test)
lgb.score(x_test, y_test)

0.12136317395727365

precision_score, recall_score, f1_score

In [8]:
pred = lgb.predict(x_test)
f1 = f1_score(y_test, pred)
recall = recall_score(y_test, pred)
pre = precision_score(y_test, pred)
print("f1 score : {:.3f}".format(f1))
print("recall score : {:.3f}".format(recall))
print("precision score : {:.3f}".format(pre))

f1 score : 0.109
recall score : 0.940
precision score : 0.058


# MinMax

In [9]:
# 데이터로드
df = pd.read_csv(bankruptcy)
df
x=df.drop(['budo'],axis=1)
y=df['budo']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True, stratify=y, random_state=42 )

In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_mms = scaler.transform(x_train)
x_test_mms = scaler.transform(x_test)



In [11]:
NM_model = NearMiss(version = 2)

x_train_u, y_train_u = NM_model.fit_resample(x_train_mms, y_train)
x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
y_train_u = pd.Series(y_train_u)


In [12]:
lgb.fit(x_train_u, y_train_u)
lgb.predict(x_test_mms)
lgb.score(x_test_mms, y_test)

0.21612410986775177

In [13]:
pred = lgb.predict(x_test_mms)
f1_MM = f1_score(y_test, pred)
recall_MM = recall_score(y_test, pred)
pre_MM = precision_score(y_test, pred)
print("f1 score : {:.3f}".format(f1_MM))
print("recall score : {:.3f}".format(recall_MM))
print("precision score : {:.3f}".format(pre_MM))

f1 score : 0.114
recall score : 0.878
precision score : 0.061


# Standard

In [14]:
# 데이터로드
df = pd.read_csv(bankruptcy)
df
x=df.drop(['budo'],axis=1)
y=df['budo']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True, stratify=y, random_state=42 )

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_train_st = scaler.transform(x_train)
x_test_st = scaler.transform(x_test)


In [16]:
NM_model = NearMiss(version = 2)

x_train_u, y_train_u = NM_model.fit_resample(x_train_st, y_train)
x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
y_train_u = pd.Series(y_train_u)

In [17]:
lgb.fit(x_train_u, y_train_u)
lgb.predict(x_test_st)
lgb.score(x_test_st, y_test)

0.206205493387589

In [18]:
pred = lgb.predict(x_test_mms)
f1_S = f1_score(y_test, pred)
recall_S = recall_score(y_test, pred)
pre_S = precision_score(y_test, pred)
print("f1 score : {:.3f}".format(f1_S))
print("recall score : {:.3f}".format(recall_S))
print("precision score : {:.3f}".format(pre_S))

f1 score : 0.105
recall score : 0.961
precision score : 0.056


# Robust

In [19]:
# 데이터로드
df = pd.read_csv(bankruptcy)
df
x=df.drop(['budo'],axis=1)
y=df['budo']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True, stratify=y, random_state=42 )

In [20]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(x_train)
x_train_rb = scaler.transform(x_train)
x_test_rb = scaler.transform(x_test)


In [21]:
NM_model = NearMiss(version = 2)

x_train_u, y_train_u = NM_model.fit_resample(x_train_rb, y_train)
x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
y_train_u = pd.Series(y_train_u)

In [22]:
lgb.fit(x_train_u, y_train_u)
lgb.predict(x_test_rb)
lgb.score(x_test_rb, y_test)

0.11414038657171922

In [23]:
pred = lgb.predict(x_test_rb)
f1_R = f1_score(y_test, pred)
recall_R = recall_score(y_test, pred)
pre_R = precision_score(y_test, pred)
print("f1 score : {:.3f}".format(f1_R))
print("recall score : {:.3f}".format(recall_R))
print("precision score : {:.3f}".format(pre_R))

f1 score : 0.106
recall score : 0.919
precision score : 0.056


# Normalizer

In [24]:
# 데이터로드
df = pd.read_csv(bankruptcy)
df
x=df.drop(['budo'],axis=1)
y=df['budo']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, shuffle=True, stratify=y, random_state=42 )

In [25]:
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
scaler.fit(x_train)
x_train_N = scaler.transform(x_train)
x_test_N = scaler.transform(x_test)


In [26]:
NM_model = NearMiss(version = 2)

x_train_u, y_train_u = NM_model.fit_resample(x_train_N, y_train)
x_train_u = pd.DataFrame(x_train_u, columns = x.columns)
y_train_u = pd.Series(y_train_u)

In [27]:
lgb.fit(x_train_u, y_train_u)
lgb.predict(x_test_N)
lgb.score(x_test_N, y_test)

0.14730417090539166

In [28]:
pred = lgb.predict(x_test_N)
f1_N = f1_score(y_test, pred)
recall_N = recall_score(y_test, pred)
pre_N = precision_score(y_test, pred)
print("f1 score : {:.3f}".format(f1_N))
print("recall score : {:.3f}".format(recall_N))
print("precision score : {:.3f}".format(pre_N))

f1 score : 0.111
recall score : 0.931
precision score : 0.059


# 총 정리


In [29]:
print("스케일링 없이")
print("f1 score : {:.3f}".format(f1))
print("recall score : {:.3f}".format(recall))
print("precision score : {:.3f}".format(pre))
print("")
print("MinMax Scaler")
print("f1 score : {:.3f}".format(f1_MM))
print("recall score : {:.3f}".format(recall_MM))
print("precision score : {:.3f}".format(pre_MM))
print("")
print("Standard Scaler")
print("f1 score : {:.3f}".format(f1_S))
print("recall score : {:.3f}".format(recall_S))
print("precision score : {:.3f}".format(pre_S))
print("")
print("Robust Scaler")
print("f1 score : {:.3f}".format(f1_R))
print("recall score : {:.3f}".format(recall_R))
print("precision score : {:.3f}".format(pre_R))
print("")
print("Normalizer")
print("f1 score : {:.3f}".format(f1_N))
print("recall score : {:.3f}".format(recall_N))
print("precision score : {:.3f}".format(pre_N))

스케일링 없이
f1 score : 0.109
recall score : 0.940
precision score : 0.058

MinMax Scaler
f1 score : 0.114
recall score : 0.878
precision score : 0.061

Standard Scaler
f1 score : 0.105
recall score : 0.961
precision score : 0.056

Robust Scaler
f1 score : 0.106
recall score : 0.919
precision score : 0.056

Normalizer
f1 score : 0.111
recall score : 0.931
precision score : 0.059
