# 야호!

In [1]:
from imblearn.over_sampling import KMeansSMOTE
import os
import pandas as pd
from glob import glob
from tqdm import tqdm
from tqdm.contrib import tzip
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

In [2]:
import os
from pathlib import Path

def data_path(address_file):
    data_path = str(Path(os.getcwd()).parent) + '/1. scraping/data/' + address_file
    return Path(data_path)

In [3]:
x = pd.read_csv(data_path('dataset_financial.csv'), index_col = 0).dropna().iloc[:,1:]
y = pd.read_csv(data_path('biz_status.csv'), index_col = 0).iloc[:,1:]

In [4]:
x['결산년도'] = x['결산년도'].str[:4].astype('int')
x = x[x['결산년도'] <= 2019]


y['폐업일자'] = y['폐업일자'].str[:4].astype('float')
y = y[['사업자등록번호', '폐업일자']]

db = pd.merge(x, y)
db['1년뒤_폐업여부'] = db['폐업일자']-1 <= db['결산년도']
db['2년뒤_폐업여부'] = db['폐업일자']-2 <= db['결산년도']
db['3년뒤_폐업여부'] = db['폐업일자']-3 <= db['결산년도']
db['4년뒤_폐업여부'] = db['폐업일자']-4 <= db['결산년도']
db['5년뒤_폐업여부'] = db['폐업일자']-5 <= db['결산년도']

db

Unnamed: 0,결산년도,총자산,자본금,자본총계,매출액,영업이익,당기순이익,사업자등록번호,폐업일자,1년뒤_폐업여부,2년뒤_폐업여부,3년뒤_폐업여부,4년뒤_폐업여부,5년뒤_폐업여부
0,2019,11322266.0,1410020.0,10670783.0,2311466.0,-26805.0,278091.0,221-81-48293,,False,False,False,False,False
1,2018,10791503.0,1410020.0,10392692.0,3042851.0,445686.0,400916.0,221-81-48293,,False,False,False,False,False
2,2017,10434592.0,1410020.0,9991776.0,2888489.0,121397.0,-1226084.0,221-81-48293,,False,False,False,False,False
3,2016,11630780.0,1410020.0,11217860.0,2246411.0,-115412.0,164513.0,221-81-48293,,False,False,False,False,False
4,2019,10000.0,10000.0,10000.0,0.0,0.0,0.0,507-87-01423,,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297563,2016,36382026.0,2000000.0,23714777.0,21237685.0,2740242.0,2322011.0,301-81-27032,,False,False,False,False,False
297564,2019,10142990.0,150000.0,5720398.0,83810901.0,-198279.0,-243281.0,315-81-32820,,False,False,False,False,False
297565,2018,11527759.0,150000.0,7763679.0,71428798.0,594055.0,387409.0,315-81-32820,,False,False,False,False,False
297566,2017,11928022.0,150000.0,7376269.0,62967410.0,428346.0,302429.0,315-81-32820,,False,False,False,False,False


In [5]:
db['5년뒤_폐업여부'].value_counts()

False    272166
True      25402
Name: 5년뒤_폐업여부, dtype: int64

In [6]:
X = db.iloc[:,:7]
X_1 = X[X['결산년도'] <= 2018]
X_2 = X[X['결산년도'] <= 2017]
X_3 = X[X['결산년도'] <= 2016]
X_4 = X[X['결산년도'] <= 2015]
X_5 = X[X['결산년도'] <= 2014]

y_1 = db.iloc[X_1.index,9]
y_2 = db.iloc[X_2.index,10]
y_3 = db.iloc[X_3.index,11]
y_4 = db.iloc[X_4.index,12]
y_5 = db.iloc[X_5.index,13]

In [7]:
# 모델 학습
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, average_precision_score
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

model = LGBMClassifier(n_estimators=1000)

NM_model = RandomUnderSampler()

X_1, y_1 = NM_model.fit_resample(X_1, y_1)
X_2, y_2 = NM_model.fit_resample(X_2, y_2)
X_3, y_3 = NM_model.fit_resample(X_3, y_3)
X_4, y_4 = NM_model.fit_resample(X_4, y_4)
X_5, y_5 = NM_model.fit_resample(X_5, y_5)

In [8]:
pipe = Pipeline([('model', model)])

In [9]:
def print_score(y_test, y_pred, y_pred_proba):
    print('ACC:', accuracy_score(y_test, y_pred))
    print('REC:', recall_score(y_test, y_pred))
    print('PRE:', precision_score(y_test, y_pred))
    print('F1:', f1_score(y_test, y_pred))
    print('AP:', average_precision_score(y_test, y_pred_proba))
    print('ROC_AUC:', roc_auc_score(y_test, y_pred_proba))
    
def make_score(model, X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    model.fit(X_train, y_train);
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]
    print_score(y_test, y_pred, y_pred_proba)
    return model

In [10]:
make_score(pipe, X_1, y_1)

ACC: 0.6509519492293744
REC: 0.6753731343283582
PRE: 0.631762652705061
F1: 0.6528403967538323
AP: 0.6862701609402112
ROC_AUC: 0.706813156440022


In [11]:
make_score(pipe, X_2, y_2)

ACC: 0.6483221476510067
REC: 0.6696185286103542
PRE: 0.6358344113842174
F1: 0.6522893165228931
AP: 0.6960243995944817
ROC_AUC: 0.7051381860646166


In [12]:
make_score(pipe, X_3, y_3)

ACC: 0.6720921000396983
REC: 0.6929444225463145
PRE: 0.6681870011402509
F1: 0.6803405572755419
AP: 0.7185106866457854
ROC_AUC: 0.7352217173832083


In [13]:
make_score(pipe, X_4, y_4)

ACC: 0.6573542736333434
REC: 0.698330804248862
PRE: 0.6434563758389261
F1: 0.6697715034201718
AP: 0.6910740579186172
ROC_AUC: 0.7199499382666217


In [14]:
make_score(pipe, X_5, y_5)

ACC: 0.6619385342789598
REC: 0.709601259181532
PRE: 0.6483700862895494
F1: 0.6776052104208417
AP: 0.7102336185772935
ROC_AUC: 0.7292038679592616


In [15]:
import pickle

pipe.fit(X_1, y_1)
with open('model_1.pkl','wb') as pickle_file: pickle.dump(pipe, pickle_file)

pipe.fit(X_2, y_2)
with open('model_2.pkl','wb') as pickle_file: pickle.dump(pipe, pickle_file)

pipe.fit(X_3, y_3)
with open('model_3.pkl','wb') as pickle_file: pickle.dump(pipe, pickle_file)

pipe.fit(X_4, y_4)
with open('model_4.pkl','wb') as pickle_file: pickle.dump(pipe, pickle_file)

pipe.fit(X_5, y_5)
with open('model_5.pkl','wb') as pickle_file: pickle.dump(pipe, pickle_file)

In [16]:
with open('model_1.pkl','rb') as pickle_file: model_1 = pickle.load(pickle_file)
with open('model_2.pkl','rb') as pickle_file: model_2 = pickle.load(pickle_file)
with open('model_3.pkl','rb') as pickle_file: model_3 = pickle.load(pickle_file)
with open('model_4.pkl','rb') as pickle_file: model_4 = pickle.load(pickle_file)
with open('model_5.pkl','rb') as pickle_file: model_5 = pickle.load(pickle_file)

In [17]:
X

Unnamed: 0,결산년도,총자산,자본금,자본총계,매출액,영업이익,당기순이익
0,2019,11322266.0,1410020.0,10670783.0,2311466.0,-26805.0,278091.0
1,2018,10791503.0,1410020.0,10392692.0,3042851.0,445686.0,400916.0
2,2017,10434592.0,1410020.0,9991776.0,2888489.0,121397.0,-1226084.0
3,2016,11630780.0,1410020.0,11217860.0,2246411.0,-115412.0,164513.0
4,2019,10000.0,10000.0,10000.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
297563,2016,36382026.0,2000000.0,23714777.0,21237685.0,2740242.0,2322011.0
297564,2019,10142990.0,150000.0,5720398.0,83810901.0,-198279.0,-243281.0
297565,2018,11527759.0,150000.0,7763679.0,71428798.0,594055.0,387409.0
297566,2017,11928022.0,150000.0,7376269.0,62967410.0,428346.0,302429.0


In [18]:
def print_proba(X_data):
      X_test = X_data.values.reshape(1,-1)
      y_pred_1 = model_1.predict_proba(X_test)[0][1]
      y_pred_2 = model_2.predict_proba(X_test)[0][1]
      y_pred_3 = model_3.predict_proba(X_test)[0][1]
      y_pred_4 = model_4.predict_proba(X_test)[0][1]
      y_pred_5 = model_5.predict_proba(X_test)[0][1]
      
      print(f'해당 기업의 예상 폐업률은 \n1년 내 {np.round(y_pred_1*100, 1)}%, \n2년 내 {np.round(y_pred_2*100, 1)}%, \n3년 내 {np.round(y_pred_3*100, 1)}%, \n4년 내 {np.round(y_pred_4*100, 1)}%, \n5년 내 {np.round(y_pred_5*100,1)}% 입니다.')
      if y_pred_1 > 0.5:            
            print(f'1년 내 폐업이 예상되오니 1년 미만 단기계약을 포함한 모든 계약에 신중하시기 바랍니다.')
      elif y_pred_2 > 0.5:
            print(f'2년 내 폐업이 예상되오니 1년 미만 단기계약을 제외한 1년 이상 장기계약에 신중하시기 바랍니다.')
      elif y_pred_3 > 0.5:
            print(f'3년 내 폐업이 예상되오니 2년 이상 장기계약에 신중하시기 바랍니다.')
      elif y_pred_4 > 0.5:
            print(f'4년 내 폐업이 예상되오니 3년 이상 장기계약에 신중하시기 바랍니다.')
      elif y_pred_5 > 0.5:
            print(f'5년 내 폐업이 예상되오니 4년 이상 장기계약에 신중하시기 바랍니다.')
      else :
            print(f'5년 내 폐업 가능성이 낮습니다. 안심하고 계약하셔도 좋습니다.')

In [19]:
for i in [X.iloc[15015]]:
    print_proba(i)

해당 기업의 예상 폐업률은 
1년 내 2.0%, 
2년 내 8.9%, 
3년 내 3.0%, 
4년 내 21.4%, 
5년 내 10.4% 입니다.
5년 내 폐업 가능성이 낮습니다. 안심하고 계약하셔도 좋습니다.
