In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def Predict_Grade(dataname):
    global test

    # 샘플 데이터
    df = pd.read_csv('train.csv')

    ############################## 전처리 ##############################
    
    # ID를 제외한 데이터
    df = df.drop('ID', axis = 1)

    # 대출기간 정리(문자열 삭제)
    df['대출기간'] = df['대출기간'].apply(lambda x: x.strip().split(' ')[0])

    # 근로기간 불명확값 삭제
    df.drop(df[df['근로기간'] == 'Unknown'].index, axis = 0, inplace = True)

    # 근로기간 공백삭제
    df['근로기간'] = df['근로기간'].apply(lambda x: x.strip().split(' ')[0])

    # 근로기간 10년이상 정리
    df.loc[df['근로기간'] == '10+', '근로기간'] = '10'
    df.loc[df['근로기간'] == '10+years', '근로기간'] = '10'

    # 근로기간 1년미만 정리
    df.loc[df['근로기간'] == '<', '근로기간'] = '0'
    df.loc[df['근로기간'] == '<1', '근로기간'] = '0'

    # 주택소유상태 ANY 삭제
    df.drop(df[df['주택소유상태'] == 'ANY'].index, axis = 0, inplace = True)

    # 컬럼명 수정
    df.rename(columns = {'부채_대비_소득_비율': '부채대비소득비'}, inplace = True)
    df.rename(columns = {'최근_2년간_연체_횟수':'2년내연체횟수'}, inplace = True)


    # 결측치 확인/삭제
    try:
        df.dropna(df, axis = 0, inplace = True)

    except:
        pass

    
    # 이상치 측정/제거
    def Delete_outlier(columns):
        try:
            # 사분위수 Q1(데이터의 25%), Q3(데이터의 75%)
            q1 = df[f'{columns}'].quantile(0.25)
            q3 = df[f'{columns}'].quantile(0.75)

            # IQR(InterQurtile Range, 4분위수 범위)
            iqr = q3 - q1

            # 상한선, 하한선
            up_b = q3 + 1.5 * iqr
            low_b = q1 - 1.5 * iqr

            # 이상치
            outlier_upper = df.loc[df[f'{columns}'] > up_b]
            outlier_lower = df.loc[df[f'{columns}'] < low_b]

            # 이상치 제거
            df.drop(outlier_upper.index, inplace = True)
            df.drop(outlier_lower.index, inplace = True)
            
        except:
            pass

    Delete_outlier('대출금액')
    Delete_outlier('연간소득')
    Delete_outlier('부채대비소득비')
    Delete_outlier('총상환원금')
    Delete_outlier('총상환이자')
    Delete_outlier('총연체금액')




    ############################## 머신 러닝 ##############################
    
    # 데이터 분할
    y = df['대출등급']
    x = pd.get_dummies(df.drop('대출등급', axis = 1))
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state= 111)
    
    # 표준화
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(x_train)
    x_test_std = scaler.transform(x_test)

    # 모델 트레이닝 1차
    lgb_clf = LGBMClassifier(num_classes =7, num_leaves=50, min_data_in_leaf=10, max_depth = 10)
    lgb_clf.fit(x_train_std, y_train)
    y_pred = lgb_clf.predict(x_test_std)

    # 모델 트레이닝 2차
    model = BaggingClassifier(base_estimator = lgb_clf, random_state = 111)
    model.fit(x_train_std, y_train)
    y_pred = model.predict(x_test_std)


    # 결과 출력
    test = {}
    Id = []
    for i in range(len(y_pred)):
        Id.append(f'TEST_{"0"*(5-len(str(i)))}{i}')
        
    test['ID'] = Id
    test['대출등급'] = list(y_pred)
    
    return pd.DataFrame(test)

In [6]:
Predict_Grade("train.csv")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 61240, number of used features: 36
[LightGBM] [Info] Start training from score -1.770402
[LightGBM] [Info] Start training from score -1.151385
[LightGBM] [Info] Start training from score -1.203918
[LightGBM] [Info] Start training from score -2.013720
[LightGBM] [Info] Start training from score -2.743112
[LightGBM] [Info] Start training from score -4.327994
[LightGBM] [Info] Start training from score -6.005276




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM] [Info] Number of data points in the train set: 61240, number of used features: 36
[LightGBM] [Info] Start training from score -1.759149
[LightGBM] [Info] Start training from score -1.165845
[LightGBM] [Info] Start training from score -1.199899
[LightGBM] [Info] Start training from score -1.999268
[LightGBM] [Info] Start training from score -2.752543
[LightGBM] [Info] Start training from score -4.339195
[LightGBM] [Info] Start training from score -5.922689
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1474
[LightGBM

Unnamed: 0,ID,대출등급
0,TEST_00000,C
1,TEST_00001,E
2,TEST_00002,A
3,TEST_00003,E
4,TEST_00004,E
...,...,...
15305,TEST_15305,D
15306,TEST_15306,A
15307,TEST_15307,D
15308,TEST_15308,B


In [9]:
pd.DataFrame(test).head(20)

Unnamed: 0,ID,대출등급
0,TEST_00000,C
1,TEST_00001,E
2,TEST_00002,A
3,TEST_00003,E
4,TEST_00004,E
5,TEST_00005,A
6,TEST_00006,D
7,TEST_00007,B
8,TEST_00008,A
9,TEST_00009,E
