In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import OneHotEncoder
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the competition files: training set, test set and the submission template.
DATA_DIR = 'dataset'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
submit = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [3]:
data = pd.concat([train, test], axis = 0) # stack train and test row-wise so both get identical preprocessing

In [4]:
data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36457 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          36457 non-null  int64  
 1   gender         36457 non-null  object 
 2   car            36457 non-null  object 
 3   reality        36457 non-null  object 
 4   child_num      36457 non-null  int64  
 5   income_total   36457 non-null  float64
 6   income_type    36457 non-null  object 
 7   edu_type       36457 non-null  object 
 8   family_type    36457 non-null  object 
 9   house_type     36457 non-null  object 
 10  DAYS_BIRTH     36457 non-null  int64  
 11  DAYS_EMPLOYED  36457 non-null  int64  
 12  FLAG_MOBIL     36457 non-null  int64  
 13  work_phone     36457 non-null  int64  
 14  phone          36457 non-null  int64  
 15  email          36457 non-null  int64  
 16  occyp_type     25134 non-null  object 
 17  family_size    36457 non-null  float64
 18  begin_m

In [6]:
data['occyp_type'].describe()

count        25134
unique          18
top       Laborers
freq          6211
Name: occyp_type, dtype: object

In [7]:
data.isnull().sum() # roughly 30% of occyp_type is missing

index                0
gender               0
car                  0
reality              0
child_num            0
income_total         0
income_type          0
edu_type             0
family_type          0
house_type           0
DAYS_BIRTH           0
DAYS_EMPLOYED        0
FLAG_MOBIL           0
work_phone           0
phone                0
email                0
occyp_type       11323
family_size          0
begin_month          0
credit           10000
dtype: int64

In [8]:
# Remove the row-id column and FLAG_MOBIL from the feature set.
data = data.drop(columns=['index', 'FLAG_MOBIL'])

In [9]:
# Fill missing occupations with the most frequent value ("Laborers").
# Assign the result back: calling fillna(..., inplace=True) on a column selection is a
# chained in-place update, which is deprecated and is a silent no-op under pandas Copy-on-Write.
data['occyp_type'] = data['occyp_type'].fillna("Laborers")

In [10]:
data.isnull().sum()

gender               0
car                  0
reality              0
child_num            0
income_total         0
income_type          0
edu_type             0
family_type          0
house_type           0
DAYS_BIRTH           0
DAYS_EMPLOYED        0
work_phone           0
phone                0
email                0
occyp_type           0
family_size          0
begin_month          0
credit           10000
dtype: int64

# 열 삭제

In [11]:
# Drop the contact flags plus gender and reality in a single call.
unused_columns = ['email', 'phone', 'work_phone', 'gender', 'reality']
data = data.drop(columns=unused_columns)

In [12]:
# Number of distinct values per column (NaN counted as a value), used to bucket the columns below.
unique_len = data.nunique(dropna=False)

In [13]:
unique_len

car                 2
child_num           9
income_total      265
income_type         5
edu_type            5
family_type         5
house_type          6
DAYS_BIRTH       7183
DAYS_EMPLOYED    3640
occyp_type         18
family_size        10
begin_month        61
credit              4
dtype: int64

In [14]:
# Bucket the columns by cardinality:
#   group_1: at most 2 distinct values
#   group_2: 3 to 18 distinct values
#   group_3: more than 18 distinct values
is_binary = unique_len <= 2
is_high_cardinality = unique_len > 18
group_1 = unique_len[is_binary].index
group_2 = unique_len[~is_binary & ~is_high_cardinality].index
group_3 = unique_len[is_high_cardinality].index

# group1

In [15]:
# Models cannot consume strings, so map the binary flag to 0/1.
# (gender and reality used to be encoded here too, but both columns were dropped above.)
data['car'] = data['car'].replace({'N': 0, 'Y': 1})
data[group_1]  # every group_1 column is numeric now

Unnamed: 0,car
0,0
1,0
2,1
3,0
4,1
...,...
9995,1
9996,1
9997,0
9998,1


# group2

In [16]:
# Cast the day/month counters to integers in one pass.
data = data.astype({
    'DAYS_BIRTH': 'int',
    'DAYS_EMPLOYED': 'int',
    'begin_month': 'int',
})

In [17]:
from sklearn import preprocessing

In [18]:
label_encoder = preprocessing.LabelEncoder() # encodes string categories as integer codes

In [19]:
data[group_2].nunique(dropna=False)  # distinct values per group_2 column (NaN counted)

child_num       9
income_type     5
edu_type        5
family_type     5
house_type      6
occyp_type     18
family_size    10
credit          4
dtype: int64

In [20]:
# Replace every string-valued categorical column with integer codes.
# The encoder is re-fitted per column, so each column gets its own code table.
for column in ['income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']:
    data[column] = label_encoder.fit_transform(data[column])

### ▲성능 안좋으면 각각 분리로 바꾸기

In [21]:
data[group_2] # every group_2 column is numeric now

Unnamed: 0,child_num,income_type,edu_type,family_type,house_type,occyp_type,family_size,credit
0,0,0,1,1,2,8,2.0,1.0
1,1,0,4,0,1,8,3.0,1.0
2,0,4,1,1,1,10,2.0,2.0
3,0,0,4,1,1,14,2.0,0.0
4,0,2,1,1,1,10,2.0,2.0
...,...,...,...,...,...,...,...,...
9995,0,4,2,1,1,0,2.0,
9996,0,4,4,0,1,8,2.0,
9997,0,4,4,1,1,11,2.0,
9998,0,0,4,1,1,8,2.0,


# group3

In [22]:
# Sign flip for columns that are stored as negative offsets.
def minus(x):
    """Return ``x`` with its sign flipped."""
    return -1 * x
# begin_month is stored as a negative offset; turn it into a positive count.
data['begin_month'] = data['begin_month'].map(minus)

In [23]:
# Age conversion
def days_to_age(x):
    """Convert a negative day offset into a positive age in years (365-day years)."""
    days_per_year = 365
    return -1 * x / days_per_year
# DAYS_BIRTH becomes an age in years.
data['DAYS_BIRTH'] = data['DAYS_BIRTH'].map(days_to_age)

In [24]:
'''def make_bin(array, n): #숫자로 변환하는 함수를 남은 변수에 적용
    array = - array #양수 값으로 변환
    _, bin_dividers = np.histogram(array, bins = n) #count는 사용하지 않을것이므로 _로 표시
    cut_categories = pd.cut(array, bin_dividers, labels = [i for i in range(n)], include_lowest=True) #구간별로 나눈다.
    bined_array = pd.factorize(cut_categories)[0] #정수 자료형으로 변환
    return bined_array'''

'def make_bin(array, n): #숫자로 변환하는 함수를 남은 변수에 적용\n    array = - array #양수 값으로 변환\n    _, bin_dividers = np.histogram(array, bins = n) #count는 사용하지 않을것이므로 _로 표시\n    cut_categories = pd.cut(array, bin_dividers, labels = [i for i in range(n)], include_lowest=True) #구간별로 나눈다.\n    bined_array = pd.factorize(cut_categories)[0] #정수 자료형으로 변환\n    return bined_array'

In [25]:
'''data['DAYS_BIRTH'] = make_bin(data['DAYS_BIRTH'], 6)
data['DAYS_EMPLOYED'] = make_bin(data['DAYS_EMPLOYED'], 6)
data['begin_month'] = make_bin(data['begin_month'], 6)
#모든 열에 적용'''

"data['DAYS_BIRTH'] = make_bin(data['DAYS_BIRTH'], 6)\ndata['DAYS_EMPLOYED'] = make_bin(data['DAYS_EMPLOYED'], 6)\ndata['begin_month'] = make_bin(data['begin_month'], 6)\n#모든 열에 적용"

In [26]:
data.dtypes

car                int64
child_num          int64
income_total     float64
income_type        int32
edu_type           int32
family_type        int32
house_type         int32
DAYS_BIRTH       float64
DAYS_EMPLOYED      int32
occyp_type         int32
family_size      float64
begin_month        int64
credit           float64
dtype: object

In [27]:
data.head()

Unnamed: 0,car,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,occyp_type,family_size,begin_month,credit
0,0,0,202500.0,0,1,1,2,38.079452,-4709,8,2.0,6,1.0
1,0,1,247500.0,0,4,0,1,31.178082,-1540,8,3.0,5,1.0
2,1,0,450000.0,4,1,1,1,52.293151,-4434,10,2.0,22,2.0
3,0,0,202500.0,0,4,1,1,41.336986,-2092,14,2.0,37,0.0
4,1,0,157500.0,2,1,1,1,41.19726,-2105,10,2.0,26,2.0


In [28]:
# Categorical columns that get one-hot encoded in the next cell.
object_col = [
    'car',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'occyp_type',
]

In [29]:
# One-hot encode the categorical columns and swap the encoded block into `data`.
enc = OneHotEncoder()
onehot_values = enc.fit_transform(data.loc[:, object_col]).toarray()

# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installations.
if hasattr(enc, 'get_feature_names_out'):
    onehot_columns = enc.get_feature_names_out(object_col)
else:
    onehot_columns = enc.get_feature_names(object_col)

data_onehot_df = pd.DataFrame(onehot_values, columns=onehot_columns)

# `data` still carries the duplicated row labels from the train/test concat, so reset the
# index before the column-wise concat; otherwise rows would be aligned by label.
data = pd.concat(
    [data.drop(columns=object_col).reset_index(drop=True), data_onehot_df],
    axis=1,
)

In [30]:
data.dtypes

child_num          int64
income_total     float64
DAYS_BIRTH       float64
DAYS_EMPLOYED      int32
family_size      float64
begin_month        int64
credit           float64
car_0            float64
car_1            float64
income_type_0    float64
income_type_1    float64
income_type_2    float64
income_type_3    float64
income_type_4    float64
edu_type_0       float64
edu_type_1       float64
edu_type_2       float64
edu_type_3       float64
edu_type_4       float64
family_type_0    float64
family_type_1    float64
family_type_2    float64
family_type_3    float64
family_type_4    float64
house_type_0     float64
house_type_1     float64
house_type_2     float64
house_type_3     float64
house_type_4     float64
house_type_5     float64
occyp_type_0     float64
occyp_type_1     float64
occyp_type_2     float64
occyp_type_3     float64
occyp_type_4     float64
occyp_type_5     float64
occyp_type_6     float64
occyp_type_7     float64
occyp_type_8     float64
occyp_type_9     float64


In [31]:
data.head()

Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month,credit,car_0,car_1,income_type_0,...,occyp_type_8,occyp_type_9,occyp_type_10,occyp_type_11,occyp_type_12,occyp_type_13,occyp_type_14,occyp_type_15,occyp_type_16,occyp_type_17
0,0,202500.0,38.079452,-4709,2.0,6,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,247500.0,31.178082,-1540,3.0,5,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,450000.0,52.293151,-4434,2.0,22,2.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,202500.0,41.336986,-2092,2.0,37,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,157500.0,41.19726,-2105,2.0,26,2.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# test, train 분리

In [32]:
# Split the test rows back out. Only test rows lack a `credit` label, so select on that
# instead of a hard-coded row count; copy() makes `test` an independent frame that can be
# modified later without SettingWithCopyWarning.
test = data[data['credit'].isna()].copy()

In [33]:
# Training rows are exactly the ones with a `credit` label; select on that instead of a
# hard-coded row count and copy so `train` is independent of `data`.
train = data[data['credit'].notna()].copy()

In [34]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 26457 to 36456
Data columns (total 48 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   child_num      10000 non-null  int64  
 1   income_total   10000 non-null  float64
 2   DAYS_BIRTH     10000 non-null  float64
 3   DAYS_EMPLOYED  10000 non-null  int32  
 4   family_size    10000 non-null  float64
 5   begin_month    10000 non-null  int64  
 6   credit         0 non-null      float64
 7   car_0          10000 non-null  float64
 8   car_1          10000 non-null  float64
 9   income_type_0  10000 non-null  float64
 10  income_type_1  10000 non-null  float64
 11  income_type_2  10000 non-null  float64
 12  income_type_3  10000 non-null  float64
 13  income_type_4  10000 non-null  float64
 14  edu_type_0     10000 non-null  float64
 15  edu_type_1     10000 non-null  float64
 16  edu_type_2     10000 non-null  float64
 17  edu_type_3     10000 non-null  float64
 18  ed

In [35]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 48 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   child_num      26457 non-null  int64  
 1   income_total   26457 non-null  float64
 2   DAYS_BIRTH     26457 non-null  float64
 3   DAYS_EMPLOYED  26457 non-null  int32  
 4   family_size    26457 non-null  float64
 5   begin_month    26457 non-null  int64  
 6   credit         26457 non-null  float64
 7   car_0          26457 non-null  float64
 8   car_1          26457 non-null  float64
 9   income_type_0  26457 non-null  float64
 10  income_type_1  26457 non-null  float64
 11  income_type_2  26457 non-null  float64
 12  income_type_3  26457 non-null  float64
 13  income_type_4  26457 non-null  float64
 14  edu_type_0     26457 non-null  float64
 15  edu_type_1     26457 non-null  float64
 16  edu_type_2     26457 non-null  float64
 17  edu_type_3     26457 non-null  float64
 18  edu_ty

# Training
- 데이터 분리는 StratifiedKFold를 사용하여 y값 분포가 비슷하도록 분리함. -> 7-fold
- lightgbm의 default parameter로 훈련.
- 100번 이상 개선이 없을 경우 중단 (early stopping).
- 7개의 fold를 각각 훈련하여 모델을 저장.

In [36]:
# Base estimator handed to GridSearchCV below.
model=LGBMClassifier()

In [37]:
# Search space for the GridSearchCV run over LGBMClassifier.
#
# Changes from the earlier version of this grid:
#   * 'early_stopping_round' removed: GridSearchCV calls fit() without an eval_set, and
#     LightGBM raises when early stopping is requested with no validation data, so every
#     candidate fit would fail.
#   * 'objective' is 'multiclass' instead of 'regression', matching the classifier and the
#     'multi_logloss' metric.
#
# NOTE(review): the full Cartesian product is 600,000 candidates (times the CV folds).
# Shrink the lists or switch to a randomized search before running this for real.
param_grid = {
    'boosting': ['gbdt'],
    'learning_rate': [0.3, 0.1, 0.03, 0.01],
    'max_depth': [-1, 10, 20, 30, 40, 50],
    'n_estimators': [1000, 1500],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1],  # row sampling
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1],  # column sampling
    # 'scale_pos_weight': [],  # (number of negative samples) / (number of positive samples)
    'lambda_l1': [0],  # default
    'min_data_in_leaf': [5, 10, 15, 20, 25],  # default: 20
    # 'min_gain_to_split': [],
    'max_cat_group': [30, 60, 90, 120],  # default: 64
    'objective': ['multiclass'],
    'metric': ['multi_logloss'],
    'num_leaves': [10, 20, 31, 80, 150],  # default: 31
    'max_bin': [5, 15, 63, 125, 255],  # TODO: understand the effect of this parameter
    'seed': [2020],
}

In [38]:
'''skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))'''

"skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)\nfolds=[]\nfor train_idx, valid_idx in skf.split(train, train['credit']):\n    folds.append((train_idx, valid_idx))"

In [39]:
# Shuffled 6-fold splitter used by the grid search (plain KFold, i.e. not stratified).
cv=KFold(n_splits=6, random_state=1, shuffle=True)

In [40]:
# scoring='f1' uses binary averaging and fails on the 3-class credit target. The
# competition is judged on log loss, so optimise the negated log loss directly.
gcv = GridSearchCV(model, param_grid=param_grid, cv=cv, scoring='neg_log_loss')  # add n_jobs=4 to parallelise

In [None]:
# Run the grid search on all labelled rows and report the winning configuration.
grid_features = train.drop(columns=['credit'])
grid_target = train['credit']
gcv.fit(grid_features, grid_target)
print('final params', gcv.best_params_)
print('final score', gcv.best_score_)



In [None]:
# Train one LightGBM model per stratified fold and keep them in `lgb_models` for the
# ensemble in the inference section.
#
# This cell used to be wrapped in a triple-quoted string that itself contained
# triple-quoted blocks, which made it a SyntaxError; it also relied on a `folds` list that
# was never built. It is now plain code and creates its own folds. The disabled
# normalisation experiments were left out.
import lightgbm  # local import for the logging callback; needs LightGBM >= 3.3

N_FOLDS = 7
random.seed(42)

fold_features = train.drop(columns=['credit'])
fold_target = train['credit']

# Stratification keeps the credit class distribution similar in every split.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

lgb_models = {}
for fold, (train_idx, valid_idx) in enumerate(skf.split(fold_features, fold_target)):
    print(f'===================================={fold+1}============================================')
    # Positional selection: the split indices are positions, not index labels.
    X_train = fold_features.iloc[train_idx].values
    X_valid = fold_features.iloc[valid_idx].values
    y_train = fold_target.iloc[train_idx].values
    y_valid = fold_target.iloc[valid_idx].values

    lgb = LGBMClassifier(boosting_type='gbdt',
                         learning_rate=0.03,  # 0.1 and 0.03 worked best
                         max_depth=30,
                         n_estimators=1500,
                         objective='multiclass',  # was 'regression', which contradicts multi_logloss
                         metric='multi_logloss',
                         is_training_metric=True,
                         num_leaves=150,  # important
                         feature_fraction=0.7,  # column sampling
                         bagging_fraction=0.7,  # row sampling
                         bagging_freq=2,
                         seed=2020,
                         # single source of truth for the patience; fit() previously passed a
                         # conflicting early_stopping_rounds=50
                         early_stopping_round=100,
                         min_data_in_leaf=5,  # important
                         tree_learner='feature',
                         extra_trees=False,
                         # categorical_feature='',  # list categorical columns here if needed
                         )
    lgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            callbacks=[lightgbm.log_evaluation(period=100)])  # log every 100 iterations

    lgb_models[fold] = lgb
    print('================================================================================\n\n')
    print(lgb.feature_importances_)

In [None]:
# The target column is empty for the test rows; remove it before inference.
# Rebinding instead of an in-place drop avoids SettingWithCopyWarning on a frame sliced
# from `data`, and errors='ignore' keeps the cell safe to re-run.
test = test.drop(columns=['credit'], errors='ignore')

# Test inference
- 각 fold를 훈련시킨 lightgbm model로 predict.
- 해당 대회는 logloss score를 겨루는 것이기 때문에 각 class의 probability를 얻어야함.
- 대부분의 머신러닝 모델에서 predict, predict_proba를 구분하여 사용함.
- predict는 class 출력을 해주고 predict_proba는 class별 probability를 출력해줌.
- predict_proba를 사용하여 예측한 확률을 7개 fold 모델에 대해 더한 뒤 평균내어 앙상블.

In [None]:
# Soft-voting ensemble: average the class probabilities predicted by every fold model.
# The fold count comes from `lgb_models` itself, and the averaged float matrix is assigned
# in one step rather than accumulated into zero-filled columns.
if not lgb_models:
    raise ValueError('lgb_models is empty - train the fold models before running inference')

fold_probas = [fold_model.predict_proba(test) for fold_model in lgb_models.values()]
proba_columns = submit.columns[1:]  # first column is the row id
submit[proba_columns] = np.mean(fold_probas, axis=0)

In [None]:
# Write the submission file without the pandas index column.
# NOTE(review): 0.7272812144 is presumably the score this submission achieved - confirm.
# NOTE(review): the 'submit/' directory must already exist, otherwise to_csv raises.
submit.to_csv('submit/Guk_202105011_test_submit_ensemble.csv', index=False) # 0.7272812144

In [None]:
submit.head(20)