In [1]:
import warnings
warnings.filterwarnings('ignore')
import glob
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from catboost import CatBoostClassifier

In [2]:
train = pd.read_csv('train.csv')
train = train.drop(['index'], axis=1)
train.fillna('NAN', inplace=True) 
train['occyp_type'].loc[(train.occyp_type == 'NAN')&(train.DAYS_EMPLOYED > 0)]='Unemployed'

train.fillna('NAN', inplace=True) 
train['occyp_type'].loc[(train.occyp_type == 'NAN')&(train.DAYS_EMPLOYED < 0)]='Missing'

In [3]:
test = pd.read_csv('test.csv')
test = test.drop(['index'], axis=1)
test.fillna('NAN', inplace=True)
test['occyp_type'].loc[(test.occyp_type == 'NAN')&(test.DAYS_EMPLOYED > 0)]='Unemployed'
test['occyp_type'].loc[(test.occyp_type == 'NAN')&(test.DAYS_EMPLOYED < 0)]='Missing'

submit = pd.read_csv('submission.csv')

In [4]:
object_col = []
for col in train.columns:
    if train[col].dtype == 'object':
        object_col.append(col)

In [5]:
enc = OneHotEncoder()
enc.fit(train.loc[:,object_col])


train_onehot_df = pd.DataFrame(enc.transform(train.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
train.drop(object_col, axis=1, inplace=True)
train = pd.concat([train, train_onehot_df], axis=1)

In [6]:
test_onehot_df = pd.DataFrame(enc.transform(test.loc[:,object_col]).toarray(), 
             columns=enc.get_feature_names(object_col))
test.drop(object_col, axis=1, inplace=True)
test = pd.concat([test, test_onehot_df], axis=1)

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))

In [8]:
random.seed(42)
lgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
           verbose=100)
    lgb_models[fold]=lgb
    print(f'================================================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.652377	valid_1's multi_logloss: 0.752522
[200]	training's multi_logloss: 0.56559	valid_1's multi_logloss: 0.737156
[300]	training's multi_logloss: 0.501921	valid_1's multi_logloss: 0.731843
Early stopping, best iteration is:
[348]	training's multi_logloss: 0.475822	valid_1's multi_logloss: 0.729304


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.646476	valid_1's multi_logloss: 0.764955
[200]	training's multi_logloss: 0.560614	valid_1's multi_logloss: 0.751681
[300]	training's multi_logloss: 0.497532	valid_1's multi_logloss: 0.748007
Early stopping, best iteration is:
[281]	training's multi_logloss: 0.508566	valid_1's multi_logloss: 0.747717


Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.652042	valid_1's multi_logloss: 0.758665
[200]	training's multi_logloss: 0.560771	valid_1's multi_logloss: 0.7

In [9]:
xgb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    params={
    'objective':'multi:softprob',
    'random_state':71,
    'n_estimators':1000
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric = 'mlogloss',early_stopping_rounds=30, verbose=100)
    xgb_models[fold]=model
    print(f'================================================================================\n\n')

[0]	validation_0-mlogloss:0.97436	validation_1-mlogloss:0.97758
[100]	validation_0-mlogloss:0.56747	validation_1-mlogloss:0.73381
[200]	validation_0-mlogloss:0.44779	validation_1-mlogloss:0.72234
[211]	validation_0-mlogloss:0.43764	validation_1-mlogloss:0.72251


[0]	validation_0-mlogloss:0.97304	validation_1-mlogloss:0.97925
[100]	validation_0-mlogloss:0.56072	validation_1-mlogloss:0.74882
[200]	validation_0-mlogloss:0.44335	validation_1-mlogloss:0.74027
[234]	validation_0-mlogloss:0.41432	validation_1-mlogloss:0.74270


[0]	validation_0-mlogloss:0.97442	validation_1-mlogloss:0.97858
[100]	validation_0-mlogloss:0.56273	validation_1-mlogloss:0.74339
[200]	validation_0-mlogloss:0.44721	validation_1-mlogloss:0.73763
[204]	validation_0-mlogloss:0.44292	validation_1-mlogloss:0.73735


[0]	validation_0-mlogloss:0.97382	validation_1-mlogloss:0.97803
[100]	validation_0-mlogloss:0.56483	validation_1-mlogloss:0.73918
[200]	validation_0-mlogloss:0.44943	validation_1-mlogloss:0.73279
[219]	valida

In [10]:
rf_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    params={'n_estimators':1000,'random_state':71,'criterion':'gini','verbose':1,'class_weight':'balanced','n_jobs':-1,'oob_score':True}
    model=RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    rf_models[fold]=model
    print(f'================================================================================\n\n')



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    6.9s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    8.3s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   10.0s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   11.7s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   13.6s finished






In [11]:
svc_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    params={'gamma':'auto','probability':True,'random_state':71,'class_weight':'balanced','kernel':'poly','shrinking':True,'verbose':True}
    svc_model=SVC(**params)
    model.fit(X_train, y_train)
    svc_models[fold]=model
    print(f'================================================================================\n\n')



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   13.2s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   13.2s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   12.5s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   12.3s finished






[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   16.4s finished






In [12]:
lda_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    
    model=LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    lda_models[fold]= model
    print(f'================================================================================\n\n')













In [18]:
cb_models={}
for fold in range(5):
    print(f'===================================={fold+1}============================================')
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values
    
    model=CatBoostClassifier(early_stopping_rounds=30, verbose=100)
    model.fit(X_train, y_train)
    cb_models[fold]= model
    print(f'================================================================================\n\n')

Learning rate set to 0.092455
0:	learn: 1.0464708	total: 180ms	remaining: 3m
100:	learn: 0.7747495	total: 1.27s	remaining: 11.3s
200:	learn: 0.7426507	total: 2.27s	remaining: 9.01s
300:	learn: 0.7149000	total: 3.29s	remaining: 7.64s
400:	learn: 0.6922723	total: 4.31s	remaining: 6.44s
500:	learn: 0.6716642	total: 5.3s	remaining: 5.28s
600:	learn: 0.6544617	total: 6.25s	remaining: 4.15s
700:	learn: 0.6378184	total: 7.2s	remaining: 3.07s
800:	learn: 0.6220550	total: 8.15s	remaining: 2.02s
900:	learn: 0.6075985	total: 9.12s	remaining: 1s
999:	learn: 0.5945231	total: 10.1s	remaining: 0us


Learning rate set to 0.092455
0:	learn: 1.0458229	total: 11ms	remaining: 11s
100:	learn: 0.7711958	total: 1.03s	remaining: 9.15s
200:	learn: 0.7389139	total: 1.99s	remaining: 7.9s
300:	learn: 0.7105477	total: 2.94s	remaining: 6.84s
400:	learn: 0.6873722	total: 3.89s	remaining: 5.81s
500:	learn: 0.6659828	total: 4.87s	remaining: 4.85s
600:	learn: 0.6474989	total: 5.82s	remaining: 3.86s
700:	learn: 0.632443

In [26]:
submit.iloc[:,1:]=0
for fold in range(5):
    submit.iloc[:,1:] += (lgb_models[fold].predict_proba(test)/20)
    submit.iloc[:,1:] += (xgb_models[fold].predict_proba(test)/20)
    submit.iloc[:,1:] += (rf_models[fold].predict_proba(test)/20)
    #submit.iloc[:,1:] += (svc_models[fold].predict_proba(test)/30)
    #submit.iloc[:,1:] += (lda_models[fold].predict_proba(test)/30)
    submit.iloc[:,1:] += (cb_models[fold].predict_proba(test)/20)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    2.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Paral

In [27]:
submit

Unnamed: 0,index,0,1,2
0,26457,0.053063,0.135377,0.811559
1,26458,0.259611,0.175174,0.565216
2,26459,0.061001,0.112262,0.826737
3,26460,0.110551,0.105501,0.783948
4,26461,0.082543,0.173000,0.744456
...,...,...,...,...
9995,36452,0.147239,0.252861,0.599900
9996,36453,0.189252,0.347640,0.463108
9997,36454,0.017484,0.069830,0.912685
9998,36455,0.171256,0.229883,0.598861


In [29]:
submit.to_csv('20210519_ensemble2.csv', index=False) #0.7044283413 모델이 중요!