<a href="https://colab.research.google.com/github/SanGyuk-Raccoon/DACON_1/blob/main/1_bassline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [13]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from lightgbm import LGBMClassifier
import xgboost

from sklearn.metrics import log_loss

In [4]:
# Load DATA
train = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/train.csv")

test = pd.read_csv("/content/drive/MyDrive/DACON/[월간 데이콘] 14. 신용카드 사용자 연체 예측 AI/test.csv")

In [49]:
# pre_processing
def pre_processing(DATA) :
  data = DATA

  ## Delete index
  data = data.drop('index', axis = 1)

  ## Delete FLAG_MOBIL : train + test에서 값이 1밖에 없음 -> 무의미함
  data = data.drop('FLAG_MOBIL', axis = 1)

  ## Delete occyp_type : 결측값이 있어서 삭제함
  data = data.drop('occyp_type', axis = 1)

  ## Mapping Categorical variable
  # 1. gender
  mapping_gender = {
      'F' : 0, # Female = 0
      'M' : 1  # Male = 1
  }
  data.gender = data.gender.map(mapping_gender)

  # 2. car
  mapping_car = {
      'N' : 0,
      'Y' : 1
  }
  data.car = data.car.map(mapping_car)

  # 3. reality
  mapping_reality = {
      'N' : 0,
      'Y' : 1
  }
  data.reality = data.reality.map(mapping_reality)

  # 4. edu_type : Ordinal variable
  mapping_edu_type = {
    'Lower secondary' : 0, # 중학교 미만
    'Secondary / secondary special' : 1, #중학교
    'Incomplete higher' : 2, # 고등학교 중퇴
    'Higher education' : 3, # 고등학교 졸업
    'Academic degree' : 4 # 학사 이상
  }
  data.edu_type = data.edu_type.map(mapping_edu_type)

  ## one-hot encoding
  var = ['income_type',
        'family_type',
        'house_type']

  onehot_encoder = OneHotEncoder()
  onehot_encoder.fit(data.loc[:, var])
  onehot_df = pd.DataFrame(onehot_encoder.transform(train.loc[:,var]).toarray(), 
              columns=onehot_encoder.get_feature_names(var))

  data.drop(var, inplace = True, axis = 1)
  data = pd.concat([data, onehot_df], axis = 1)

  ## child num. 5를 최대로 설정
  data.loc[data['child_num'] > 5,'child_num'] = 5

  ## family_size 7을 최대로 설정
  data.loc[data['family_size'] > 7,'family_size'] = 7

  ## DAYS_BIRTH -> 나이(월)
  data['age'] = -data['DAYS_BIRTH'] / (365 / 12)
  data.drop('DAYS_BIRTH', axis = 1, inplace = True)

  ## DAYS_EMPLOYES -> 경력(월)
  data['career'] = data['DAYS_EMPLOYED'].apply( lambda x : - x / (365 / 12) if x < 0 else 0)
  data.drop('DAYS_EMPLOYED', axis = 1, inplace = True)

  ## Begin_month -> 양수로
  data['begin_month'] = - data['begin_month']

  return data

In [65]:
# Split data set. 
target = train['credit']
train_data = train.drop('credit', axis = 1)
train_data = pre_processing(train_data)

X_train, X_valid, y_train, y_valid = train_test_split(train_data,
                                                      target,
                                                      test_size=0.2,
                                                      shuffle = True,
                                                      stratify=target,
                                                      random_state=2)

In [101]:
# lgb model
lgb_model = LGBMClassifier(n_estimators=1000)
lgb_model.fit(X_train, y_train, 
            eval_set=[(X_train, y_train), (X_valid, y_valid)], 
            early_stopping_rounds=30,
            verbose=100)

Training until validation scores don't improve for 30 rounds.
[100]	training's multi_logloss: 0.682004	valid_1's multi_logloss: 0.760092
[200]	training's multi_logloss: 0.608168	valid_1's multi_logloss: 0.74311
[300]	training's multi_logloss: 0.554609	valid_1's multi_logloss: 0.7361
[400]	training's multi_logloss: 0.507722	valid_1's multi_logloss: 0.734116
Early stopping, best iteration is:
[384]	training's multi_logloss: 0.514808	valid_1's multi_logloss: 0.733514


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [102]:
# Calculate log loss
prob_valid_lgb = lgb_model.predict_proba(X_valid)
log_loss(y_valid, prob_valid_lgb)

0.7335144371223611

In [82]:
# xgb model
dtrain = xgboost.DMatrix(data = X_train, label = y_train)
dvalid = xgboost.DMatrix(data = X_valid, label = y_valid)

eval_list = [(dtrain, 'train'),
             (dvalid, 'valid')]


params = {
    'max_depth' : 8,
    'eta' : 0.1,
    'objective' : 'multi:softprob',
    'eval_metric' : 'mlogloss',
    'early_stoppings' : 100,
    'num_class' : 3,
    'sub_sample' : 0.8
}
xgb_model = xgboost.train(params = params,
                      dtrain = dtrain,
                      evals = eval_list,
                      num_boost_round = 400,
                      verbose_eval = 10
                      )

[0]	train-mlogloss:1.05367	valid-mlogloss:1.05472
[10]	train-mlogloss:0.84678	valid-mlogloss:0.855115
[20]	train-mlogloss:0.793771	valid-mlogloss:0.809819
[30]	train-mlogloss:0.771931	valid-mlogloss:0.795368
[40]	train-mlogloss:0.75741	valid-mlogloss:0.787972
[50]	train-mlogloss:0.745758	valid-mlogloss:0.783371
[60]	train-mlogloss:0.734428	valid-mlogloss:0.778228
[70]	train-mlogloss:0.724647	valid-mlogloss:0.774352
[80]	train-mlogloss:0.715601	valid-mlogloss:0.771535
[90]	train-mlogloss:0.707355	valid-mlogloss:0.768948
[100]	train-mlogloss:0.699551	valid-mlogloss:0.766506
[110]	train-mlogloss:0.691598	valid-mlogloss:0.764024
[120]	train-mlogloss:0.683846	valid-mlogloss:0.76178
[130]	train-mlogloss:0.677422	valid-mlogloss:0.759867
[140]	train-mlogloss:0.669894	valid-mlogloss:0.757475
[150]	train-mlogloss:0.664267	valid-mlogloss:0.755968
[160]	train-mlogloss:0.658061	valid-mlogloss:0.754219
[170]	train-mlogloss:0.651056	valid-mlogloss:0.75259
[180]	train-mlogloss:0.644723	valid-mlogloss:

In [95]:
# Calculate log loss
prob_valid_xgb = xgb_model.predict(dvalid)
log_loss(y_valid, prob_valid_xgb)

0.732975275842656

In [99]:
# 

In [99]:
X_test = pre_processing(test)

In [104]:
prob_test_lgb = lgb_model.predict_proba(X_test)

In [100]:
dtest = xgboost.DMatrix(data = X_test)
prob_test_xgb = xgb_model.predict(dtest)