# 실험을 위한 기준 baseline

In [89]:
from urllib.request import urlretrieve

# Download the dataset archive from Google Drive and store it as ./open.zip.
# The cell's displayed value is urlretrieve's (filename, headers) tuple.
urlretrieve(
    url='https://drive.google.com/uc?export=download&id=1XLVFI_sK0smRVVuT8XU2s-M3lJT-68sN',
    filename='./open.zip',
)

('./open.zip', <http.client.HTTPMessage at 0x7f0387537f90>)

In [90]:
!unzip ./open.zip

Archive:  ./open.zip
replace open/train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: open/train.csv          
  inflating: open/sample_submission.csv  
  inflating: open/test.csv           


In [105]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate

In [120]:
# Load the train and test splits that were extracted into ./open/.
train, test = map(pd.read_csv, ('./open/train.csv', './open/test.csv'))

In [121]:
# Print column names, non-null counts and dtypes of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

## Data Preprocessing

In [122]:
# Replace every missing value with the literal string 'NAN' so that "missing"
# becomes its own category. In the training frame only `occyp_type` has
# missing entries (18286 of 26457 rows are non-null).
train = train.fillna('NAN')
test = test.fillna('NAN')

In [123]:
# Remove the `index` column from both splits before feature engineering.
train = train.drop(columns='index')
test = test.drop(columns='index')

In [124]:
# Categorical (object-dtype) columns to expand into one-hot indicator columns.
onehot_col = ['gender',
              'car',
              'reality',
              'income_type',
              'edu_type',
              'family_type',
              'house_type',
              'occyp_type']

# Fit the encoder on the training categories only.
# NOTE(review): `test` is not transformed in this cell; it has to be encoded
# with this same fitted `enc` before it can be passed to a trained model.
enc = OneHotEncoder()
enc.fit(train[onehot_col])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new name and fall back to the
# old one so the cell runs on both old and new scikit-learn versions.
if hasattr(enc, 'get_feature_names_out'):
    onehot_feature_names = enc.get_feature_names_out(onehot_col)
else:
    onehot_feature_names = enc.get_feature_names(onehot_col)

# Reuse train's index so the column-wise concat below aligns row-for-row even
# if `train` does not carry a default RangeIndex.
train_onehot_df = pd.DataFrame(
    enc.transform(train[onehot_col]).toarray(),
    columns=onehot_feature_names,
    index=train.index,
)

# Swap the raw categorical columns for their one-hot expansion.
train = pd.concat([train.drop(columns=onehot_col), train_onehot_df], axis=1)

In [128]:
# Print the training frame's schema again to inspect the one-hot encoded columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 57 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   child_num                               26457 non-null  int64  
 1   income_total                            26457 non-null  float64
 2   DAYS_BIRTH                              26457 non-null  int64  
 3   DAYS_EMPLOYED                           26457 non-null  int64  
 4   FLAG_MOBIL                              26457 non-null  int64  
 5   work_phone                              26457 non-null  int64  
 6   phone                                   26457 non-null  int64  
 7   email                                   26457 non-null  int64  
 8   family_size                             26457 non-null  float64
 9   begin_month                             26457 non-null  float64
 10  credit                                  26457 non-null  fl

In [125]:
# Separate the target column `credit` from the feature matrix.
y_train = train['credit']
X_train = train.drop(columns='credit')

## Modeling

In [126]:
# Cross-validate an untuned LightGBM classifier with scikit-learn's default
# splitting. Scores are negative log loss (closer to 0 is better); train-fold
# scores are returned as well so over-fitting can be gauged.
scores = cross_validate(
    estimator=LGBMClassifier(),
    X=X_train,
    y=y_train,
    scoring='neg_log_loss',
    return_train_score=True,
)
scores

{'fit_time': array([1.00157571, 1.02693868, 0.98590612, 0.98604298, 0.97848368]),
 'score_time': array([0.07377863, 0.0650115 , 0.06968117, 0.06406069, 0.06329632]),
 'test_score': array([-0.75942811, -0.76384856, -0.76256046, -0.76210299, -0.75800922]),
 'train_score': array([-0.68033952, -0.67795626, -0.6758315 , -0.67947138, -0.67699941])}

In [127]:
# Average validation score across folds (negative log loss).
scores['test_score'].mean()

-0.7611898663220819