In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score

# pycaret으로 직업 예측 진행

# 데이터 전처리

In [None]:
# Read the labelled and unlabelled competition tables.
# NOTE(review): the file names suggest `occyp_type` was filled in by an earlier
# prediction step — confirm against the notebook that produced these CSVs.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_occpy_pred_final.csv',
        './data/test_occpy_pred_final.csv',
    )
)

In [None]:
# Feature engineering for the labelled table, done as one chain so the frame is
# only rebound once every derived column has been computed.
train = (
    train
    .assign(
        # Two-valued text flags become 0/1 integers.
        gender=lambda frame: frame['gender'].replace({'F': 0, 'M': 1}),
        car=lambda frame: frame['car'].replace({'N': 0, 'Y': 1}),
        reality=lambda frame: frame['reality'].replace({'N': 0, 'Y': 1}),
        # Whole years from DAYS_BIRTH (negated first, so presumably stored as
        # negative day offsets — confirm against the data dictionary).
        age=lambda frame: (-frame['DAYS_BIRTH']) // 365,
        # Flip the sign, then floor at 0: anything negative after the flip is
        # treated as "no employment days".
        DAYS_EMPLOYED=lambda frame: (-frame['DAYS_EMPLOYED']).clip(lower=0),
        begin_month=lambda frame: -frame['begin_month'],
    )
    # 'Unnamed: 0' is the saved row index; DAYS_BIRTH is superseded by `age`.
    .drop(columns=['Unnamed: 0', 'DAYS_BIRTH'])
)
train.head(3)

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,age
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,4709,0,0,0,Accountants,2.0,6.0,1.0,38
1,0,0,1,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,1540,0,0,1,Laborers,3.0,5.0,1.0,31
2,1,1,1,0,450000.0,Working,Higher education,Married,House / apartment,4434,0,1,0,Managers,2.0,22.0,2.0,52


In [None]:
# Same feature engineering as for the labelled table, applied to the unlabelled
# table in one chain so `test` is only rebound once all columns are derived.
test = (
    test
    .assign(
        # Two-valued text flags become 0/1 integers.
        gender=lambda frame: frame['gender'].replace({'F': 0, 'M': 1}),
        car=lambda frame: frame['car'].replace({'N': 0, 'Y': 1}),
        reality=lambda frame: frame['reality'].replace({'N': 0, 'Y': 1}),
        # Whole years from DAYS_BIRTH (negated first, so presumably stored as
        # negative day offsets — confirm against the data dictionary).
        age=lambda frame: (-frame['DAYS_BIRTH']) // 365,
        # Flip the sign, then floor at 0: anything negative after the flip is
        # treated as "no employment days".
        DAYS_EMPLOYED=lambda frame: (-frame['DAYS_EMPLOYED']).clip(lower=0),
        begin_month=lambda frame: -frame['begin_month'],
    )
    # 'Unnamed: 0' is the saved row index; DAYS_BIRTH is superseded by `age`.
    .drop(columns=['Unnamed: 0', 'DAYS_BIRTH'])
)
test.head(3)

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age
0,1,1,0,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,0,0,1,0,Security staff,2.0,60.0,60
1,0,0,1,0,135000.0,State servant,Higher education,Married,House / apartment,8671,0,1,0,Core staff,2.0,36.0,51
2,0,0,1,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,217,1,1,0,Laborers,2.0,40.0,43


In [None]:
# Sanity check of row/column counts after preprocessing; `train` has one more
# column than `test` because it still carries the `credit` target.
print(train.shape, test.shape)

(26457, 18) (10000, 17)


In [None]:
# Separate the feature matrix from the `credit` target column.
X = train.drop(columns=['credit'])
y = train['credit']

In [None]:
# Hold out 20% of the labelled rows for validation.
# random_state pins the shuffle so every logloss reported below can be reproduced
# after Restart & Run All; stratify=y keeps the credit-class proportions the same
# in both splits, so the hold-out logloss is not skewed by an unlucky draw.
# Note: X_test / y_test are this validation split, not the competition `test` table.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Column groups: continuous/count features are standardised, text categories are
# one-hot encoded, and every remaining column (the 0/1 flags) passes through as-is.
numeric_features = [
    'child_num',
    'income_total',
    'DAYS_EMPLOYED',
    'family_size',
    'begin_month',
    'age',
]
categorical_features = [
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'occyp_type',
]

numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category not seen during fit is encoded as all zeros
# instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

In [None]:
# Learn scaling statistics and one-hot categories from the training split only,
# then apply that same fitted transform to the hold-out split.
scaled_X_train = preprocessor.fit_transform(X_train)
scaled_X_test = preprocessor.transform(X_test)

## scaled_X_train, y_train 으로 학습
## scaled_X_test로 예측하고, y_test와 비교하여 검증(logloss) 평가

In [None]:
# Cast the training labels to integers before fitting; the `credit` column is
# shown as floats (1.0, 2.0) in the preview of `train`.
# NOTE(review): presumably k_categorical expects integer class indices — confirm
# against the NGBoost docs. y_test is left as-is and only used for scoring.
y_train = y_train.astype(int)

In [None]:
from ngboost import NGBClassifier
from ngboost.distns import k_categorical

### Try 1

In [None]:
# Try 1: short schedule (100 boosting rounds, learning rate 0.01) with random
# row and column sub-sampling on every round.
# Because minibatch_frac / col_sample draw random subsets, the fit is stochastic;
# random_state pins those draws so the validation logloss is repeatable.
ngb = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
    n_estimators=100,
    learning_rate=0.01,
    minibatch_frac=0.5,  # fraction of rows sampled per boosting round
    col_sample=0.5,      # fraction of columns sampled per boosting round
    random_state=42,
    verbose=False,
)
ngb.fit(scaled_X_train, y_train)

NGBClassifier(col_sample=0.5, minibatch_frac=0.5, n_estimators=100,
              random_state=RandomState(MT19937) at 0x105BC7B40, verbose=False)

In [None]:
# Predict class probabilities for the validation (hold-out) split
predictions = ngb.predict_proba(scaled_X_test)

In [None]:
# Multi-class log loss of the Try 1 probabilities against the hold-out labels
logloss = log_loss(y_true=y_test, y_pred=predictions)
print(f"logloss: {logloss}")

logloss: 0.8233832643811121


### Try 2

In [None]:
# Try 2: 100 boosting rounds at learning rate 0.01, no sub-sampling arguments.
ngb1 = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
    n_estimators=100,
    learning_rate=0.01,
    verbose=False,
)
ngb1.fit(scaled_X_train, y_train)

NGBClassifier(n_estimators=100,
              random_state=RandomState(MT19937) at 0x105BC7B40, verbose=False)

In [None]:
# Class probabilities for the validation (hold-out) split
predictions1 = ngb1.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
logloss1 = log_loss(y_true=y_test, y_pred=predictions1)
print(f"logloss: {logloss1}")

logloss: 0.8100485926366492


### Try 3

In [None]:
# Try 3: library defaults for every hyperparameter; only the 3-class output
# distribution is specified and progress logging is silenced.
ngb2 = NGBClassifier(
    Dist=k_categorical(3),
    verbose=False,
)
ngb2.fit(scaled_X_train, y_train)

NGBClassifier(random_state=RandomState(MT19937) at 0x105BC7B40, verbose=False)

In [None]:
# Class probabilities for the validation (hold-out) split
predictions2 = ngb2.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
logloss2 = log_loss(y_true=y_test, y_pred=predictions2)
print(f"logloss: {logloss2}")

logloss: 0.801059189602322


### Try 4

In [None]:
# Try 4: 1000 boosting rounds, everything else left at its default.
# No validation data is passed to fit(), which is why the training log reports
# val_loss=0.0000 on every line.
ngb3 = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
    n_estimators=1000,
)
ngb3.fit(scaled_X_train, y_train)

[iter 0] loss=0.8830 val_loss=0.0000 scale=2.0000 norm=6.8284
[iter 100] loss=0.8095 val_loss=0.0000 scale=1.0000 norm=3.3004
[iter 200] loss=0.8014 val_loss=0.0000 scale=2.0000 norm=6.6034
[iter 300] loss=0.7979 val_loss=0.0000 scale=1.0000 norm=3.3043
[iter 400] loss=0.7961 val_loss=0.0000 scale=1.0000 norm=3.3078
[iter 500] loss=0.7952 val_loss=0.0000 scale=0.5000 norm=1.6550
[iter 600] loss=0.7947 val_loss=0.0000 scale=0.5000 norm=1.6553
[iter 700] loss=0.7942 val_loss=0.0000 scale=0.5000 norm=1.6555
[iter 800] loss=0.7939 val_loss=0.0000 scale=1.0000 norm=3.3113
[iter 900] loss=0.7937 val_loss=0.0000 scale=0.2500 norm=0.8279


NGBClassifier(n_estimators=1000,
              random_state=RandomState(MT19937) at 0x105BC7B40)

In [None]:
# Class probabilities for the validation (hold-out) split
predictions3 = ngb3.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
logloss3 = log_loss(y_true=y_test, y_pred=predictions3)
print(f"logloss: {logloss3}")

logloss: 0.8021348717806368


### Try 5

In [None]:
# Try 5: learning rate raised to 0.1, number of boosting rounds left at default.
ngb4 = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
    learning_rate=0.1,
)
ngb4.fit(scaled_X_train, y_train)

[iter 0] loss=0.8830 val_loss=0.0000 scale=2.0000 norm=6.8284
[iter 100] loss=0.7894 val_loss=0.0000 scale=0.5000 norm=1.6524
[iter 200] loss=0.7863 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 300] loss=0.7863 val_loss=0.0000 scale=0.0156 norm=0.0516
[iter 400] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032


NGBClassifier(learning_rate=0.1,
              random_state=RandomState(MT19937) at 0x105BC7B40)

In [None]:
# Class probabilities for the validation (hold-out) split
predictions4 = ngb4.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
logloss4 = log_loss(y_true=y_test, y_pred=predictions4)
print(f"logloss: {logloss4}")

logloss: 0.8023826809693817


### Try 6

In [None]:
# Try 6: learning rate 0.1 combined with 1000 boosting rounds.
ngb5 = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
    learning_rate=0.1,
    n_estimators=1000,
)
ngb5.fit(scaled_X_train, y_train)

[iter 0] loss=0.8830 val_loss=0.0000 scale=2.0000 norm=6.8284
[iter 100] loss=0.7894 val_loss=0.0000 scale=0.5000 norm=1.6524
[iter 200] loss=0.7863 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 300] loss=0.7863 val_loss=0.0000 scale=0.0156 norm=0.0516
[iter 400] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 500] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 600] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 700] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 800] loss=0.7862 val_loss=0.0000 scale=0.0010 norm=0.0032
[iter 900] loss=0.7861 val_loss=0.0000 scale=0.0010 norm=0.0032


NGBClassifier(learning_rate=0.1, n_estimators=1000,
              random_state=RandomState(MT19937) at 0x105BC7B40)

In [None]:
# Class probabilities for the validation (hold-out) split
predictions5 = ngb5.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
logloss5 = log_loss(y_true=y_test, y_pred=predictions5)
print(f"logloss: {logloss5}")

logloss: 0.8025547981181805


## 결론

## 하이퍼파라미터 디폴트값이 가장 성능이 좋게 나옴

In [None]:
# Final model: default hyperparameters, which gave the lowest validation logloss
# among the tries above. Only the 3-class output distribution is specified.
ngb_model = NGBClassifier(
    Dist=k_categorical(3),  # tell ngboost that there are 3 possible outcomes
)
ngb_model.fit(scaled_X_train, y_train)

[iter 0] loss=0.8830 val_loss=0.0000 scale=2.0000 norm=6.8284
[iter 100] loss=0.8095 val_loss=0.0000 scale=1.0000 norm=3.3004
[iter 200] loss=0.8014 val_loss=0.0000 scale=2.0000 norm=6.6034
[iter 300] loss=0.7979 val_loss=0.0000 scale=1.0000 norm=3.3043
[iter 400] loss=0.7961 val_loss=0.0000 scale=1.0000 norm=3.3078


NGBClassifier(random_state=RandomState(MT19937) at 0x105BC7B40)

In [None]:
# Class probabilities of the final model for the validation (hold-out) split
pred = ngb_model.predict_proba(scaled_X_test)

# Multi-class log loss against the hold-out labels
val_logloss = log_loss(y_true=y_test, y_pred=pred)
print(f"logloss: {val_logloss}")

logloss: 0.8010000619961829


## 성능 확인 (데이콘 제출)

In [None]:
# Reuse the ColumnTransformer that was already fitted on X_train — do NOT build
# and fit a fresh one on `test`.
# `ngb_model` learned from features standardised with the training means/variances
# and one-hot encoded with the training category layout. Re-fitting on `test`
# would scale with different statistics and could produce a different one-hot
# column order or count, so the model would score features it was never trained
# on. The column lists and transformers defined with the original `preprocessor`
# are therefore left untouched here.

# `preprocessor` selects columns by name and passes the rest through, so the
# competition table must expose exactly the training feature columns.
assert list(test.columns) == list(X_train.columns), (
    "test must have the same feature columns, in the same order, as X_train"
)

# Display the (train-fitted) transformer that will be applied to `test`.
preprocessor

ColumnTransformer(remainder='passthrough',
                  transformers=[('num', StandardScaler(),
                                 ['child_num', 'income_total', 'DAYS_EMPLOYED',
                                  'family_size', 'begin_month', 'age']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['income_type', 'edu_type', 'family_type',
                                  'house_type', 'occyp_type'])])

In [None]:
# Transform the competition test table with `preprocessor`, then show the shape
# of the resulting feature matrix (rows, encoded feature columns).
scaled_test = preprocessor.transform(test)
scaled_test.shape

(10000, 51)

In [None]:
# Class probabilities from the final model for every row of the test table
test_pred = ngb_model.predict_proba(scaled_test)

In [None]:
# Fill the sample submission with the predicted class probabilities.
submission = pd.read_csv('data/sample_submission.csv')

# Everything after the first (`index`) column is a per-class probability column.
# Select those columns by label instead of `.loc[:, 1:]`: `.loc` is label-based,
# and an integer slice on string column labels only worked through a deprecated
# positional fallback (it warns on older pandas and raises TypeError on pandas 2.x).
# Assigning whole columns also swaps in the float values without trying to write
# them into the placeholder columns' existing dtype.
prob_columns = list(submission.columns[1:])
assert test_pred.shape == (len(submission), len(prob_columns)), (
    "prediction matrix does not match the submission template"
)
submission[prob_columns] = test_pred
submission

  submission.loc[:,1:] = test_pred


Unnamed: 0,index,0,1,2
0,26457,0.099329,0.173343,0.727328
1,26458,0.108902,0.173543,0.717555
2,26459,0.113690,0.225014,0.661296
3,26460,0.104797,0.180591,0.714612
4,26461,0.116748,0.222793,0.660459
...,...,...,...,...
9995,36452,0.106142,0.186262,0.707596
9996,36453,0.096015,0.235599,0.668386
9997,36454,0.097253,0.160048,0.742699
9998,36455,0.097184,0.175862,0.726954


In [None]:
# Write the filled-in submission table to CSV; index=False leaves out the
# DataFrame's row index so only the template's own columns are saved.
submission.to_csv('ngboost_submission_final.csv',index=False)

In [None]:
# Display the submission table that was written above as a final visual check.
submission

Unnamed: 0,index,0,1,2
0,26457,0.099329,0.173343,0.727328
1,26458,0.108902,0.173543,0.717555
2,26459,0.113690,0.225014,0.661296
3,26460,0.104797,0.180591,0.714612
4,26461,0.116748,0.222793,0.660459
...,...,...,...,...
9995,36452,0.106142,0.186262,0.707596
9996,36453,0.096015,0.235599,0.668386
9997,36454,0.097253,0.160048,0.742699
9998,36455,0.097184,0.175862,0.726954
