In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

from vecstack import stacking, StackingTransformer

from sklearn.metrics import log_loss,accuracy_score

from tensorflow.keras import utils

# pycaret으로 직업 예측 진행

# 데이터 전처리

In [2]:
train = pd.read_csv('./data/train_occpy_pred_final.csv')
test = pd.read_csv('./data/test_occpy_pred_final.csv')

In [3]:
train.gender = train.gender.replace({'F' : 0, 'M' : 1})
train.car = train.car.replace({'N' : 0, 'Y' : 1})
train.reality = train.reality.replace({'N' : 0, 'Y' : 1})
train['age'] = train.DAYS_BIRTH.apply(lambda x : -x // 365)
train.DAYS_EMPLOYED = (-1) * train.DAYS_EMPLOYED 
train.loc[(train.DAYS_EMPLOYED < 0), 'DAYS_EMPLOYED'] = 0
train.begin_month = (-1) * train.begin_month

train = train.drop(['Unnamed: 0','DAYS_BIRTH'], axis = 1)
train.head(3)

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,age
0,0,0,0,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,4709,0,0,0,Accountants,2.0,6.0,1.0,38
1,0,0,1,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,1540,0,0,1,Laborers,3.0,5.0,1.0,31
2,1,1,1,0,450000.0,Working,Higher education,Married,House / apartment,4434,0,1,0,Managers,2.0,22.0,2.0,52


In [4]:
test.gender = test.gender.replace({'F' : 0, 'M' : 1})
test.car = test.car.replace({'N' : 0, 'Y' : 1})
test.reality = test.reality.replace({'N' : 0, 'Y' : 1})
test['age'] = test.DAYS_BIRTH.apply(lambda x : -x // 365)
test.DAYS_EMPLOYED = (-1) * test.DAYS_EMPLOYED 
test.loc[(test.DAYS_EMPLOYED < 0), 'DAYS_EMPLOYED'] = 0
test.begin_month = (-1) * test.begin_month

test = test.drop(['Unnamed: 0','DAYS_BIRTH'], axis = 1)
test.head(3)

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age
0,1,1,0,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,0,0,1,0,Security staff,2.0,60.0,60
1,0,0,1,0,135000.0,State servant,Higher education,Married,House / apartment,8671,0,1,0,Core staff,2.0,36.0,51
2,0,0,1,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,217,1,1,0,Laborers,2.0,40.0,43


In [5]:
y = train.credit
X = train.drop(['credit'], axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [8]:
numeric_features = ['child_num', 'income_total', 'DAYS_EMPLOYED', 'family_size', 'begin_month', 'age']
numeric_transformer = StandardScaler()

categorical_features = ['income_type', 'edu_type', 'family_type', 'house_type','occyp_type']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown = 'ignore')


preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)
                ], remainder='passthrough'
                )

In [9]:
preprocessor.fit(X_train)
scaled_X_train = preprocessor.transform(X_train)
scaled_X_test = preprocessor.transform(X_test)



## scaled_X_train, y_train 으로 학습
## scaled_X_test, y_test 로 예측

### Stacking

In [15]:
xgb_clf = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=2,
               missing=np.nan, monotone_constraints='()', n_estimators=170,
               n_jobs=-1, num_parallel_tree=1, objective='multi:softprob',
               predictor='auto', random_state=42, reg_alpha=10)

lgbm_clf = LGBMClassifier(bagging_fraction=0.8, bagging_freq=3, boosting_type='gbdt',
                class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
                importance_type='split', learning_rate=0.2, max_depth=-1,
                min_child_samples=6, min_child_weight=0.001, min_split_gain=0.6,
                n_estimators=100, n_jobs=-1, num_leaves=30, objective=None,
                random_state=42, reg_alpha=0.001, reg_lambda=5, silent='warn',
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

gb_clf = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=8,
                            max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.001, min_impurity_split=None,
                            min_samples_leaf=5, min_samples_split=10,
                            min_weight_fraction_leaf=0.0, n_estimators=290,
                            n_iter_no_change=None, 
                            random_state=42, subsample=0.25, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [22]:
models = [xgb_clf, lgbm_clf, gb_clf]

S_train, S_test = stacking(models, 
                       scaled_X_train, y_train, scaled_X_test, 
                       regression = False, 
                       metric = accuracy_score,
#                        needs_proba = True,                           
                       n_folds = 5, stratified = True, shuffle = True, 
                       random_state = 0, verbose = 1)

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [XGBClassifier]
Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


    ----
    MEAN:     [0.69969289] + [0.00308887]
    FULL:     [0.69969289]

model  1:     [LGBMClassifier]
    ----
    MEAN:     [0.69562958] + [0.00344488]
    FULL:     [0.69562958]

model  2:     [GradientBoostingClassifier]
    ----
    MEAN:     [0.69624380] + [0.00369440]
    FULL:     [0.69624380]



In [25]:
xgb_final = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=2,
               missing=np.nan, monotone_constraints='()', n_estimators=170,
               n_jobs=-1, num_parallel_tree=1, objective='multi:softprob',
               predictor='auto', random_state=42, reg_alpha=10)

In [26]:
xgb_final.fit(S_train, y_train)
pred_proba = xgb_final.predict_proba(S_test)




Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [30]:
pred = xgb_final.predict(S_test)
print(accuracy_score(y_test, pred))

0.7106953892668179


In [None]:
log_loss(y_test, pred_proba)
# 0.7652126036905378

In [23]:
train_y = train.credit
train_X = train.drop('credit', axis = 1)

In [24]:
numeric_features = ['child_num', 'income_total', 'DAYS_EMPLOYED', 'family_size', 'begin_month', 'age']
numeric_transformer = StandardScaler()

categorical_features = ['income_type', 'edu_type', 'family_type', 'house_type','occyp_type']
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown = 'ignore')


preprocessor = ColumnTransformer(
                transformers=[
                    ('num', numeric_transformer, numeric_features),
                    ('cat', categorical_transformer, categorical_features)
                ], remainder='passthrough'
                )

In [25]:
preprocessor.fit(train_X)
scaled_train = preprocessor.transform(train_X)
scaled_test = preprocessor.transform(test)

In [26]:
xgb_clf = XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=2,
               missing=np.nan, monotone_constraints='()', n_estimators=170,
               n_jobs=-1, num_parallel_tree=1, objective='multi:softprob',
               predictor='auto', random_state=42, reg_alpha=10)

lgbm_clf = LGBMClassifier(bagging_fraction=0.8, bagging_freq=3, boosting_type='gbdt',
                class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
                importance_type='split', learning_rate=0.2, max_depth=-1,
                min_child_samples=6, min_child_weight=0.001, min_split_gain=0.6,
                n_estimators=100, n_jobs=-1, num_leaves=30, objective=None,
                random_state=42, reg_alpha=0.001, reg_lambda=5, silent='warn',
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

gb_clf = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                            learning_rate=0.1, loss='deviance', max_depth=8,
                            max_features='sqrt', max_leaf_nodes=None,
                            min_impurity_decrease=0.001, min_impurity_split=None,
                            min_samples_leaf=5, min_samples_split=10,
                            min_weight_fraction_leaf=0.0, n_estimators=290,
                            n_iter_no_change=None, 
                            random_state=42, subsample=0.25, tol=0.0001,
                            validation_fraction=0.1, verbose=0,
                            warm_start=False)

In [39]:
estimators = [
    ('XGB', xgb_clf),
    ('LGBM', lgbm_clf),
    ('GB', gb_clf)
    ]

In [40]:
stack = StackingTransformer(estimators,
                       regression = False, 
                       metric = accuracy_score,                    
                       n_folds = 5, stratified = True, shuffle = True, 
                       random_state = 0, verbose = 1)

stack.fit(scaled_train, train_y)

S_train = stack.transform(scaled_train)
S_test = stack.transform(scaled_test)

model = XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3, eval_metric='mlogloss') 
model = model.fit(S_train, train_y)

y_pred_proba = model.predict_proba(S_test) 

In [41]:
stack.fit(scaled_train, train_y)

task:         [classification]
n_classes:    [3]
metric:       [accuracy_score]
variant:      [A]
n_estimators: [3]

estimator  0: [XGB: XGBClassifier]
Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






Parameters: { "max_cat_to_onehot" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


    MEAN:     [0.70586246] + [0.00285516]

estimator  1: [LGBM: LGBMClassifier]
    MEAN:     [0.69773614] + [0.00339640]

estimator  2: [GB: GradientBoostingClassifier]
    MEAN:     [0.70578680] + [0.00477084]



StackingTransformer(estimators=[('XGB',
                                 XGBClassifier(base_score=0.5, booster='gbtree',
                                               callbacks=None,
                                               colsample_bylevel=1,
                                               colsample_bynode=1,
                                               colsample_bytree=0.7,
                                               early_stopping_rounds=None,
                                               enable_categorical=False,
                                               eval_metric=None, gamma=0,
                                               gpu_id=-1,
                                               grow_policy='depthwise',
                                               importance_type=None,
                                               interaction_constraints='',
                                               learning_rate=0.3, max_bin=256,
                                     

In [42]:
S_train = stack.transform(scaled_train)
S_test = stack.transform(scaled_test)

Train set was detected.
Transforming...

estimator  0: [XGB: XGBClassifier]
    DONE

estimator  1: [LGBM: LGBMClassifier]
    DONE

estimator  2: [GB: GradientBoostingClassifier]
    DONE

Transforming...

estimator  0: [XGB: XGBClassifier]
    DONE

estimator  1: [LGBM: LGBMClassifier]
    DONE

estimator  2: [GB: GradientBoostingClassifier]
    DONE



In [44]:
model = XGBClassifier(seed = 0, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 3, eval_metric='mlogloss') 
model = model.fit(S_train, train_y) 



In [45]:
y_pred = model.predict(S_test) 
print(y_pred)

[2. 2. 2. ... 2. 2. 2.]


In [49]:
y_pred_proba = model.predict_proba(S_test) 
print(y_pred_proba)

[[0.10300916 0.17039236 0.72659844]
 [0.10300916 0.17039236 0.72659844]
 [0.10300916 0.17039236 0.72659844]
 ...
 [0.10300916 0.17039236 0.72659844]
 [0.10300916 0.17039236 0.72659844]
 [0.10300916 0.17039236 0.72659844]]


In [55]:
submission = pd.read_csv('./data/sample_submission.csv')
submission.loc[:, 1:] = y_pred_proba
submission

  submission.loc[:, 1:] = y_pred_proba


Unnamed: 0,index,0,1,2
0,26457,0.103009,0.170392,0.726598
1,26458,0.103009,0.170392,0.726598
2,26459,0.103009,0.170392,0.726598
3,26460,0.103009,0.170392,0.726598
4,26461,0.103009,0.170392,0.726598
...,...,...,...,...
9995,36452,0.103009,0.170392,0.726598
9996,36453,0.103009,0.170392,0.726598
9997,36454,0.103009,0.170392,0.726598
9998,36455,0.103009,0.170392,0.726598


In [57]:
submission.to_csv('./data/submission.csv', index = False)