In [61]:
from collections import Counter
from sklearn.metrics import roc_auc_score, f1_score, fbeta_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
import os

In [71]:
def lgb_fbeta_score(y_hat, data):
    """Custom LightGBM evaluation metric: F-beta (beta=1.5) at a 0.5 cut-off.

    Written for the plain ("naive") LightGBM run, where it is passed as
    ``feval``.

    Parameters
    ----------
    y_hat : numpy.ndarray
        Raw scores handed over by LightGBM for the dataset being evaluated.
    data : object exposing ``get_label()``
        The dataset being evaluated; only its labels are read.

    Returns
    -------
    tuple
        ``('fbeta', score, True)`` -- metric name, value, and the
        higher-is-better flag LightGBM expects from a custom metric.
    """
    labels = data.get_label()
    # Scores strictly below 0.5 become class 0, everything else class 1.
    hard_preds = np.where(y_hat < 0.5, 0, 1)
    score = fbeta_score(labels, hard_preds, beta=1.5)
    return 'fbeta', score, True

In [72]:
# Directory (relative to the working directory) that holds the competition CSV files.
folder_path = 'data/'

In [73]:
# Show which files are present in the data directory.
os.listdir(folder_path)

['test.csv', 'submission.csv', 'season.csv', 'train.csv']

In [74]:
# Load the labelled training table and the unlabelled test table from the data directory.
df_train, df_test = (
    pd.read_csv(os.path.join(folder_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [75]:
# Hold out 20% of the labelled rows for validation; stratifying on the target
# keeps the class ratio the same in both parts, and the fixed seed makes the
# split repeatable.
df_tr, df_val = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train['PerStatus'],
    random_state=42,
)

In [76]:
# Columns fed to the model, in the order they will appear in the feature matrix.
# The names are kept verbatim because they are used to index the loaded frames.
features = [
    'sex', '工作分類', '職等', '廠區代碼',
    '管理層級', '工作資歷1', '工作資歷2', '工作資歷3',
    '工作資歷4', '工作資歷5', '專案時數', '專案總數',
    '當前專案角色', '特殊專案佔比', '工作地點', '訓練時數A',
    '訓練時數B', '訓練時數C', '生產總額', '榮譽數',
    '是否升遷', '升遷速度', '近三月請假數A', '近一年請假數A',
    '近三月請假數B', '近一年請假數B', '出差數A', '出差數B',
    '出差集中度', '年度績效等級A', '年度績效等級B', '年度績效等級C',
    '年齡層級', '婚姻狀況', '年資層級A', '年資層級B',
    '年資層級C', '任職前工作平均年數', '最高學歷', '畢業學校類別',
    '畢業科系類別', '眷屬量', '通勤成本', '歸屬部門',
]

In [77]:
# Feature matrices are plain NumPy arrays (column names are dropped by .values);
# the targets stay as pandas Series. The test table has no target to extract here.
tr_X, tr_y = df_tr[features].values, df_tr['PerStatus']
val_X, val_y = df_val[features].values, df_val['PerStatus']
te_X = df_test[features].values

In [78]:
# Wrap the train / validation arrays in LightGBM's Dataset container.
lgtrain = lgb.Dataset(data=tr_X, label=tr_y)
lgvalid = lgb.Dataset(data=val_X, label=val_y)

In [91]:
# LightGBM training configuration for the binary classifier.
params = dict(
    objective="binary",
    num_leaves=30,
    max_depth=-1,            # no explicit depth cap; num_leaves bounds tree size
    bagging_fraction=0.8,    # subsample
    feature_fraction=0.8,    # colsample_bytree
    bagging_freq=5,          # subsample_freq
    bagging_seed=2018,
    num_threads=4,
    lambda_l1=0.9,
    lambda_l2=0.5,
    learning_rate=0.1,
    metric="None",           # the string "None": no built-in metric, only the custom feval is reported
    is_unbalance=True,
    verbosity=-1,
)

In [92]:
# Filled in place by the record_evaluation callback with the per-round metric
# history of every evaluated dataset.
evals_result = {}

# The `early_stopping_rounds=`, `verbose_eval=` and `evals_result=` keyword
# arguments were deprecated in LightGBM 3.3 and removed from lgb.train() in 4.0,
# where passing them raises TypeError. The callbacks below are the equivalent
# mechanism. `log_evaluation` is the 3.3+ name of `print_evaluation`; the
# fallback keeps this cell runnable on older releases as well.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

clf = lgb.train(
    params,
    lgtrain,
    num_boost_round=1000,
    valid_sets=[lgvalid, lgtrain],
    valid_names=['validation', 'train'],
    feval=lgb_fbeta_score,
    callbacks=[
        # Stop once the validation metric has not improved for 200 rounds.
        lgb.early_stopping(stopping_rounds=200),
        # Print the metrics every 50 boosting rounds.
        _log_evaluation(period=50),
        lgb.record_evaluation(evals_result),
    ],
)

Training until validation scores don't improve for 200 rounds
[50]	train's fbeta: 0.528535	validation's fbeta: 0.286312
[100]	train's fbeta: 0.672099	validation's fbeta: 0.297168
[150]	train's fbeta: 0.777074	validation's fbeta: 0.299325
[200]	train's fbeta: 0.856815	validation's fbeta: 0.263022
[250]	train's fbeta: 0.886395	validation's fbeta: 0.23908
[300]	train's fbeta: 0.94706	validation's fbeta: 0.22807
Early stopping, best iteration is:
[120]	train's fbeta: 0.719237	validation's fbeta: 0.306898


In [93]:
# Score the validation rows, then cut at 0.5 exactly as the training metric does:
# scores strictly below 0.5 -> class 0, everything else -> class 1.
val_scores = clf.predict(val_X)
pred_val = np.where(val_scores < 0.5, 0, 1)

In [94]:
# Tally how many validation rows were assigned to each predicted class.
Counter(pred_val)

Counter({0: 2559, 1: 320})

In [95]:
# Recompute the validation F-beta (beta=1.5) outside LightGBM as a sanity check
# against the value reported during training.
fbeta_score(y_true=val_y, y_pred=pred_val, beta=1.5)

0.30689782368129837

# Test Inference

In [96]:
# Score the test rows and apply the same 0.5 cut-off used on the validation set.
test_scores = clf.predict(te_X)
pred_test = np.where(test_scores < 0.5, 0, 1)

In [97]:
# Tally how many test rows were assigned to each predicted class.
Counter(pred_test)

Counter({0: 3458, 1: 281})

In [98]:
# Load the submission file shipped in the data directory; its target column is overwritten below.
sub = pd.read_csv(os.path.join(folder_path, 'submission.csv'))

In [99]:
# Positional assignment: pred_test is a bare NumPy array, so row i of the
# submission gets the prediction for row i of test.csv.
# NOTE(review): assumes submission.csv lists rows in the same order as test.csv -- confirm.
sub['PerStatus'] = pred_test

In [100]:
# Written to the current working directory (not folder_path), so the file
# read from the data directory is left untouched. index=False keeps the
# DataFrame index out of the CSV.
sub.to_csv('submission.csv', index=False)