In [100]:
import numpy as np
import pandas as pd

In [101]:
import catboost
from catboost import cv, Pool, CatBoostClassifier

In [102]:
# Relative directory holding the competition CSVs read below.
# The trailing slash matters: file names are appended with plain `+`.
data_folder = (
    '../input/'
    'uet-hackathon-2022-data-science/'
)

In [103]:
# Training tables: work rows, info rows and labels.
wt = pd.read_csv(f'{data_folder}work_train.csv')
it = pd.read_csv(f'{data_folder}info_train.csv')
lbt = pd.read_csv(f'{data_folder}label_train.csv')

# Test tables: work rows and info rows (no labels).
wte = pd.read_csv(f'{data_folder}work_test.csv')
ite = pd.read_csv(f'{data_folder}info_test.csv')

In [104]:
# Left-join the info table, then the label table, onto the work rows by id_bh,
# so every row of `wt` is retained.
fts = wt.merge(it, how='left', on='id_bh')
df = fts.merge(lbt, how='left', on='id_bh')

In [105]:
# Both merged tables carry an `address` column, so the merge suffixed them
# `_x` (from wt) and `_y` (from it). Prefer the wt value and fall back to the
# it value where it is missing.
work_address = df['address_x']
info_address = df['address_y']
df['address'] = work_address.combine_first(info_address)

In [106]:
# First four characters of `to_date` rendered as text.
# presumably the year component; verify the to_date format in work_train.csv
to_date_text = df['to_date'].astype('string')
df['lastest'] = to_date_text.str[:4]

In [107]:
# Fraction of missing values per column (NaN count divided by row count).
df.isna().sum().div(df.shape[0])

In [108]:
# Keep only the rows that have a job/role value.
has_job_role = df['job/role'].notna()
df = df[has_job_role]

In [109]:
# Reduce to one row per id_bh. Rows are ordered so that, within each id_bh,
# the largest to_date (ties broken by the largest from_date) comes first;
# dropping later duplicates then keeps exactly that row.
train_latest_first = df.sort_values(
    by=['id_bh', 'to_date', 'from_date'],
    ascending=[True, False, False],
)
df_filter = train_latest_first.drop_duplicates(
    subset=['id_bh'],
    keep='first',
    ignore_index=True,
)

In [110]:
# Age relative to 2022, from the (misspelled in the data) `bithYear` column.
train_birth_year = df_filter['bithYear'].astype(int)
df_filter['age'] = 2022 - train_birth_year

In [111]:
# Gap between 2022 and the `lastest` value (first four characters of to_date).
train_last_year = df_filter['lastest'].astype(int)
df_filter['delay'] = 2022 - train_last_year

In [112]:
# Model input columns, in the order they are passed to CatBoost.
fts_name = [
    'id_management',
    'id_office',
    'company_type',
    'employee_lv',
    'gender',
    'age',
    'delay',
]
# Positions (within fts_name) of the columns treated as categorical.
cat_fts_idx = [
    fts_name.index(col)
    for col in ('id_management', 'id_office', 'company_type', 'gender')
]

In [113]:
# Impute missing values by assigning the filled column back to the frame.
# The earlier `df_filter[col].fillna(..., inplace=True)` form mutates a
# temporary Series: pandas 2.x emits a FutureWarning for it and under
# Copy-on-Write it is a silent no-op, which would leave NaN in the
# categorical `id_office` feature passed to CatBoost.
df_filter['id_office'] = df_filter['id_office'].fillna('Unknow')
df_filter['address_x'] = df_filter['address_x'].fillna('Việt Nam')
df_filter['address_y'] = df_filter['address_y'].fillna('Việt Nam')

In [114]:
# CatBoost dataset for cross-validation: the selected feature columns, the
# `label` target, and the categorical columns given by position.
train_features = df_filter[fts_name]
train_labels = df_filter['label']
cv_data = Pool(
    data=train_features,
    label=train_labels,
    cat_features=cat_fts_idx,
)

In [115]:
# CatBoost training configuration shared by every CV fold.
params = dict(
    iterations=500,
    random_seed=42,
    bagging_temperature=0.1,
    l2_leaf_reg=10,
    leaf_estimation_iterations=5,
    loss_function="MultiClass",
    # Extra metrics reported alongside the loss.
    custom_metric=['TotalF1:average=Macro', 'TotalF1:average=Weighted'],
    task_type='GPU',
)

# 5-fold cross-validation. `return_models=True` makes cv() return the fitted
# per-fold models in addition to the metric table.
cv_output = cv(
    pool=cv_data,
    params=params,
    fold_count=5,
    return_models=True,
)
scores, models = cv_output

In [116]:
# Show only the columns whose name contains 'test', plus the iteration index.
test_score_cols = [name for name in scores.columns if 'test' in name]
scores[test_score_cols + ['iterations']]

In [117]:
# Left-join the test info table onto the test work rows by id_bh.
df_test = wte.merge(ite, how='left', on='id_bh')

In [118]:
# Prefer the address from the work table (`_x`) and fall back to the one
# from the info table (`_y`) where it is missing.
test_work_address = df_test['address_x']
test_info_address = df_test['address_y']
df_test['address'] = test_work_address.combine_first(test_info_address)

In [119]:
# First four characters of `to_date` rendered as text.
# presumably the year component; verify the to_date format in work_test.csv
test_to_date_text = df_test['to_date'].astype('string')
df_test['lastest'] = test_to_date_text.str[:4]

In [120]:
# Reduce the test set to one row per id_bh. Rows are ordered so that the
# largest to_date (ties broken by the largest from_date) comes first within
# each id_bh, and only that first row is kept.
test_latest_first = df_test.sort_values(
    by=['id_bh', 'to_date', 'from_date'],
    ascending=[True, False, False],
)
df_filter_test = test_latest_first.drop_duplicates(
    subset=['id_bh'],
    keep='first',
    ignore_index=True,
)

In [121]:
# Age relative to 2022, from the (misspelled in the data) `bithYear` column.
test_birth_year = df_filter_test['bithYear'].astype(int)
df_filter_test['age'] = 2022 - test_birth_year

In [122]:
# Gap between 2022 and the `lastest` value (first four characters of to_date).
test_last_year = df_filter_test['lastest'].astype(int)
df_filter_test['delay'] = 2022 - test_last_year

In [123]:
# Impute the test frame with exactly the same sentinels as the training frame.
# - `id_office` is a categorical model feature: training fills it with
#   'Unknow', so using 'unknow' here would present the models with a category
#   they never saw for every row with a missing office.
# - The address fills were previously computed and thrown away (the result of
#   fillna was neither assigned nor applied in place), and used a different
#   casing from the training-side 'Việt Nam'.
# Results are assigned back to the column instead of using chained
# `inplace=True`, which is a no-op under pandas Copy-on-Write.
df_filter_test['id_office'] = df_filter_test['id_office'].fillna('Unknow')
df_filter_test['address_x'] = df_filter_test['address_x'].fillna('Việt Nam')
df_filter_test['address_y'] = df_filter_test['address_y'].fillna('Việt Nam')

In [124]:
# Class-probability predictions on the test features, one array per CV fold
# model (models[0] .. models[4]).
test_features = df_filter_test[fts_name]
(pred_probs_1,
 pred_probs_2,
 pred_probs_3,
 pred_probs_4,
 pred_probs_5) = [
    models[fold].predict(test_features, prediction_type='Probability')
    for fold in range(5)
]

In [125]:
# Average the five per-fold probability arrays element-wise.
# Seeding sum() with the first array keeps the original left-to-right
# addition order.
fold_probs = [pred_probs_1, pred_probs_2, pred_probs_3, pred_probs_4, pred_probs_5]
pred_probs = sum(fold_probs[1:], fold_probs[0]) / 5

In [126]:
# Pick the most probable class column per row and shift it by one.
# NOTE(review): the +1 assumes the class labels are the consecutive integers
# 1..K in the same order as the probability columns - confirm against the
# label values in label_train.csv.
predicted_class_idx = np.argmax(pred_probs, axis=1)
df_filter_test['label'] = predicted_class_idx + 1

In [127]:
# Submission template listing the ids to predict. Built from `data_folder`
# like every other input file, so the data location is defined in one place.
submit = pd.read_csv(data_folder + 'label_test.csv')

In [128]:
# Keep only the id and the predicted label for the submission join.
submission_cols = ['id_bh', 'label']
res = df_filter_test.loc[:, submission_cols]

In [129]:
# Attach predictions to the submission template by id_bh. The inner join keeps
# only ids present in both frames.
# NOTE(review): if label_test.csv already has a `label` column, this merge
# will produce `label_x` / `label_y` instead of a single `label` - confirm.
submit_df = submit.merge(res, how='inner', on='id_bh')
submit_df.to_csv('submission.csv', index=False)