In [1]:
import numpy as np
import pandas as pd
import sklearn.naive_bayes as NB
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [38]:
ds = pd.read_csv('D:\\Dataset\\EE655000MachineLearning\\Aidea\\train.csv')
test_ds = pd.read_csv('D:\\Dataset\\EE655000MachineLearning\\Aidea\\test.csv')
print(len(train_ds))
print(len(test_ds))

11455
3739


In [39]:
ds.isna().sum()

yyyy            0
PerNo           0
PerStatus       0
sex            73
工作分類           73
職等             73
廠區代碼           73
管理層級           73
工作資歷1          73
工作資歷2          73
工作資歷3          73
工作資歷4          73
工作資歷5          73
專案時數           73
專案總數           73
當前專案角色         73
特殊專案佔比         73
工作地點           73
訓練時數A          73
訓練時數B          73
訓練時數C          73
生產總額           73
榮譽數            73
是否升遷           73
升遷速度           73
近三月請假數A        73
近一年請假數A        73
近三月請假數B        73
近一年請假數B        73
出差數A           73
出差數B           73
出差集中度          73
年度績效等級A        73
年度績效等級B        73
年度績效等級C        73
年齡層級           73
婚姻狀況           73
年資層級A          73
年資層級B          73
年資層級C          73
任職前工作平均年數      73
最高學歷         5326
畢業學校類別       3841
畢業科系類別         73
眷屬量            73
通勤成本           73
歸屬部門           73
dtype: int64

In [40]:
columns = list(ds.columns)
columns.remove('最高學歷')
columns.remove('畢業學校類別')
ds = ds.loc[:,columns]
test_ds = test_ds.loc[:,columns]
test_ds.drop(columns='PerStatus', inplace=True)

In [71]:
ds.dropna(inplace=True)
test_ds_valid = test_ds.dropna().copy()
test_ds_invalid = test_ds[test_ds.isna().any(axis=1)].copy()
print(len(ds))

14319


In [109]:
test_ds_invalid # 18 IDs has no valid data in test.csv

Unnamed: 0,yyyy,PerNo,sex,工作分類,職等,廠區代碼,管理層級,工作資歷1,工作資歷2,工作資歷3,...,年齡層級,婚姻狀況,年資層級A,年資層級B,年資層級C,任職前工作平均年數,畢業科系類別,眷屬量,通勤成本,歸屬部門
34,2018,87,,,,,,,,,...,,,,,,,,,,
101,2018,276,,,,,,,,,...,,,,,,,,,,
206,2018,535,,,,,,,,,...,,,,,,,,,,
311,2018,785,,,,,,,,,...,,,,,,,,,,
447,2018,1075,,,,,,,,,...,,,,,,,,,,
951,2018,2317,,,,,,,,,...,,,,,,,,,,
1288,2018,3109,,,,,,,,,...,,,,,,,,,,
1830,2018,4324,,,,,,,,,...,,,,,,,,,,
1925,2018,4537,,,,,,,,,...,,,,,,,,,,
2051,2018,4831,,,,,,,,,...,,,,,,,,,,


In [52]:
ds.PerStatus.value_counts() # label distribution

0    13526
1      793
Name: PerStatus, dtype: int64

In [53]:
train_ds, val_ds = train_test_split(ds, test_size=0.2, random_state=42)

In [54]:
num_feats = ['職等',
             '管理層級',
             '專案時數',
             '專案總數',
             '特殊專案佔比',
             '訓練時數A',
             '訓練時數B',
             '訓練時數C',
             '生產總額',
             '榮譽數',
             '升遷速度',
             '近三月請假數A',
             '近一年請假數A',
             '近三月請假數B',
             '近一年請假數B',
             '出差數A',
             '出差數B',
             '出差集中度', # un sure
             '年度績效等級A',
             '年度績效等級B',
             '年度績效等級C',
             '年齡層級',
             '年資層級A',
             '年資層級B',
             '年資層級C',
             '任職前工作平均年數',
             '眷屬量',
             '通勤成本',
             ]
cat_feats = ['sex',
             '工作分類',
             '廠區代碼',
             '工作資歷1',
             '工作資歷2',
             '工作資歷3',
             '工作資歷4',
             '工作資歷5',
             '當前專案角色',
             '工作地點',
             '是否升遷',
             '婚姻狀況',
             '畢業科系類別',
             '歸屬部門',
             ]

In [72]:
X_train_num = train_ds.loc[:,num_feats].values
X_train_cat = train_ds.loc[:,cat_feats].values
X_train_all = train_ds.iloc[:,3:]
y_train = train_ds.PerStatus.values

X_val_num = val_ds.loc[:,num_feats].values
X_val_cat = val_ds.loc[:,cat_feats].values
X_val_all = val_ds.iloc[:,3:]
y_val = val_ds.PerStatus.values

X_test_num = test_ds_valid.loc[:,num_feats].values
X_test_cat = test_ds_valid.loc[:,cat_feats].values
X_test_all = test_ds_valid.iloc[:,3:]

In [117]:
def F_beta_score(y_true, y_pred, beta=1.5):
    """F beta score with beta=1.5"""
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f_beta = (1+beta**2)*(prec*rec)/(beta**2*prec + rec)
    return f_beta

def fit_and_predict(model, X_train, y_trian, X_val, y_val):
    """Fast apply of an classifier on training and validation set"""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(f'Beta F1 score: {F_beta_score(y_val, y_pred):.4f}')
    return y_pred

def make_submission(model, X_train, y_train, X_test, test_ds_valid, test_ds_invalid, test_ds):
    """output valid submission model"""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)[:,np.newaxis]
    output_no = test_ds_valid.loc[:,'PerNo'].values[:,np.newaxis]
    output_valid = np.hstack((output_no, y_pred))
    print(output_valid.shape)
    
    # for 18 IDs with no valid data, apply random predict
    per_invalid = test_ds_invalid.PerNo.values[:,np.newaxis]
    y_pred_invalid = np.random.rand(len(per_invalid))
    y_pred_invalid = np.array([1 if y_<0.05 else 0 for y_ in y_pred_invalid])[:,np.newaxis]
    output_invalid = np.hstack((per_invalid, y_pred_invalid))
    print(output_invalid.shape)
    
    output = np.vstack((output_valid,output_invalid))
    output_ds = pd.DataFrame(output, columns=['PerNo', 'PerStatus'])
    
    output_ds = output_ds.set_index('PerNo')
    output_ds = output_ds.reindex(index=test_ds['PerNo'])
    output_ds = output_ds.reset_index()
    return output_ds

In [12]:
mnb = NB.MultinomialNB()
y_pred = fit_and_predict(mnb, X_train_all, y_train, X_val_all, y_val)

Beta F1 score: 0.2129


In [13]:
rf = RandomForestClassifier(n_estimators=50, max_depth=5, class_weight='balanced')
y_pred = fit_and_predict(rf, X_train_all, y_train, X_val_all, y_val)

Beta F1 score: 0.2395


In [69]:
sc = StandardScaler()
sc.fit(X_train_num)
X_train_num_std = sc.transform(X_train_num)
X_val_num_std = sc.transform(X_val_num)
svc = SVC(C=1e-1, class_weight='balanced')
y_pred = fit_and_predict(svc, X_train_num_std, y_train, X_val_num_std, y_val)

Beta F1 score: 0.2460


In [115]:
sc = StandardScaler()
X_train_ = np.vstack((X_train_num, X_val_num))
y_train_ = np.vstack((y_train[:,np.newaxis], y_val[:,np.newaxis]))
X_test_ = X_test_num
sc.fit(X_train_)
X_train_std_ = sc.transform(X_train_)
X_test_std_ = sc.transform(X_test_)

svc = SVC(C=1e-1, class_weight='balanced')
submission = make_submission(svc, X_train_std_, y_train_.ravel(), X_test_std_, test_ds_valid, test_ds_invalid, test_ds)

(3721, 2)
(18, 2)


In [116]:
submission

Unnamed: 0,PerNo,PerStatus
0,1,0
1,3,0
2,7,1
3,15,0
4,16,1
...,...,...
3734,8761,1
3735,8765,0
3736,8767,1
3737,8774,0


In [108]:
submission.to_csv('./submission_svc.csv', index=False)

In [112]:
xgb_cat = xgb.XGBClassifier(enable_categorical=True, use_label_encoder=False)
y_pred = fit_and_predict(xgb_cat, X_train_cat, y_train, X_val_cat, y_val)

Parameters: { "enable_categorical" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Beta F1 score: 0.0785
