In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import auc, roc_curve
from sklearn.linear_model import LogisticRegression

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.rename(columns={'uesr_id': 'user_id'}, inplace=True)
train = train_data.drop(['Attrition'], axis=1)
train_label = pd.DataFrame(train_data['Attrition'])

train['origin'] = 'train'
train.set_index('user_id', inplace=True)
test_data['origin'] = 'test'
test_data.set_index('user_id', inplace=True)
matrix = pd.concat([train, test_data])
matrix.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1, inplace=True)

In [4]:
lbe_label = LabelEncoder()
train_label['Attrition'] = lbe_label.fit_transform(train_label['Attrition'])

for i in ['BusinessTravel', 'Department', 'EducationField', 'JobRole']:
    lbe = LabelEncoder()
    matrix[i] = lbe.fit_transform(matrix[i])

for i in ['Gender', 'MaritalStatus', 'OverTime']:
    temp = pd.get_dummies(matrix[i], prefix=i)
    matrix = pd.concat([matrix, temp], axis=1)
    matrix.drop([i], axis=1, inplace=True)

In [5]:
train = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test = matrix[matrix['origin'] == 'test'].drop(['origin'], axis=1)
train_x, valid_x, train_y, valid_y = train_test_split(train, train_label, test_size=0.2)

In [7]:
# LR
lr = LogisticRegression()
lr.fit(train_x, train_y)
valid_pred = lr.predict(valid_x)
fpr_lr, tpr_lr, _ = roc_curve(valid_y, valid_pred)
lr_auc = auc(fpr_lr, tpr_lr)
print('auc of lr: ', lr_auc)

lr_pred = lr.predict_proba(test)

auc of lr:  0.5303030303030303


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
# XGBoost
XGB = xgb.XGBClassifier(max_depth=8, n_estimators=1000, min_child_weight=300, eta=0.3, seed=66)
XGB.fit(train_x, train_y, eval_metric='auc', eval_set=[(train_x, train_y), (valid_x, valid_y)], verbose=True, early_stopping_rounds=50)
xgb_pred = XGB.predict_proba(test)

[0]	validation_0-auc:0.5	validation_1-auc:0.5
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.5	validation_1-auc:0.5
[2]	validation_0-auc:0.5	validation_1-auc:0.5
[3]	validation_0-auc:0.5	validation_1-auc:0.5
[4]	validation_0-auc:0.5	validation_1-auc:0.5
[5]	validation_0-auc:0.5	validation_1-auc:0.5
[6]	validation_0-auc:0.5	validation_1-auc:0.5
[7]	validation_0-auc:0.5	validation_1-auc:0.5
[8]	validation_0-auc:0.5	validation_1-auc:0.5
[9]	validation_0-auc:0.5	validation_1-auc:0.5
[10]	validation_0-auc:0.5	validation_1-auc:0.5
[11]	validation_0-auc:0.5	validation_1-auc:0.5
[12]	validation_0-auc:0.5	validation_1-auc:0.5
[13]	validation_0-auc:0.5	validation_1-auc:0.5
[14]	validation_0-auc:0.5	validation_1-auc:0.5
[15]	validation_0-auc:0.5	validation_1-auc:0.5
[16]	validation_0-auc:0.5	validation_1-auc:0.5
[17]	validation_0-auc:0.5	validation_1-auc:0.5
[18]	va

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [14]:
# LightGBM
params = {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'max_depth': 15, 'feature_fraction': 0.8, 'bagging_fraction': 0.9,
         'bagging_freq': 8, 'lambda_l1': 0.6, 'lambda_l2': 0}
trn_data = lgb.Dataset(train_x, label=train_y)
val_data = lgb.Dataset(valid_x, label=valid_y)
LGB = lgb.train(params, trn_data, valid_sets=[trn_data, val_data], num_boost_round=10000, early_stopping_rounds=200, verbose_eval=25)
lgb_pred = LGB.predict(test)

Training until validation scores don't improve for 200 rounds
[25]	training's auc: 0.933581	valid_1's auc: 0.719361
[50]	training's auc: 0.950236	valid_1's auc: 0.701448
[75]	training's auc: 0.963016	valid_1's auc: 0.697268
[100]	training's auc: 0.969534	valid_1's auc: 0.69279
[125]	training's auc: 0.974481	valid_1's auc: 0.693536
[150]	training's auc: 0.980703	valid_1's auc: 0.68861
[175]	training's auc: 0.984878	valid_1's auc: 0.69682
[200]	training's auc: 0.988091	valid_1's auc: 0.691894
Early stopping, best iteration is:
[24]	training's auc: 0.931009	valid_1's auc: 0.721003


In [47]:
# CatBoost
cat_features = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'Gender_Female', 'Gender_Male', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 
               'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes', 'Education', 'JobLevel', 'StockOptionLevel']
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
pred_skf = []
for train_index, valid_index in skf.split(train, train_label):
    CAT = CatBoostClassifier(iterations=1500, depth=7, learning_rate=0.01, loss_function='Logloss', eval_metric='AUC', bagging_temperature=0.83, rsm=0.78, 
                             od_type='Iter', od_wait=500, metric_period=100, l2_leaf_reg=5, thread_count=20, random_seed=42, logging_level='Verbose')
    CAT.fit(train.iloc[train_index, :], train_label.iloc[train_index], eval_set=(train.iloc[valid_index, :], train_label.iloc[valid_index]), cat_features=cat_features)
    cat_pred = CAT.predict_proba(test)
    pred_skf.append(cat_pred[:, 1])
pred = np.zeros(len(test))
for i in range(3):
    pred += np.log(pred_skf[i] / (1 - pred_skf[i]))
result = 1 / (1 + np.exp(-pred/3))




0:	test: 0.7031769	best: 0.7031769 (0)	total: 18.3ms	remaining: 27.5s
100:	test: 0.7950635	best: 0.7950635 (100)	total: 1.29s	remaining: 17.9s
200:	test: 0.7996090	best: 0.8007820 (190)	total: 2.84s	remaining: 18.4s
300:	test: 0.8053763	best: 0.8061095 (268)	total: 4.4s	remaining: 17.5s
400:	test: 0.8053275	best: 0.8061095 (268)	total: 5.87s	remaining: 16.1s
500:	test: 0.8068426	best: 0.8084066 (486)	total: 7.42s	remaining: 14.8s
600:	test: 0.8080645	best: 0.8086999 (520)	total: 9.02s	remaining: 13.5s
700:	test: 0.8092375	best: 0.8103617 (676)	total: 10.6s	remaining: 12.1s
800:	test: 0.8087488	best: 0.8104594 (738)	total: 12.3s	remaining: 10.7s
900:	test: 0.8087977	best: 0.8104594 (738)	total: 14.1s	remaining: 9.39s
1000:	test: 0.8057674	best: 0.8104594 (738)	total: 15.9s	remaining: 7.94s
1100:	test: 0.8057674	best: 0.8104594 (738)	total: 17.7s	remaining: 6.43s
1200:	test: 0.8068915	best: 0.8104594 (738)	total: 19.6s	remaining: 4.87s
Stopped by overfitting detector  (500 iterations wai



100:	test: 0.8155063	best: 0.8155063 (100)	total: 1.4s	remaining: 19.4s
200:	test: 0.8310899	best: 0.8318136 (196)	total: 2.96s	remaining: 19.1s
300:	test: 0.8386163	best: 0.8386645 (292)	total: 4.67s	remaining: 18.6s
400:	test: 0.8408356	best: 0.8414146 (398)	total: 6.34s	remaining: 17.4s
500:	test: 0.8437786	best: 0.8441164 (478)	total: 7.86s	remaining: 15.7s
600:	test: 0.8476866	best: 0.8480243 (591)	total: 9.43s	remaining: 14.1s
700:	test: 0.8484585	best: 0.8491340 (612)	total: 11.1s	remaining: 12.7s
800:	test: 0.8513051	best: 0.8521252 (797)	total: 12.9s	remaining: 11.2s
900:	test: 0.8526077	best: 0.8527042 (896)	total: 14.6s	remaining: 9.73s
1000:	test: 0.8505331	best: 0.8531384 (907)	total: 16.5s	remaining: 8.21s
1100:	test: 0.8495199	best: 0.8531384 (907)	total: 18.3s	remaining: 6.63s
1200:	test: 0.8490857	best: 0.8531384 (907)	total: 20.2s	remaining: 5.02s
1300:	test: 0.8499059	best: 0.8531384 (907)	total: 22s	remaining: 3.36s
1400:	test: 0.8505814	best: 0.8531384 (907)	total:



0:	test: 0.6871231	best: 0.6871231 (0)	total: 16.8ms	remaining: 25.2s
100:	test: 0.8339847	best: 0.8351908 (91)	total: 1.37s	remaining: 18.9s
200:	test: 0.8331645	best: 0.8361557 (114)	total: 2.91s	remaining: 18.8s
300:	test: 0.8332127	best: 0.8365417 (251)	total: 4.56s	remaining: 18.2s
400:	test: 0.8360110	best: 0.8365417 (251)	total: 6.3s	remaining: 17.3s
500:	test: 0.8361075	best: 0.8365417 (251)	total: 8.05s	remaining: 16s
600:	test: 0.8352873	best: 0.8374584 (514)	total: 9.73s	remaining: 14.6s
700:	test: 0.8334539	best: 0.8374584 (514)	total: 11.4s	remaining: 13s
800:	test: 0.8344671	best: 0.8374584 (514)	total: 13.1s	remaining: 11.4s
900:	test: 0.8333092	best: 0.8374584 (514)	total: 14.7s	remaining: 9.81s
1000:	test: 0.8324890	best: 0.8374584 (514)	total: 16.5s	remaining: 8.23s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8374583876
bestIteration = 514

Shrink model to first 515 iterations.


In [48]:
submission = pd.DataFrame(test.reset_index()['user_id'])
submission['Attrition'] = pd.Series(result)

In [49]:
submission.to_csv('prediction.csv', index=False)

Unnamed: 0,user_id,Attrition
0,442,0.120881
1,1091,0.092034
2,981,0.170785
3,785,0.128829
4,1332,0.608001
...,...,...
289,1439,0.101513
290,481,0.189513
291,124,0.256852
292,198,0.092692
