In [252]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [253]:
train_data = pd.read_csv('./train.csv') #导出训练集数据
train_data.head()

Unnamed: 0,user_id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1374,58,No,Travel_Rarely,605,Sales,21,3,Life Sciences,1,...,3,80,1,29,2,2,1,0,0,0
1,1092,45,No,Travel_Rarely,950,Research & Development,28,3,Technical Degree,1,...,4,80,1,8,3,3,5,4,0,3
2,768,40,No,Travel_Rarely,300,Sales,26,3,Marketing,1,...,2,80,1,8,3,2,7,7,7,5
3,569,36,No,Non-Travel,1434,Sales,8,4,Life Sciences,1,...,2,80,0,10,1,3,10,7,0,9
4,911,25,Yes,Travel_Frequently,599,Sales,24,1,Life Sciences,1,...,4,80,0,1,4,3,1,0,1,0


In [254]:
test_data = pd.read_csv('./test.csv') #导出测试集数据
test_data.head()

Unnamed: 0,user_id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,442,36,Non-Travel,635,Sales,10,4,Medical,1,592,...,4,80,0,10,3,2,10,3,9,7
1,1091,33,Travel_Rarely,575,Research & Development,25,3,Life Sciences,1,1545,...,4,80,0,5,2,3,5,3,0,2
2,981,35,Travel_Frequently,662,Sales,18,4,Marketing,1,1380,...,3,80,1,5,0,2,4,2,3,2
3,785,40,Travel_Rarely,1492,Research & Development,20,4,Technical Degree,1,1092,...,4,80,1,14,6,3,11,10,11,1
4,1332,29,Travel_Frequently,459,Research & Development,24,2,Life Sciences,1,1868,...,2,80,0,1,3,2,1,0,1,0


In [255]:
train_data.isnull().sum() #检查训练集是否存在空值

user_id                     0
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCur

In [256]:
test_data.isnull().sum() #检查测试集是否存在空值

user_id                     0
Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [257]:
#去除‘id’，‘StandardHours’,'EmployeeNumber'这些对结果影响不大的特征
train_data = train_data.drop(columns = ['user_id','StandardHours','EmployeeNumber'])
test_data = test_data.drop(columns = ['user_id','StandardHours','EmployeeNumber'])

In [258]:
#利用Label_encoder将data中文本特征转化成编号，便于计算机理解
le = LabelEncoder()
feature_categories = ['Department','EducationField','Gender','JobRole','MaritalStatus','OverTime','BusinessTravel','Over18']
for i in feature_categories:
    train_data[i] = le.fit_transform(train_data[i])
    #如果在训练集进行了fit_transform，那么在测试集就不许进行fit;如果对测试集进行了fit，训练集和测试集的le标准就不一样了
    test_data[i] = le.transform(test_data[i])

In [259]:
train_data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,58,No,2,605,2,21,3,1,1,4,...,3,3,1,29,2,2,1,0,0,0
1,45,No,2,950,1,28,3,5,1,4,...,4,4,1,8,3,3,5,4,0,3
2,40,No,2,300,2,26,3,2,1,3,...,3,2,1,8,3,2,7,7,7,5
3,36,No,0,1434,2,8,4,1,1,1,...,3,2,0,10,1,3,10,7,0,9
4,25,Yes,1,599,2,24,1,1,1,3,...,3,4,0,1,4,3,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,34,No,2,1333,2,10,4,1,1,3,...,3,4,1,1,2,3,1,1,0,0
1172,35,No,2,528,0,8,4,5,1,3,...,3,2,0,6,2,1,5,4,1,4
1173,43,No,2,1179,2,2,3,3,1,4,...,3,1,1,10,3,3,10,9,8,8
1174,38,No,2,268,1,2,5,3,1,4,...,3,2,1,6,0,1,1,0,0,1


In [260]:
#将data中的'Attrition'作为标签单独提取出来
train_data_label = train_data['Attrition']
train_data = train_data.drop(columns = ['Attrition'])

In [261]:
#将数据做标准化处理
stander = StandardScaler()
train_data = stander.fit_transform(train_data)
train_data

array([[ 2.3389367 ,  0.595307  , -0.48557354, ..., -1.15684058,
        -0.67381506, -1.15043886],
       [ 0.9043263 ,  0.595307  ,  0.36465323, ..., -0.05481991,
        -0.67381506, -0.3083749 ],
       [ 0.35255307,  0.595307  , -1.23722329, ...,  0.7716956 ,
         1.50912298,  0.25300108],
       ...,
       [ 0.68361701,  0.595307  ,  0.92900666, ...,  1.32270594,
         1.82097127,  1.09506504],
       [ 0.13184377,  0.595307  , -1.31608491, ..., -1.15684058,
        -0.67381506, -0.86975087],
       [ 0.35255307,  0.595307  , -0.35495899, ..., -1.15684058,
        -0.67381506, -1.15043886]])

In [262]:
#将文本类型的标签进行数值转换
train_data_label = train_data_label.apply(lambda x:1 if x=='Yes' else 0)

In [263]:
#划分训练集和验证集
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(train_data,train_data_label,test_size = 0.3,random_state = 2021)

In [264]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [265]:
#将逻辑回归作为第一个分类器
clf_1 = LogisticRegression(max_iter = 100,verbose = True,random_state=2021,tol = 1e-4)
#利用训练集训练分类器
clf_1.fit(X_train,y_train)
clf_1_y_pred = clf_1.predict(X_val)
clf_1_train_score = clf_1.score(X_train,y_train)
clf_1_val_score = clf_1.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_1_y_pred)
auc_score_clf_1 = auc(fpr,tpr)
print(f'训练集分数为{clf_1_train_score}')
print(f'测试集分数为{clf_1_val_score}')
print(f'AUC分数为{auc_score_clf_1}')

训练集分数为0.8797083839611178
测试集分数为0.8498583569405099
AUC分数为0.5848993288590604


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [266]:
#将SVC作为第二个分类器
clf_2 = SVC(probability=True)
#利用训练集训练分类器
clf_2.fit(X_train,y_train)
clf_2_y_pred = clf_2.predict(X_val)
clf_2_train_score = clf_2.score(X_train,y_train)
clf_2_val_score = clf_2.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_2_y_pred)
auc_score_clf_2 = auc(fpr,tpr)
print(f'训练集分数为{clf_2_train_score}')
print(f'测试集分数为{clf_2_val_score}')
print(f'AUC分数为{auc_score_clf_2}')

训练集分数为0.9088699878493317
测试集分数为0.8583569405099151
AUC分数为0.5454545454545454


In [267]:
#将LinearSVC作为第三个分类器
clf_3 = LinearSVC()
#利用训练集训练分类器
clf_3.fit(X_train,y_train)
clf_3_y_pred = clf_3.predict(X_val)
clf_3_train_score = clf_3.score(X_train,y_train)
clf_3_val_score = clf_3.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_3_y_pred)
auc_score_clf_3 = auc(fpr,tpr)
print(f'训练集分数为{clf_3_train_score}')
print(f'测试集分数为{clf_3_val_score}')
print(f'AUC分数为{auc_score_clf_3}')

训练集分数为0.8809234507897934
测试集分数为0.8385269121813032
AUC分数为0.548535692495424




In [268]:
#将KNeighborsClassifier作为第四个分类器
clf_4 = KNeighborsClassifier()
#利用训练集训练分类器
clf_4.fit(X_train,y_train)
clf_4_y_pred = clf_4.predict(X_val)
clf_4_train_score = clf_4.score(X_train,y_train)
clf_4_val_score = clf_4.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_4_y_pred)
auc_score_clf_4 = auc(fpr,tpr)
print(f'训练集分数为{clf_4_train_score}')
print(f'测试集分数为{clf_4_val_score}')
print(f'AUC分数为{auc_score_clf_4}')

训练集分数为0.8675577156743621
测试集分数为0.8413597733711048
AUC分数为0.5131482611348382


In [272]:
#将XGBoost作为第五个分类器
clf_5 = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=2000, 
            objective='binary:logistic', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='auc', reg_lambda=0.5

)
#利用训练集训练分类器
clf_5.fit(X_train,y_train)
clf_5_y_pred = clf_5.predict(X_val)
clf_5_train_score = clf_5.score(X_train,y_train)
clf_5_val_score = clf_5.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_5_y_pred)
auc_score_clf_5 = auc(fpr,tpr)
print(f'训练集分数为{clf_5_train_score}')
print(f'测试集分数为{clf_5_val_score}')
print(f'AUC分数为{auc_score_clf_5}')

Parameters: { min_child_samples } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


训练集分数为1.0
测试集分数为0.8810198300283286
AUC分数为0.6626601586333131


In [273]:
#将lGBM作为第六个分类器
clf_6 = LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2000, subsample=1, colsample_bytree=1,

)
#利用训练集训练分类器
clf_6.fit(X_train,y_train)
clf_6_y_pred = clf_6.predict(X_val)
clf_6_train_score = clf_6.score(X_train,y_train)
clf_6_val_score = clf_6.score(X_val,y_val)
fpr, tpr, thresholds = roc_curve(y_val, clf_6_y_pred)
auc_score_clf_6 = auc(fpr,tpr)
print(f'训练集分数为{clf_6_train_score}')
print(f'测试集分数为{clf_6_val_score}')
print(f'AUC分数为{auc_score_clf_6}')

训练集分数为1.0
测试集分数为0.8668555240793201
AUC分数为0.6172056131787674


In [243]:
#将测试集数据做标准化化处理
stander = StandardScaler()
test_data = stander.fit_transform(test_data)
test_data

array([[-0.14880903, -2.62109326, -0.43010976, ..., -0.37634728,
         2.0457762 ,  0.77575812],
       [-0.46815103,  0.56980288, -0.58254453, ..., -0.37634728,
        -0.70060829, -0.62003678],
       [-0.25525636, -1.02564519, -0.36151411, ..., -0.65505282,
         0.21485321, -0.62003678],
       ...,
       [-0.68104569,  0.56980288, -1.41077348, ...,  0.73847488,
         1.43546854,  0.77575812],
       [ 0.06408564,  0.56980288,  1.16029307, ..., -1.2124639 ,
        -0.70060829, -1.17835474],
       [ 0.27698031,  0.56980288, -1.10590393, ..., -1.2124639 ,
        -0.70060829, -1.17835474]])

In [244]:
#在测试集用SVC做预测，输出预测的概率
predict_2 = clf_2.predict_proba(test_data)[:,1]
print(predict_2)

[0.2131545  0.23006116 0.31595479 0.1217415  0.74567822 0.19120611
 0.36106101 0.08317021 0.07605271 0.29297368 0.07281704 0.40766655
 0.06129756 0.78970604 0.08384897 0.04697049 0.06027138 0.17213624
 0.06061545 0.12615636 0.36572112 0.14692551 0.03533086 0.03725799
 0.48101119 0.29145645 0.05977086 0.0464426  0.5544433  0.03842807
 0.0754571  0.07846849 0.07654457 0.14450247 0.05730396 0.03526298
 0.1262625  0.10455029 0.05773988 0.05056323 0.24371286 0.02706923
 0.06107766 0.13480499 0.05194501 0.62968839 0.30226884 0.04219584
 0.88889059 0.19699408 0.1572401  0.3089518  0.18213181 0.11153079
 0.75349808 0.1928475  0.06113235 0.22854475 0.1142617  0.20153108
 0.03707989 0.17027934 0.12411224 0.06873168 0.3110464  0.030866
 0.18461622 0.09012426 0.10330127 0.14690663 0.12611582 0.21451607
 0.05556244 0.05919881 0.08045687 0.09278171 0.10076361 0.04873993
 0.2196715  0.05135668 0.04938244 0.0797347  0.19042057 0.13469785
 0.10067119 0.08328402 0.20905943 0.1263189  0.09098777 0.099010

In [158]:
#将SVC预测的结果导出CSV格式
test_pred = pd.DataFrame(predict_2,columns = ['Attrition'])
test_pred.to_csv('./test_pred.csv')

In [221]:
#在测试集用Logictic Regression做预测，输出预测的概率
predict_1 = clf_1.predict_proba(test_data)[:,1]
print(predict_1)

[0.00000000e+000 1.00000000e+000 1.00000000e+000 4.02308969e-285
 1.00000000e+000 1.00000000e+000 6.08971743e-284 7.31919842e-280
 1.00000000e+000 0.00000000e+000 0.00000000e+000 1.00000000e+000
 4.67989366e-165 6.27975185e-159 0.00000000e+000 1.08980628e-105
 5.39401816e-097 3.43682225e-147 0.00000000e+000 1.00000000e+000
 4.01776418e-075 0.00000000e+000 6.56364796e-288 1.00000000e+000
 1.00000000e+000 1.00000000e+000 1.00000000e+000 0.00000000e+000
 1.00000000e+000 2.71990257e-285 0.00000000e+000 0.00000000e+000
 0.00000000e+000 0.00000000e+000 1.42998422e-133 0.00000000e+000
 0.00000000e+000 1.00000000e+000 0.00000000e+000 2.23715267e-194
 2.45102271e-238 4.10488636e-215 0.00000000e+000 0.00000000e+000
 0.00000000e+000 1.00000000e+000 1.00000000e+000 0.00000000e+000
 1.74411947e-025 2.55615696e-208 0.00000000e+000 1.00000000e+000
 2.79311465e-091 0.00000000e+000 1.00000000e+000 0.00000000e+000
 5.68561946e-289 1.00000000e+000 1.00000000e+000 2.55779780e-222
 3.21411267e-233 0.000000

In [211]:
#将Logistic Regression预测的结果导出CSV格式
test_pred_lr = pd.DataFrame(predict_1,columns = ['Attrition'])
test_pred_lr.to_csv('./test_pred_lr.csv')

In [245]:
#在测试集用XGB做预测，输出预测的概率
predict_5 = clf_5.predict_proba(test_data)[:,1]
print(predict_5)
# res = clf_5.predict(test_data)
# print(res)

[4.18214910e-02 3.89018038e-04 2.32681304e-01 2.04823483e-02
 9.65336144e-01 1.74189180e-01 1.31307930e-01 1.18073057e-02
 8.10163037e-04 8.05970374e-03 6.36781449e-04 1.17155248e-02
 1.40210171e-03 9.49957669e-01 7.25080958e-03 8.53640886e-05
 1.11802295e-02 1.51277738e-04 6.39563194e-03 4.76558656e-02
 2.14357063e-01 2.26026541e-03 2.11656574e-04 2.70938559e-04
 1.85569003e-01 3.77149917e-02 5.44075330e-04 6.67338958e-04
 9.80716288e-01 2.63154507e-04 1.58261554e-03 6.49965776e-04
 1.20029114e-01 2.78437901e-02 1.79994022e-04 4.10430226e-03
 1.86769906e-02 7.69690785e-04 4.52056061e-03 3.07645486e-03
 1.54135935e-03 2.06536800e-03 1.32816684e-04 3.27868224e-03
 3.48662101e-02 9.70043242e-01 2.52769664e-02 6.57006021e-05
 9.43472862e-01 1.57398030e-01 1.01594552e-01 9.27865803e-02
 2.65402850e-02 2.44393549e-03 7.61493683e-01 6.53873198e-04
 8.72326491e-05 3.91023559e-03 5.16806867e-05 2.07242101e-01
 5.85816277e-04 7.87649155e-02 2.77941639e-04 1.06620567e-03
 4.06154186e-01 1.287732

In [249]:
#将XGB预测的结果导出CSV格式
test_pred_XGB = pd.DataFrame(predict_5,columns = ['Attrition'])
test_pred_XGB.to_csv('./test_pred_XGB.csv')

In [250]:
#在测试集用LGB做预测，输出预测的概率
predict_6 = clf_6.predict_proba(test_data)[:,1]
print(predict_6)

[1.19099826e-02 6.07944970e-03 6.27736888e-02 9.35751444e-02
 8.09072314e-01 3.15830642e-01 9.74311390e-02 1.63859780e-02
 1.04989486e-02 1.20298496e-02 8.15151377e-03 2.99473019e-02
 8.59116359e-03 9.26728783e-01 2.19623082e-02 6.36893948e-04
 3.46103577e-03 3.69317688e-03 6.72452774e-03 4.77236291e-02
 6.89612678e-01 1.75451992e-02 5.60937321e-03 1.51507233e-03
 1.86145889e-01 2.79921653e-01 6.98405658e-03 9.90193896e-04
 9.24342512e-01 3.35725161e-03 4.88271387e-03 3.62795872e-03
 1.88223987e-01 3.40379144e-03 4.07203397e-03 1.83700600e-02
 3.38208789e-02 8.00764805e-03 7.54163121e-03 1.28189421e-02
 1.55059612e-02 5.90089067e-03 1.42166666e-03 8.09266783e-02
 4.71224076e-02 9.22223128e-01 1.03372584e-01 4.46682596e-03
 8.82609387e-01 6.95254561e-01 1.88415131e-01 3.07135465e-01
 4.59728873e-02 1.00225689e-02 2.98633812e-01 5.67516371e-03
 2.00392521e-03 1.52036757e-02 1.70857257e-03 7.88531180e-01
 1.71044133e-03 6.85948777e-02 6.06801161e-03 1.07547760e-02
 1.62916049e-01 1.385156

In [251]:
#将LGB预测的结果导出CSV格式
test_pred_LGB = pd.DataFrame(predict_6,columns = ['Attrition'])
test_pred_LGB.to_csv('./test_pred_LGB.csv')