In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split,cross_val_score
import seaborn
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,chi2
warnings.filterwarnings('ignore')

data_train=pd.read_csv('C:/Users/zhouwei/Desktop/离职率预测/train.csv')
data_test=pd.read_csv('C:/Users/zhouwei/Desktop/离职率预测/test.csv')
#查看是否有缺失值
data_train.info()
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   user_id                   1176 non-null   int64 
 1   Age                       1176 non-null   int64 
 2   Attrition                 1176 non-null   object
 3   BusinessTravel            1176 non-null   object
 4   DailyRate                 1176 non-null   int64 
 5   Department                1176 non-null   object
 6   DistanceFromHome          1176 non-null   int64 
 7   Education                 1176 non-null   int64 
 8   EducationField            1176 non-null   object
 9   EmployeeCount             1176 non-null   int64 
 10  EmployeeNumber            1176 non-null   int64 
 11  EnvironmentSatisfaction   1176 non-null   int64 
 12  Gender                    1176 non-null   object
 13  HourlyRate                1176 non-null   int64 
 14  JobInvolvement          

统计出离职与部门与部门,education,title之间的关系，从分析中得出human resource部门的离职率最高，从EducationField来分析可以得出，human resourece的离职率最高，其次是Technical degree,从title role 上来分析，可以得出，sales representative 离职率最高，所以在进行文本到数值转换时，为了避免Labelencoder 进行数值转换而导致层级关系出现（比如：human resource：1，research：2，sale：3），对这三个特征进行的转换的时候使用onehotencoder，离职率最高的部门，岗位和education 设为1，其它都为0（此方法在第二次试验中尝试，当前全部使用labelencoder）

In [2]:
lis=['Department','EducationField','JobRole']
for i in lis:
    print(data_train.groupby(i)['Attrition'].value_counts()/data_train.groupby(i)['Attrition'].count())

Department              Attrition
Human Resources         No           0.792453
                        Yes          0.207547
Research & Development  No           0.863990
                        Yes          0.136010
Sales                   No           0.794872
                        Yes          0.205128
Name: Attrition, dtype: float64
EducationField    Attrition
Human Resources   No           0.695652
                  Yes          0.304348
Life Sciences     No           0.861224
                  Yes          0.138776
Marketing         No           0.766129
                  Yes          0.233871
Medical           No           0.873656
                  Yes          0.126344
Other             No           0.833333
                  Yes          0.166667
Technical Degree  No           0.747664
                  Yes          0.252336
Name: Attrition, dtype: float64
JobRole                    Attrition
Healthcare Representative  No           0.918367
                           Yes  

In [3]:
#处理非数值型的数据
model=LabelEncoder()
columns_trian=data_train.select_dtypes(exclude='int64').columns.tolist()
columns_test=data_test.select_dtypes(exclude='int64').columns.tolist()
data_train[columns_trian]=data_train[columns_trian].apply(model.fit_transform)
data_test[columns_test]=data_test[columns_test].apply(model.fit_transform)

对Age，DailyRate,MonthlyIncome,MonthlyRate,EmployeeNumber进行分组处理，使用pandas内部qcut将这几个特征的数据平均分成5个组。

In [4]:
features=['Age','DailyRate','MonthlyIncome','MonthlyRate','EmployeeNumber']
for feature in features:
    data_train[feature]=pd.qcut(data_train[feature],5,labels=[1,2,3,4,5])
    data_test[feature]=pd.qcut(data_test[feature],5,labels=[1,2,3,4,5])


In [5]:
#选取10个最相关的特征进行拟合（此方法比较简单不易于提高模型的泛化能力，第二次模型进行其他特征选择的方法）
model=SelectKBest(chi2,k=10)
#进行数据切割
Y=data_train.loc[:,'Attrition']
X=data_train.drop(columns=['user_id','Attrition'],axis=1)
model=model.fit(X,Y)
x_new=model.transform(X)
x_train,x_test,y_train,y_test=train_test_split(x_new,Y,train_size=0.8,random_state=000)
#对test集进行更新
data_test_1=data_test.drop(columns='user_id',axis=1)
new_data_test=model.transform(data_test_1)

In [7]:
#进行模型选择,使用交叉验证法进行选择出最适合的模型
models=[LogisticRegression(),DecisionTreeClassifier(),SVC(),RandomForestClassifier(),AdaBoostClassifier()]
score=dict()
name=['LR','DC','SVC','RC','AC']
for model,label in zip(models,name):
    result=cross_val_score(estimator=model,X=x_train,y=y_train,cv=10,scoring='accuracy')
    print('准确率结果:{}'.format(np.mean(result)),label)  

准确率结果:0.8531914893617021 LR
准确率结果:0.7734042553191489 DC
准确率结果:0.8361702127659575 SVC
准确率结果:0.8468085106382978 RC
准确率结果:0.85 AC


通过上述交叉验证法可以发现AdaBoostClassfier函数预测的准确性较高,进行模型模拟，并进行超参数调整。

In [8]:
param_grid={
    'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    'n_estimators':[i for i in range(0,100,10)]
}
model=GridSearchCV(estimator=AdaBoostClassifier(random_state=000),cv=10,param_grid=param_grid,scoring='roc_auc',n_jobs=-1)

In [10]:
total=model.fit(x_train,y_train)
x_predict=new_data_test
y_predict=total.predict_proba(x_predict)[:,1]
submission=pd.DataFrame({'id':data_test.user_id,'Attrition':y_predict})