In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
def stacking(model, train_data, train_target, test_data, n_fold):
    """
    :param model:  模型算法
    :param train_data:  训练集(不含带预测的目标特征)
    :param train_target:  需要预测的目标特征
    :param test_data:   测试集
    :param n_fold:   交叉验证的折数
    :return:
    """
    skf = StratifiedKFold(n_splits=n_fold, random_state=1)  # StratifiedKFold 默认分层采样
    train_pred = np.zeros((train_data.shape[0], 1), int)   # 存储训练集预测结果
    test_pred = np.zeros((test_data.shape[0], 1), int)  # 存储测试集预测结果 行数：len(test_data) ,列数：1列
    for skf_index, (train_index, val_index) in enumerate(skf.split(train_data, train_target)):
        print('第 ', skf_index+1, ' 折交叉验证开始... ')
        # 训练集划分
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[val_index]
        y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]
        # 模型构建
        y_train = np.ravel(y_train)   # 向量转成数组
        model.fit(X=x_train, y=y_train)
        # 模型预测
        accs = accuracy_score(y_val, model.predict(x_val))
        print('第 ', skf_index+1, ' 折交叉验证 :  accuracy ： ', accs)
 
        # 训练集预测结果
        val_pred = model.predict_proba(x_val)[:, 1]
        for i in range(len(val_index)):
            train_pred[val_index[i]] = val_pred[i]
        # 保存测试集预测结果
        test_pred = np.column_stack((test_pred, model.predict_proba(test_data)[:, 1]))  # 将矩阵按列合并
 
    test_pred_mean = np.mean(test_pred, axis=1)  # 按行计算均值(会出现小数)
    test_pred_mean = pd.DataFrame(test_pred_mean)   # 转成DataFrame
#     test_pred_mean = test_pred_mean.apply(lambda x: round(x))  # 小数需要四舍五入成整数
    return np.ravel(test_pred_mean), train_pred

In [4]:


from sklearn.linear_model.logistic import LogisticRegression

train_lr = train.copy(deep=True)
test_lr = test.copy(deep=True)

train_lr['Attrition'] = train_lr['Attrition'].map(lambda x: 1 if x=='Yes' else 0)
test_lr['Attrition'] = -1
test_lr = test_lr[train_lr.columns]
data = pd.concat([train_lr, test_lr])

data = data.drop(['EmployeeNumber', 'EmployeeCount', 'StandardHours'], axis=1)

for attr in cols:
    new_attr = pd.DataFrame()
    new_attr = pd.get_dummies(data[attr],prefix=attr)
    data = pd.concat([data, new_attr], axis=1)
    data = data.drop([attr], axis=1)
train_lr = data[data['Attrition'] != -1]
test_lr = data[data['Attrition'] == -1]
test_lr = test_lr.drop('Attrition', axis=1)

# 归一化数值比较大的向量
Min_max_cols=['TotalWorkingYears', 'Age', 'MonthlyIncome', 'DailyRate', 'DistanceFromHome', 'HourlyRate']
for col in Min_max_cols:
    min_max = MinMaxScaler()
    train_lr[col] = min_max.fit_transform(train_lr[col].values.reshape(-1, 1))
    test_lr[col] = min_max.transform(test_lr[col].values.reshape(-1, 1))

# 采用LR模型
model_lr = LogisticRegression(max_iter=100, verbose=True, random_state=33, tol=1e-4)
print('==============================logistic regression==============================')
lr_test_pred, lr_train_pred = stacking(model=model_lr, train_data=train_lr.drop(['user_id', 'Attrition'], axis=1), train_target=train_lr['Attrition'], test_data=test_lr.drop('user_id', axis=1), n_fold=5)


# model_lr.fit(train_lr.drop(['user_id', 'Attrition'], axis=1), train_lr['Attrition'])

# predict_lr = model_lr.predict_proba(test_lr.drop('user_id', axis=1))[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

第  1  折交叉验证开始... 
第  1  折交叉验证 :  accuracy ：  0.8389830508474576
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.8595744680851064
第  3  折交叉验证开始... 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITE

第  3  折交叉验证 :  accuracy ：  0.8638297872340426
第  4  折交叉验证开始... 
第  4  折交叉验证 :  accuracy ：  0.8425531914893617
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.8680851063829788


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [11]:

import catboost as cb
# catboost的分类
train_cb = train.copy(deep=True)
test_cb = test.copy(deep=True)

train_cb = train_cb.drop(['user_id', 'EmployeeCount', 'EmployeeNumber', 'StandardHours'], axis=1)
test_cb = test_cb.drop(['EmployeeCount', 'EmployeeNumber', 'StandardHours'], axis=1)
train_cb['Attrition'] = train_cb['Attrition'].map(lambda x:1 if x=='Yes' else 0)

x_train, x_valid, y_train, y_valid = train_test_split(train_cb.drop('Attrition', axis=1), train_cb['Attrition'], test_size=0.2, random_state=40)
print(train_cb.columns)
model_cb = cb.CatBoostClassifier(iterations=1000,
                                         depth=7,
                                         learning_rate=0.01,
                                         loss_function='Logloss',
                                         eval_metric='AUC',
                                         logging_level='Verbose',
                                         metric_period=50)



# 得到分类特征的列号
categorical_features_indices = []
for i in range(len(x_train.columns)):
    if x_train.columns.values[i] in cols:
        categorical_features_indices.append(i)
print(categorical_features_indices)

print('==============================catboost==============================')
# cb_test_pred, cb_train_pred = stacking(model=model_cb, train_data=x_train, train_target=y_train, test_data=test_cb.drop('user_id', axis=1), n_fold=5)

model_cb.fit(x_train, y_train, eval_set=(x_valid, y_valid), cat_features=categorical_features_indices)

predict_cb = model_cb.predict_proba(test_cb.drop('user_id', axis=1))[:, 1]


Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
[1, 3, 6, 8, 12, 14, 18, 19]
0:	test: 0.5807292	best: 0.5807292 (0)	total: 111ms	remaining: 1m 51s
50:	test: 0.8094363	best: 0.8094363 (50)	total: 1.06s	remaining: 19.8s
100:	test: 0.8187806	best: 0.8187806 (100)	total: 2.14s	remaining: 19.1s
150:	test: 0.8123468	best: 0.8187806 (100)	total: 3.05s	remaining: 17.1s
200:	test: 0.8121936	best

In [12]:
# GradientBoostingRegressor
# from sklearn.ensemble import GradientBoostingClassifier

train_gb = train.copy(deep=True)
test_gb = test.copy(deep=True)

test_gb['Attrition'] = -1
test_gb = test_gb[train_gb.columns]
data = pd.concat([train_gb, test_gb])

data = data.drop(['EmployeeNumber', 'EmployeeCount', 'StandardHours'], axis=1)

train_gb = data[data['Attrition'] != -1]
test_gb = data[data['Attrition'] == -1]
test_gb = test_gb.drop('Attrition', axis=1)
train_gb['Attrition'] = train_gb['Attrition'].map(lambda x: 1 if x=='Yes' else 0) 
train_gb = train_gb.drop('user_id', axis=1)
from sklearn.preprocessing import LabelEncoder
lbe_list = []
for col in cols:
    lbe = LabelEncoder()
    train_gb[col] = lbe.fit_transform(train_gb[col])
    test_gb[col] = lbe.transform(test_gb[col])
    lbe_list.append(lbe)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split



# 三个初级学习器进行初级训练
# 随机森林算法进行训练
rf = RandomForestClassifier(n_jobs=-1, max_depth=100, n_estimators=800)
print('==============================随机森林模型==============================')
rf_test_pred, rf_train_pred = stacking(model=rf, train_data=train_gb.drop('Attrition', axis=1), train_target=train_gb['Attrition'], test_data=test_gb.drop('user_id', axis=1), n_fold=5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


第  1  折交叉验证开始... 
第  1  折交叉验证 :  accuracy ：  0.8771186440677966
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.8680851063829788
第  3  折交叉验证开始... 
第  3  折交叉验证 :  accuracy ：  0.8468085106382979
第  4  折交叉验证开始... 
第  4  折交叉验证 :  accuracy ：  0.8638297872340426
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.8425531914893617


In [16]:
predict = ( 0.4*lr_test_pred + 0.4*predict_cb + 0.2*rf_test_pred)

submission = pd.DataFrame({'user_id': test['user_id'], 'Attrition': predict})
submission.to_csv("submission_test2.csv", index=False, sep=',', columns=['user_id', 'Attrition'])

In [15]:
train_gb['Attrition'].value_counts()

0    988
1    188
Name: Attrition, dtype: int64

In [4]:
train.columns

Index(['user_id', 'Age', 'Attrition', 'BusinessTravel', 'DailyRate',
       'Department', 'DistanceFromHome', 'Education', 'EducationField',
       'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [17]:
test_gb.columns

Index(['user_id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'Attrition'],
      dtype='object')