In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']

test['Attrition'] = -1
test = test[train.columns]
data = pd.concat([train, test])

data = data.drop(['EmployeeNumber', 'EmployeeCount', 'StandardHours'], axis=1)

train = data[data['Attrition'] != -1]
test = data[data['Attrition'] == -1]
test = test.drop('Attrition', axis=1)
train['Attrition'] = train['Attrition'].map(lambda x: 1 if x=='Yes' else 0) 
train = train.drop('user_id', axis=1)
from sklearn.preprocessing import LabelEncoder
lbe_list = []
for col in cols:
    lbe = LabelEncoder()
    train[col] = lbe.fit_transform(train[col])
    test[col] = lbe.transform(test[col])
    lbe_list.append(lbe)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split

def stacking(model, train_data, train_target, test_data, n_fold):
    """
    :param model:  模型算法
    :param train_data:  训练集(不含带预测的目标特征)
    :param train_target:  需要预测的目标特征
    :param test_data:   测试集
    :param n_fold:   交叉验证的折数
    :return:
    """
    skf = StratifiedKFold(n_splits=n_fold, random_state=1)  # StratifiedKFold 默认分层采样
    train_pred = np.zeros((train_data.shape[0], 1), int)   # 存储训练集预测结果
    test_pred = np.zeros((test_data.shape[0], 1), int)  # 存储测试集预测结果 行数：len(test_data) ,列数：1列
    for skf_index, (train_index, val_index) in enumerate(skf.split(train_data, train_target)):
        print('第 ', skf_index+1, ' 折交叉验证开始... ')
        # 训练集划分
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[val_index]
        y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]
        # 模型构建
        y_train = np.ravel(y_train)   # 向量转成数组
        model.fit(X=x_train, y=y_train)
        # 模型预测
        accs = accuracy_score(y_val, model.predict(x_val))
        print('第 ', skf_index+1, ' 折交叉验证 :  accuracy ： ', accs)
 
        # 训练集预测结果
        val_pred = model.predict_proba(x_val)[:, 1]
        for i in range(len(val_index)):
            train_pred[val_index[i]] = val_pred[i]
        # 保存测试集预测结果
        test_pred = np.column_stack((test_pred, model.predict_proba(test_data)[:, 1]))  # 将矩阵按列合并
 
    test_pred_mean = np.mean(test_pred, axis=1)  # 按行计算均值(会出现小数)
    test_pred_mean = pd.DataFrame(test_pred_mean)   # 转成DataFrame
#     test_pred_mean = test_pred_mean.apply(lambda x: round(x))  # 小数需要四舍五入成整数
    return np.ravel(test_pred_mean), train_pred


In [6]:

 
import lightgbm as lgb


# 三个初级学习器进行初级训练
# 随机森林算法进行训练
rf = RandomForestClassifier(n_jobs=-1, max_depth=100, n_estimators=800)
print('==============================随机森林模型==============================')
rf_test_pred, rf_train_pred = stacking(model=rf, train_data=train.drop('Attrition', axis=1), train_target=train['Attrition'], test_data=test.drop('user_id', axis=1), n_fold=5)
rf_test_pred = pd.DataFrame(rf_test_pred)
rf_train_pred = pd.DataFrame(rf_train_pred)

# 用决策树算法进行训练
dt = DecisionTreeClassifier(random_state=1)
print('==============================决策树模型==============================')
dt_test_pred, dt_train_pred = stacking(model=dt, train_data=train.drop('Attrition', axis=1), train_target=train['Attrition'], test_data=test.drop('user_id', axis=1), n_fold=5)
dt_test_pred = pd.DataFrame(dt_test_pred)
dt_train_pred = pd.DataFrame(dt_train_pred)

# 用K近邻算法进行训练
knn = KNeighborsClassifier()
print('==============================K近邻模型==============================')
knn_test_pred, knn_train_pred = stacking(model=knn, train_data=train.drop('Attrition', axis=1), train_target=train['Attrition'], test_data=test.drop('user_id', axis=1), n_fold=5)
knn_test_pred = pd.DataFrame(knn_test_pred)
knn_train_pred = pd.DataFrame(knn_train_pred)

# rf_train_pred,dt_train_pred,knn_train_pred 合并生成次级训练集 train_set
# rf_test_pred,dt_test_pred,knn_test_pred 合并生成次级测试集集 test_set
train_set = pd.concat([rf_train_pred, dt_train_pred, knn_train_pred],  axis=1)
test_set = pd.concat([rf_test_pred, dt_test_pred, knn_test_pred],  axis=1)



第  1  折交叉验证开始... 




第  1  折交叉验证 :  accuracy ：  0.8771186440677966
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.8680851063829788
第  3  折交叉验证开始... 
第  3  折交叉验证 :  accuracy ：  0.8425531914893617
第  4  折交叉验证开始... 
第  4  折交叉验证 :  accuracy ：  0.8638297872340426
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.8425531914893617
第  1  折交叉验证开始... 
第  1  折交叉验证 :  accuracy ：  0.7796610169491526
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.7829787234042553
第  3  折交叉验证开始... 
第  3  折交叉验证 :  accuracy ：  0.7531914893617021
第  4  折交叉验证开始... 
第  4  折交叉验证 :  accuracy ：  0.8085106382978723
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.7531914893617021
第  1  折交叉验证开始... 
第  1  折交叉验证 :  accuracy ：  0.8177966101694916
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.825531914893617
第  3  折交叉验证开始... 
第  3  折交叉验证 :  accuracy ：  0.825531914893617
第  4  折交叉验证开始... 




第  4  折交叉验证 :  accuracy ：  0.8468085106382979
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.8127659574468085


In [9]:
test_pred2

array([0.30731849, 0.11206836, 0.11206836, 0.29342004, 0.76154912,
       0.30731849, 0.23344356, 0.11206836, 0.23344356, 0.11206836,
       0.11206836, 0.11206836, 0.11206836, 0.76154912, 0.23344356,
       0.30731849, 0.23344356, 0.23344356, 0.23344356, 0.11206836,
       0.69237407, 0.11206836, 0.11206836, 0.11206836, 0.23344356,
       0.23344356, 0.11206836, 0.11206836, 0.76154912, 0.11206836,
       0.11206836, 0.11206836, 0.23344356, 0.23344356, 0.11206836,
       0.11206836, 0.11206836, 0.11206836, 0.11206836, 0.11206836,
       0.11206836, 0.11206836, 0.11206836, 0.23344356, 0.23344356,
       0.23344356, 0.29342004, 0.11206836, 0.69237407, 0.76154912,
       0.11206836, 0.23344356, 0.11206836, 0.23344356, 0.29342004,
       0.11206836, 0.11206836, 0.11206836, 0.11206836, 0.23344356,
       0.11206836, 0.11206836, 0.11206836, 0.23344356, 0.23344356,
       0.11206836, 0.11206836, 0.11206836, 0.11206836, 0.23344356,
       0.11206836, 0.23344356, 0.23344356, 0.23344356, 0.11206

In [7]:
param = {'boosting_type':'gbdt',
                         'objective' : 'binary', #
                         #'metric' : 'binary_logloss',
                         'metric' : 'auc',
#                          'metric' : 'self_metric',
                         'learning_rate' : 0.01,
                         'max_depth' : 15,
                         'feature_fraction':0.8,
                         'bagging_fraction': 0.9,
                         'bagging_freq': 8,
                         'lambda_l1': 0.6,
                         'lambda_l2': 0,
#                          'scale_pos_weight':k,
#                         'is_unbalance':True
        }


# lightgbm作为次级学习器进行训练
# lgb = lgb.LGBMClassifier(
#     objective='multiclass',
#     boosting_type='gbdt',
#     n_estimators=2000,
#     subsample=0.8,
#     learning_rate=0.1,
#     n_jobs=4,
#     reg_alpha=0.1,
#     reg_lambda=0.1,
#     num_leaves=55
# )
# y_train = np.ravel(train['Attrition'])  # 向量转成数组
clf = lgb.LGBMClassifier(is_unbalanced = False, silent=False)
clf.fit(train_set, train['Attrition'])
test_pred = clf.predict_proba(test_set)[:, 1]

In [8]:
test_pred2 = clf.predict_proba(test_set)[:, 1]

In [19]:
submission = pd.DataFrame({'user_id': test['user_id'], 'Attrition': rf_test})
submission.to_csv("submission_random_forest.csv", index=False, sep=',', columns=['user_id', 'Attrition'])

In [9]:
test_pred

array([0.13061303, 0.13061303, 0.13061303, 0.30611866, 0.30611866,
       0.13061303, 0.30611866, 0.13061303, 0.30611866, 0.13061303,
       0.13061303, 0.13061303, 0.13061303, 0.30611866, 0.30611866,
       0.13061303, 0.30611866, 0.30611866, 0.30611866, 0.13061303,
       0.30611866, 0.13061303, 0.13061303, 0.13061303, 0.30611866,
       0.30611866, 0.13061303, 0.13061303, 0.30611866, 0.13061303,
       0.13061303, 0.13061303, 0.30611866, 0.30611866, 0.13061303,
       0.13061303, 0.13061303, 0.13061303, 0.13061303, 0.13061303,
       0.13061303, 0.13061303, 0.13061303, 0.30611866, 0.30611866,
       0.30611866, 0.30611866, 0.13061303, 0.30611866, 0.30611866,
       0.13061303, 0.30611866, 0.13061303, 0.30611866, 0.30611866,
       0.13061303, 0.13061303, 0.13061303, 0.13061303, 0.30611866,
       0.13061303, 0.13061303, 0.13061303, 0.30611866, 0.30611866,
       0.13061303, 0.13061303, 0.13061303, 0.13061303, 0.30611866,
       0.13061303, 0.30611866, 0.30611866, 0.30611866, 0.13061

In [12]:
rf_test = np.array(rf_test_pred)


In [18]:
rf_test = rf_test.reshape((-1,))

In [14]:
test_pred.shape

(3, 588)

In [15]:
train_set.shape

(1176, 3)

In [16]:
y_train.shape

(1176,)

In [20]:
rf_test_pred

Unnamed: 0,0
0,0.454545
1,0.454545
2,0.454545
3,0.454545
4,0.454545
...,...
289,0.454545
290,0.454545
291,0.454545
292,0.454545


In [7]:
test_set

Unnamed: 0,0,0.1,0.2
0,0.000000,0.000000,0.333333
1,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000
3,0.000000,0.166667,0.333333
4,0.833333,0.833333,0.833333
...,...,...,...
289,0.000000,0.000000,0.000000
290,0.000000,0.000000,0.000000
291,0.000000,0.000000,0.000000
292,0.000000,0.000000,0.000000


In [19]:
train_set

Unnamed: 0,0,0.1,0.2
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,1,1,0
...,...,...,...
1171,0,0,1
1172,0,1,0
1173,0,0,0
1174,0,1,0
