## 训练模型
### synthetic data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sksurv.ensemble import RandomSurvivalForest
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_curve, average_precision_score
from sksurv.metrics import concordance_index_censored
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE

# Load the synthetic dataset; the first CSV column is used as the index.
data = pd.read_csv('synthetic_data.csv', index_col=0)

# Integer-encode every string (object-dtype) column in place.
# NOTE: the single `le` instance is refit for each column, so once the loop
# finishes it only remembers the classes of the last column it encoded.
le = LabelEncoder()
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# NOTE(review): 'Years in current role' stays in X although the 1-year target
# below is derived from it -- confirm this target leakage is intended.
X = data.drop(['Label', 'Employee code/number'], axis=1)
y = data['Label']
time = data['Years in current role']

# Binary target: the employee left (Label) with a recorded time of <= 1 year.
y_1year = (time <= 1) & y

# Survival target truncated at the 1-year horizon. A departure recorded after
# the horizon is right-censored at 1.0 rather than counted as an event at 1.0,
# which keeps the event flag consistent with the truncated time.
y_surv = np.array(
    [(bool(y_i) and t_i <= 1, min(t_i, 1.0)) for y_i, t_i in zip(y, time)],
    dtype=[('Label', bool), ('time', float)])

# Split features, the 1-year label and the survival target with one shared
# 80/20 shuffle (fixed seed for reproducibility).
X_train, X_test, y_train, y_test, y_surv_train, y_surv_test = train_test_split(
    X, y_1year, y_surv, test_size=0.2, random_state=42)

# Feature selection (fitted on the training split only), using the median
# random-forest importance as the cut-off.
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Names of the features kept by the selector
selected_feature_names = X.columns[selector.get_support()].tolist()

# Standardize features; the scaler is fitted on the training split and reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Handle class imbalance: only the training split is oversampled
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Rebuild the survival target for the SMOTE-resampled rows. Synthetic rows have
# no observed duration, so every row is placed at the 1-year horizon: leavers
# are events at t=1.0 and stayers are right-censored at t=1.0. (Censoring the
# stayers at t=0.0 would remove them from the risk set before the only event
# time, leaving the forest to learn from leavers alone.)
y_surv_train_resampled = np.array(
    [(bool(y_i), 1.0) for y_i in y_train_resampled],
    dtype=[('Label', bool), ('time', float)])

# Train the random survival forest on the resampled training data
rsf = RandomSurvivalForest(n_estimators=50, max_depth=5, min_samples_leaf=10, random_state=42)
rsf.fit(X_train_resampled, y_surv_train_resampled)

# 预测1年生存概率
def predict_1year_survival(rsf, X):
    """Return each sample's predicted survival probability at t = 1.0.

    Calls ``rsf.predict_survival_function(X)`` and evaluates every returned
    survival function at 1.0; the values come back as a NumPy array in the
    same order as the rows of ``X``.
    """
    horizon = 1.0
    return np.array([curve(horizon) for curve in rsf.predict_survival_function(X)])

# RSF survival probability at t = 1.0 for the resampled training rows and the test rows
surv_prob_train_1year = predict_1year_survival(rsf, X_train_resampled)
surv_prob_test_1year = predict_1year_survival(rsf, X_test_scaled)

# Append the 1-year survival probability as an extra (last) feature column
X_train_with_surv = np.column_stack((X_train_resampled, surv_prob_train_1year))
X_test_with_surv = np.column_stack((X_test_scaled, surv_prob_test_1year))


# Random-forest classifier for the 1-year label
rf = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=10, random_state=42)

# Stratified 5-fold cross-validation
# NOTE(review): the folds are drawn from the SMOTE-resampled rows, whose last
# column was predicted by an RSF fitted on these same rows, so these scores
# may be optimistic -- confirm this is acceptable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf, X_train_with_surv, y_train_resampled, cv=cv)
print("Cross-validation scores:", cv_scores)
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Fit the final model on the whole (resampled) training set
rf.fit(X_train_with_surv, y_train_resampled)

# Evaluate on the held-out test split: hard labels and positive-class probabilities
y_pred = rf.predict(X_test_with_surv)
y_pred_proba = rf.predict_proba(X_test_with_surv)[:, 1]

print("\n随机森林分类器评估（1年内离职预测）：")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Average Precision Score:", average_precision_score(y_test, y_pred_proba))
print("\n分类报告:")
print(classification_report(y_test, y_pred))

# Plot the precision-recall curve for the test split
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
plt.figure()
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

# Pick the decision threshold that maximizes (TPR - FPR) on the ROC curve.
# NOTE(review): the threshold is chosen on the test split and then evaluated on
# that same split below, so the reported metrics are likely optimistic.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"\n最佳阈值: {optimal_threshold:.4f}")

# Re-label the test rows with the new threshold
y_pred_new = (y_pred_proba >= optimal_threshold).astype(int)

print("\n使用最佳阈值的随机森林分类器评估（1年内离职预测）：")
print("Accuracy:", accuracy_score(y_test, y_pred_new))
# AUC is threshold-independent, so this repeats the value printed earlier
print("AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("\n分类报告:")
print(classification_report(y_test, y_pred_new))

# Evaluate the random survival forest on the test split.
# rsf.predict() returns risk scores (higher = higher risk of the event), which
# is the orientation concordance_index_censored expects for its estimate
# argument, so the scores are passed as-is; negating them would report the
# C-index of the reversed ranking.
c_index = concordance_index_censored(y_surv_test['Label'], y_surv_test['time'], rsf.predict(X_test_scaled))
print("\n随机生存森林评估：")
print(f"C-index: {c_index[0]:.4f}")

# Feature-importance report: one "name: importance" line per classifier input,
# where the last input is the RSF survival probability column.
feature_names = [*selected_feature_names, 'Survival Probability']
feature_importance = rf.feature_importances_
for label, weight in zip(feature_names, feature_importance):
    print(f"{label}: {weight:.4f}")

# Learning-curve analysis: 5-fold CV at five training-set fractions from 10% to 100%
train_sizes, train_scores, test_scores = learning_curve(
    rf,
    X_train_with_surv,
    y_train_resampled,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 5),
)

# Average the per-fold scores for each training-set size
mean_train_score = train_scores.mean(axis=1)
mean_cv_score = test_scores.mean(axis=1)

plt.figure()
plt.plot(train_sizes, mean_train_score, label='Training score')
plt.plot(train_sizes, mean_cv_score, label='Cross-validation score')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.title('Learning Curve')
plt.legend()
plt.show()

# Build the results frame: predictions first, then the selected (unscaled)
# feature values of the test rows, one column per selected feature.
results = pd.DataFrame({
    'Predicted_Label': y_pred_new,
    'Turnover_Probability_1Year': y_pred_proba,
    **{
        feature: X_test_selected[:, position]
        for position, feature in enumerate(selected_feature_names)
    },
})

# 为host提供建议
def provide_recommendation(prob, threshold):
    """Turn a 1-year turnover probability into a risk-tier recommendation.

    Tiers: "High risk" when ``prob`` is strictly above 1.5 * ``threshold``,
    "Medium risk" when ``prob`` is at least ``threshold``, otherwise
    "Low risk". The message embeds ``prob`` as a percentage with one decimal.
    """
    pct = f"{prob:.1%}"

    if prob > threshold * 1.5:  # more than 50% above the threshold
        return f"High risk: There is a {pct} probability that this employee will leave within the next year. It is recommended to take retention measures immediately."

    if prob >= threshold:
        return f"Medium risk: The employee has a {pct} probability of leaving within the next year. It is recommended to closely monitor and consider taking preventive measures."

    return f"Low risk: This employee has a {pct} probability of leaving within the next year. Currently, the risk is low, but regular monitoring is still necessary."

# Attach a recommendation to every employee row, based on its predicted probability
results['Recommendation'] = [
    provide_recommendation(prob, optimal_threshold)
    for prob in results['Turnover_Probability_1Year']
]

ImportError: cannot import name '_print_elapsed_time' from 'sklearn.utils' (/Users/pengjiabeitang/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/__init__.py)

In [None]:
# print("\n部分预测结果:")
# print(results.head(10))

# # 分析预测的生存时间分布
# print("\n预测生存时间的统计信息：")
# print(results['Turnover_Probability_1Year'].describe())

# Show every row whose Predicted_Label is 1
print("\n所有预测为离职的员工：")
predicted_turnover = results.loc[results['Predicted_Label'] == 1]
summary_columns = ['Predicted_Label', 'Turnover_Probability_1Year', 'Recommendation']
print(predicted_turnover[summary_columns])

In [None]:
# Feature-importance table, sorted from most to least important
feature_importance = rf.feature_importances_
importance_df = pd.DataFrame({
    'feature': [*selected_feature_names, 'Survival Probability'],
    'importance': feature_importance,
}).sort_values(by='importance', ascending=False)
print("\n特征重要性：")
print(importance_df)

### kaggle data


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, learning_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sksurv.ensemble import RandomSurvivalForest
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_recall_curve, average_precision_score
from sksurv.metrics import concordance_index_censored
from sklearn.metrics import roc_curve
from imblearn.over_sampling import SMOTE

# Load the cleaned Kaggle dataset; the first CSV column is used as the index.
data = pd.read_csv('Cleaned_Data.csv', index_col=0)


# Integer-encode every string (object-dtype) column in place.
# NOTE: the single `le` instance is refit for each column, so once the loop
# finishes it only remembers the classes of the last column it encoded.
le = LabelEncoder()
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# NOTE(review): 'YearsAtCompany' stays in X although the 1-year target below
# is derived from it -- confirm this target leakage is intended.
X = data.drop(['Label'], axis=1)
y = data['Label']
time = data['YearsAtCompany']

# Binary target: the employee left (Label) with a recorded time of <= 1 year.
y_1year = (time <= 1) & y

# Survival target truncated at the 1-year horizon. A departure recorded after
# the horizon is right-censored at 1.0 rather than counted as an event at 1.0,
# which keeps the event flag consistent with the truncated time.
y_surv = np.array(
    [(bool(y_i) and t_i <= 1, min(t_i, 1.0)) for y_i, t_i in zip(y, time)],
    dtype=[('Label', bool), ('time', float)])

# Split features, the 1-year label and the survival target with one shared
# 80/20 shuffle (fixed seed for reproducibility).
X_train, X_test, y_train, y_test, y_surv_train, y_surv_test = train_test_split(
    X, y_1year, y_surv, test_size=0.2, random_state=42)

# Feature selection (fitted on the training split only), using the median
# random-forest importance as the cut-off.
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Names of the features kept by the selector
selected_feature_names = X.columns[selector.get_support()].tolist()

# Standardize features; the scaler is fitted on the training split and reused for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Handle class imbalance: only the training split is oversampled
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Rebuild the survival target for the SMOTE-resampled rows. Synthetic rows have
# no observed duration, so every row is placed at the 1-year horizon: leavers
# are events at t=1.0 and stayers are right-censored at t=1.0. (Censoring the
# stayers at t=0.0 would remove them from the risk set before the only event
# time, leaving the forest to learn from leavers alone.)
y_surv_train_resampled = np.array(
    [(bool(y_i), 1.0) for y_i in y_train_resampled],
    dtype=[('Label', bool), ('time', float)])

# Train the random survival forest on the resampled training data
rsf = RandomSurvivalForest(n_estimators=50, max_depth=5, min_samples_leaf=10, random_state=42)
rsf.fit(X_train_resampled, y_surv_train_resampled)

# 预测1年生存概率
def predict_1year_survival(rsf, X):
    """Return each sample's predicted survival probability at t = 1.0.

    Calls ``rsf.predict_survival_function(X)`` and evaluates every returned
    survival function at 1.0; the values come back as a NumPy array in the
    same order as the rows of ``X``.
    """
    horizon = 1.0
    return np.array([curve(horizon) for curve in rsf.predict_survival_function(X)])

# RSF survival probability at t = 1.0 for the resampled training rows and the test rows
surv_prob_train_1year = predict_1year_survival(rsf, X_train_resampled)
surv_prob_test_1year = predict_1year_survival(rsf, X_test_scaled)

# Append the 1-year survival probability as an extra (last) feature column
X_train_with_surv = np.column_stack((X_train_resampled, surv_prob_train_1year))
X_test_with_surv = np.column_stack((X_test_scaled, surv_prob_test_1year))


# Random-forest classifier for the 1-year label
rf = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=10, random_state=42)

# Stratified 5-fold cross-validation
# NOTE(review): the folds are drawn from the SMOTE-resampled rows, whose last
# column was predicted by an RSF fitted on these same rows, so these scores
# may be optimistic -- confirm this is acceptable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf, X_train_with_surv, y_train_resampled, cv=cv)
print("Cross-validation scores:", cv_scores)
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Fit the final model on the whole (resampled) training set
rf.fit(X_train_with_surv, y_train_resampled)

# Evaluate on the held-out test split: hard labels and positive-class probabilities
y_pred = rf.predict(X_test_with_surv)
y_pred_proba = rf.predict_proba(X_test_with_surv)[:, 1]

print("\n随机森林分类器评估（1年内离职预测）：")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Average Precision Score:", average_precision_score(y_test, y_pred_proba))
print("\n分类报告:")
print(classification_report(y_test, y_pred))

# Pick the decision threshold that maximizes (TPR - FPR) on the ROC curve.
# NOTE(review): the threshold is chosen on the test split and then evaluated on
# that same split below, so the reported metrics are likely optimistic.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"\n最佳阈值: {optimal_threshold:.4f}")

# Re-label the test rows with the new threshold
y_pred_new = (y_pred_proba >= optimal_threshold).astype(int)

print("\n使用最佳阈值的随机森林分类器评估（1年内离职预测）：")
print("Accuracy:", accuracy_score(y_test, y_pred_new))
# AUC is threshold-independent, so this repeats the value printed earlier
print("AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("\n分类报告:")
print(classification_report(y_test, y_pred_new))

# Evaluate the random survival forest on the test split.
# rsf.predict() returns risk scores (higher = higher risk of the event), which
# is the orientation concordance_index_censored expects for its estimate
# argument, so the scores are passed as-is; negating them would report the
# C-index of the reversed ranking.
c_index = concordance_index_censored(y_surv_test['Label'], y_surv_test['time'], rsf.predict(X_test_scaled))
print("\n随机生存森林评估：")
print(f"C-index: {c_index[0]:.4f}")

# Feature-importance report: one "name: importance" line per classifier input,
# where the last input is the RSF survival probability column.
feature_names = [*selected_feature_names, 'Survival Probability']
feature_importance = rf.feature_importances_
for label, weight in zip(feature_names, feature_importance):
    print(f"{label}: {weight:.4f}")

# # 学习曲线分析
# train_sizes, train_scores, test_scores = learning_curve(
#     rf, X_train_with_surv, y_train_resampled, cv=5,
#     train_sizes=np.linspace(0.1, 1.0, 5))

# plt.figure()
# plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Training score')
# plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Cross-validation score')
# plt.xlabel('Training examples')
# plt.ylabel('Score')
# plt.title('Learning Curve')
# plt.legend()
# plt.show()

# # 创建结果数据框
# results = pd.DataFrame({
#     'Predicted_Label': y_pred_new,
#     'Turnover_Probability_1Year': y_pred_proba
# })

# # 添加原始特征
# for i, name in enumerate(selected_feature_names):
#     results[name] = X_test_selected[:, i]

# # 为host提供建议
# def provide_recommendation(prob, threshold):
#     if prob > threshold * 1.5:  # 高于阈值50%
#         return f"High risk: There is a {prob:.1%} probability that this employee will leave within the next year. It is recommended to take retention measures immediately."
#     elif prob >= threshold:
#         return f"Medium risk: The employee has a {prob:.1%} probability of leaving within the next year. It is recommended to closely monitor and consider taking preventive measures."
#     else:
#         return f"Low risk: This employee has a {prob:.1%} probability of leaving within the next year. Currently, the risk is low, but regular monitoring is still necessary."

# # 为每个员工生成建议
# results['Recommendation'] = results.apply(lambda row: provide_recommendation(row['Turnover_Probability_1Year'], optimal_threshold), axis=1)

## 保存模型

In [None]:
import joblib

import os

# Persist the fitted pipeline pieces. Create the target directory first so the
# dumps below do not fail with FileNotFoundError when it does not exist yet.
os.makedirs('RFRSFmodel', exist_ok=True)
joblib.dump(selector, 'RFRSFmodel/feature_selector.joblib')
joblib.dump(scaler, 'RFRSFmodel/scaler.joblib')
joblib.dump(rf, 'RFRSFmodel/random_forest_classifier.joblib')
joblib.dump(rsf, 'RFRSFmodel/random_survival_forest.joblib')
# NOTE: `le` was refit on each categorical column in turn during preprocessing,
# so the saved encoder only knows the classes printed below (those of the last
# column it was fitted on).
joblib.dump(le, 'RFRSFmodel/label_encoder.joblib')
print(le.classes_)
# Save the decision threshold chosen from the ROC curve
np.save('RFRSFmodel/optimal_threshold.npy', optimal_threshold)

# Save the selected feature names, one per line
with open('RFRSFmodel/selected_features.txt', 'w') as f:
    f.writelines(f"{feature}\n" for feature in selected_feature_names)

print("模型和相关对象已保存。")

## 测试模型

In [14]:
def predict_employee_turnover(employee_data):
    """Score employees with the persisted RSF + random-forest pipeline.

    Loads the scaler, both forests, the decision threshold, the selected
    feature names and the label encoder from ``RFRSFmodel/``, then rebuilds
    the feature matrix the classifier was trained on: the scaled selected
    features plus the RSF survival probability at t = 1.0 as the last column.

    Args:
        employee_data: DataFrame containing at least the selected feature
            columns.

    Returns:
        DataFrame with the columns ``Predicted_Label``,
        ``Turnover_Probability``, ``Estimated_Survival_Time`` and
        ``Recommendation``, one row per input row.

    Raises:
        KeyError: if a selected feature column is missing from
            ``employee_data`` (raised by the pandas column lookup).
    """
    # Load the persisted pipeline pieces. The feature selector is not loaded:
    # the selected columns are taken by name from selected_features.txt.
    scaler = joblib.load('RFRSFmodel/scaler.joblib')
    rf = joblib.load('RFRSFmodel/random_forest_classifier.joblib')
    rsf = joblib.load('RFRSFmodel/random_survival_forest.joblib')
    # np.load returns a 0-d array here; unwrap it to a plain float.
    optimal_threshold = float(np.load('RFRSFmodel/optimal_threshold.npy'))

    # Read the selected feature names (one per line)
    with open('RFRSFmodel/selected_features.txt', 'r') as f:
        selected_features = [line.strip() for line in f]

    le = joblib.load('RFRSFmodel/label_encoder.joblib')

    print("Expected features:", selected_features)
    print("Actual features:", employee_data.columns.tolist())

    # Keep only the features the model expects, in training order
    X = employee_data[selected_features].copy()

    # Encode string columns with the saved encoder.
    # NOTE(review): one encoder is applied to every categorical column, but the
    # training cells refit `le` per column, so the saved classes match only the
    # last encoded column -- confirm before relying on this for other columns.
    known_classes = set(le.classes_)
    categorical_columns = X.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        if set(X[col].unique()) - known_classes:
            # The fallback is le.classes_[0], i.e. the first class in the
            # encoder's (sorted) class list, not the most frequent one.
            print(f"Warning: New categories found in {col}. Treating them as the first known category.")
            X[col] = X[col].map(lambda x: x if x in known_classes else le.classes_[0])
        X[col] = le.transform(X[col])

    # Standardize with the training-time scaler
    X_scaled = scaler.transform(X)

    # Survival probability at t = 1.0. Training appended exactly this quantity
    # as the classifier's last feature, so inference must use it as well
    # (rsf.predict() would give risk scores on a different scale).
    surv_prob = np.array([sf(1.0) for sf in rsf.predict_survival_function(X_scaled)])

    # Append the survival probability as the last feature column
    X_with_surv = np.column_stack((X_scaled, surv_prob))

    # Turnover probability (positive class) and thresholded label
    turnover_prob = rf.predict_proba(X_with_surv)[:, 1]
    predicted_label = (turnover_prob >= optimal_threshold).astype(int)

    # NOTE(review): estimate_survival_time() and a provide_recommendation()
    # accepting (prob, time, threshold) are not defined in the visible notebook
    # cells (the visible provide_recommendation takes (prob, threshold)) --
    # confirm both exist before calling this function.
    survival_times = estimate_survival_time(rsf, X_scaled)

    recommendations = [provide_recommendation(prob, surv_time, optimal_threshold)
                       for prob, surv_time in zip(turnover_prob, survival_times)]

    # Assemble the per-employee results
    results = pd.DataFrame({
        'Predicted_Label': predicted_label,
        'Turnover_Probability': turnover_prob,
        'Estimated_Survival_Time': survival_times,
        'Recommendation': recommendations
    })

    return results

In [6]:
# import pandas as pd

# # 读取数据
# df = pd.read_csv('synthetic_data.csv', index_col=0)

# # 分别选择label == 0 和 label == 1的25个样本
# df_label_0 = df[df['Label'] == 0].sample(25, random_state=42)
# df_label_1 = df[df['Label'] == 1].sample(25, random_state=42)

# # 合并数据
# df_test = pd.concat([df_label_0, df_label_1])

# # 保存为新的CSV文件
# df_test.to_csv('test_data.csv', index=False)
# print("已成功导出50个样本到test_data.csv文件中。")

In [None]:
# 主脚本
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sksurv.ensemble import RandomSurvivalForest
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sksurv.metrics import concordance_index_censored
from sklearn.metrics import roc_curve
import joblib

# Load the new employee data
new_data = pd.read_csv('test_data.csv')

# Read the selected feature names (one per line)
with open('RFRSFmodel/selected_features.txt', 'r') as f:
    selected_features = [raw_line.strip() for raw_line in f]

# Fail early when any required feature column is absent from the input
missing_features = set(selected_features).difference(new_data.columns)
if missing_features:
    raise ValueError(f"The following features are missing in the test data: {missing_features}")

# Run the model on the new data
predictions = predict_employee_turnover(new_data)

# Print the results
print(predictions)

# # 可选：保存结果到CSV文件
# predictions.to_csv('employee_turnover_predictions.csv', index=False)
# print("预测结果已保存到 'employee_turnover_predictions.csv'")