In [1]:
# ======================================================
# 文件名：Predict_System.ipynb (预测专用系统)
# 作用：加载训练好的模型，预测新员工离职风险
# ======================================================

import pandas as pd
import numpy as np
import os
from catboost import CatBoostClassifier

# Locations of the trained CatBoost model and of the CSV holding the records
# to score.
model_path = r"./save/final_catboost_model.cbm"
new_data_path = "./data/new_candidates.csv"

# Fail fast when the input CSV is absent. Nothing further down can run without
# it, and stopping here gives a clearer message than the pd.read_csv failure
# that would otherwise follow (after the model has already been loaded).
# No mock file is generated.
if not os.path.exists(new_data_path):
    raise FileNotFoundError(f"未找到输入文件：{new_data_path}")

# ==========================================
# 2. Load the model
# ==========================================
# Only attempt to load when the model file is on disk; otherwise stop with a
# message that tells the user to run the training script first.
if os.path.exists(model_path):
    model = CatBoostClassifier()
    model.load_model(model_path)
else:
    raise FileNotFoundError(f"找不到模型文件：{model_path}，请先运行训练脚本！")
print("模型加载成功！准备工作就绪。")

# ==========================================
# 3. Data preparation pipeline
# ==========================================
print("正在处理数据...")
df_raw = pd.read_csv(new_data_path)

# Work on a copy so df_raw stays untouched for the final report.
df_process = df_raw.copy()

# --- A. Re-create the engineered features ---
# These derived columns are not part of the raw CSV, so they are computed here.
# NOTE(review): the formulas are assumed to mirror the training script exactly;
# confirm against it before changing any of them.
df_process['AvgTenure'] = df_process['TotalWorkingYears'] / (df_process['NumCompaniesWorked'] + 1)
df_process['CareerAgeRatio'] = df_process['TotalWorkingYears'] / df_process['Age']
df_process['TenureRatio'] = df_process['YearsAtCompany'] / (df_process['TotalWorkingYears'] + 1)
df_process['PromotionStagnation'] = df_process['YearsInCurrentRole'] / (df_process['YearsSinceLastPromotion'] + 1)
df_process['IncomePerAge'] = df_process['MonthlyIncome'] / df_process['Age']

# --- B. One-hot encoding ---
# Encode every category level instead of passing drop_first=True.
# drop_first removes the first level *present in this file*, which is not
# necessarily the level that was dropped at training time. For example, a batch
# in which every row has OverTime == "Yes" would lose its only dummy column,
# and the alignment step below would then fill OverTime_Yes with 0 for
# everyone. Emitting all levels and letting the reindex keep only the columns
# the model knows avoids that.
df_encoded = pd.get_dummies(df_process)

# --- C. Column alignment ---
# Feature names, in order, as stored in the loaded model.
expected_features = model.feature_names_

# Keep exactly the model's columns in the model's order: columns the model
# does not know are discarded, and columns missing from this batch (e.g. a
# category level that does not occur in it) are created and filled with 0.
df_final = df_encoded.reindex(columns=expected_features, fill_value=0)

print("数据预处理完成。")

# ==========================================
# 4. Prediction and export
# ==========================================
# Second column of predict_proba, used here as the attrition probability.
probs = model.predict_proba(df_final)[:, 1]

# Build the report on top of the raw input so the original columns stay
# readable for whoever opens the file.
output = df_raw.copy()
output['离职风险概率'] = np.round(probs, 4)  # keep 4 decimal places

# Bucket the rounded probability: above 0.5, above 0.3, or everything else.
# np.select takes the first matching condition, so the order matters.
output['风险等级'] = np.select(
    [output['离职风险概率'] > 0.5, output['离职风险概率'] > 0.3],
    ['   高危', '   中风险'],
    default='   稳定',
)

# Highest risk first.
output = output.sort_values(by='离职风险概率', ascending=False)

# Write the report; utf_8_sig adds a BOM to the CSV.
output_file = "Final_Prediction_Result.csv"
output.to_csv(output_file, index=False, encoding='utf_8_sig')

print()
print("=" * 40)
print(f"预测完成！结果已保存至: {output_file}")
print("=" * 40)
# Preview of the top rows, restricted to a few columns.
print(output.head()[['Age', 'Department', 'OverTime', '离职风险概率', '风险等级']])

模型加载成功！准备工作就绪。
正在处理数据...
数据预处理完成。

预测完成！结果已保存至: Final_Prediction_Result.csv
   Age              Department OverTime  离职风险概率    风险等级
0   41                   Sales      Yes  0.9626      高危
2   37  Research & Development      Yes  0.7291      高危
3   33  Research & Development      Yes  0.3486     中风险
4   27  Research & Development       No  0.1287      稳定
6   59  Research & Development      Yes  0.0860      稳定
