In [9]:
import platform
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# 设置随机种子和中文显示
np.random.seed(42)
# 设置中文显示
system = platform.system()

if system == 'Darwin':  # Mac系统
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
elif system == 'Windows':  # Windows系统
    plt.rcParams['font.sans-serif'] = ['SimHei']  # 黑体
elif system == 'Linux':  # Linux系统
    # Linux系统可能需要安装中文字体，例如 Noto Sans CJK SC
    plt.rcParams['font.sans-serif'] = ['Noto Sans CJK SC']
else:
    # 默认字体（如果系统未识别）
    plt.rcParams['font.sans-serif'] = ['SimHei']

# 读取数据
df = pd.read_csv("./student_data.csv")

# 初始化模型存储结构
# 在代码开头初始化完整的字典结构
grade_models = {
    'pca_model': {},
    'bayes_models': {},
    'random_forest_models': {}
}

# 系别映射（如果需要）
mapping = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
if df['Programme'].dtype == 'int64' or df['Programme'].iloc[0] in [1, 2, 3, 4]:
    df['Programme'] = df['Programme'].map(mapping)


# 使用混合模型提高预测准确率
## 随机森林混合模型
该模型分类精度很高，但是需要数据集较大，且数据集中的每个类别的样本量要均匀分布。此数据集中C Programme的样本量较少，可能会影响模型的准确率。并且grade 3数量也很少，并且绝大多数以C programme为主。

## knn分类模型
该模型分类精度较差，但是对于特殊类别，比如说C Programme和grade 3的样本量较少的类别，knn模型的分类精度较高。我们将使用knn模型进行分类。我们将使用投票模型的方式为数据集中数量较少的特殊组创建pca+kmean模型进行分类。接下来我们将要训练knn模型。

In [10]:
# ============ PCA模型训练 ============
print("\n开始训练PCA全局模型...")

# 提取特征和目标变量用于PCA模型
X_pca = df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender']]  # 确保只使用这些特征
y_pca = df['Programme']

# PCA + 随机森林管道
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# 定义参数网格
pca_param_grid = {
    'pca__n_components': [2, 3, 4, 5],
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 15]
}

# 使用网格搜索查找最佳参数
pca_grid_search = GridSearchCV(pca_pipeline, pca_param_grid, cv=5, scoring='accuracy')
pca_grid_search.fit(X_pca, y_pca)
print(f"PCA模型最佳参数: {pca_grid_search.best_params_}")
print(f"PCA模型交叉验证分数: {pca_grid_search.best_score_:.4f}")

# 获取最佳PCA模型
pca_best_model = pca_grid_search.best_estimator_

# 将PCA模型添加到grade_models字典中
# 保存最佳模型
grade_models['pca_model']['model'] = {
    'model': pca_best_model,
    'features': X_pca.columns.tolist(),
    'accuracy': pca_grid_search.best_score_,
    'model_type': 'pca'
}

print("PCA模型已添加到grade_models字典")


开始训练PCA全局模型...
PCA模型最佳参数: {'classifier__max_depth': 15, 'classifier__n_estimators': 100, 'pca__n_components': 5}
PCA模型交叉验证分数: 0.5559
PCA模型已添加到grade_models字典


接下来训练贝叶斯模型

In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.mixture import GaussianMixture
from scipy import stats

# ============ 定义高级特征工程函数 ============
def create_advanced_features(df_input):
    eps = 0.0000001
    # 基础特征
    X = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Gender']].copy()

    # 题目得分比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            X[f'Q{i}_to_Q{j}'] = df_input[f'Q{i}'] / (df_input[f'Q{j}'] + eps)

    # 统计特征
    X['mean_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean(axis=1)
    X['std_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].std(axis=1)
    X['range_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].max(axis=1) - df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].min(axis=1)
    X['cv_score'] = X['std_score'] / (X['mean_score'] + eps)  # 变异系数

    # 归一化题目分数
    max_scores = [8, 8, 14, 10, 6]  # 每题满分
    for i in range(1, 6):
        X[f'Q{i}_norm'] = df_input[f'Q{i}'] / max_scores[i-1]

    return X




class FlexibleBayesClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, feature_distribution_map=None, random_state=42):
        """
        初始化灵活贝叶斯分类器

        参数:
            feature_distribution_map: 字典，指定每个(类别,特征)对应的分布类型
                例如 {('A', 'Q1'): 'bimodal_gaussian', ('A', 'Q2'): 'log_normal'}
            random_state: 随机种子
        """
        self.feature_distribution_map = feature_distribution_map or {}
        self.random_state = random_state
        self.models = {}  # 存储每个类别每个特征的分布模型
        self.priors = {}  # 存储每个类别的先验概率
        self.classes_ = None
        self.features_ = None

    def _fit_distribution(self, X, feature_idx, feature_name, distribution_type):
        """为特定特征拟合指定的分布类型"""
        data = X[:, feature_idx]

        if distribution_type == 'bimodal_gaussian':
            model = GaussianMixture(
                n_components=2,
                covariance_type='full',
                random_state=self.random_state
            )
            # 重塑数据为2D数组(GMM需要)
            model.fit(data.reshape(-1, 1))
            return model

        elif distribution_type == 'log_normal':
            # 对于对数正态分布，我们需要确保数据为正
            pos_data = np.maximum(data, 1e-10)
            params = stats.lognorm.fit(pos_data)
            return {'distribution': 'log_normal', 'params': params}

        else:
            # 默认使用双峰高斯
            model = GaussianMixture(
                n_components=2,
                covariance_type='full',
                random_state=self.random_state
            )
            model.fit(data.reshape(-1, 1))
            return model

    def fit(self, X, y):
        """
        训练贝叶斯分类器

        参数:
            X: 特征矩阵
            y: 目标类别
        """
        self.classes_ = np.unique(y)
        n_samples, n_features = X.shape
        self.features_ = np.arange(n_features)

        # 计算先验概率
        for cls in self.classes_:
            self.priors[cls] = np.mean(y == cls)

        # 为每个类别的每个特征拟合分布
        for cls in self.classes_:
            cls_samples = X[y == cls]
            if len(cls_samples) < 5:  # 样本太少，跳过
                continue

            self.models[cls] = {}

            for i, feature_idx in enumerate(self.features_):
                feature_name = f"feature_{i}"  # 默认特征名

                # 确定该(类别,特征)对应的分布类型
                dist_type = self.feature_distribution_map.get((cls, feature_name), 'bimodal_gaussian')

                # 拟合分布
                self.models[cls][feature_idx] = self._fit_distribution(
                    cls_samples, feature_idx, feature_name, dist_type
                )

        return self

    def _score_sample(self, x, cls):
        """计算样本在给定类别下的对数似然"""
        log_likelihood = 0.0

        for feature_idx in self.features_:
            if cls not in self.models or feature_idx not in self.models[cls]:
                continue

            model = self.models[cls][feature_idx]
            value = x[feature_idx]

            # 根据分布类型计算似然
            if isinstance(model, GaussianMixture):  # 双峰高斯
                # GMM需要2D数据
                ll = model.score_samples(np.array([[value]]))
                log_likelihood += ll[0]

            elif isinstance(model, dict) and model['distribution'] == 'log_normal':
                # 对��正态分布
                params = model['params']
                if value <= 0:  # 对数正态要求值为正
                    log_likelihood += np.log(1e-10)  # 极小概率
                else:
                    ll = stats.lognorm.logpdf(value, *params)
                    log_likelihood += ll

            else:  # 默认双峰高斯
                ll = 0  # 这里应该不会执行到，但为了安全起见
                log_likelihood += ll

        return log_likelihood

    def predict_proba(self, X):
        """
        预测每个样本属于各个类别的概率

        参数:
            X: 特征矩阵

        返回:
            概率矩阵，形状为(n_samples, n_classes)
        """
        if not self.models:
            raise ValueError("模型尚未训练")

        n_samples = X.shape[0]
        n_classes = len(self.classes_)

        # 初始化对数似然矩阵
        log_likelihoods = np.zeros((n_samples, n_classes))

        # 计算每个样本在每个类别下的对数似然
        for i, x in enumerate(X):
            for j, cls in enumerate(self.classes_):
                if cls in self.models:
                    # 计算对数似然
                    log_likelihood = self._score_sample(x, cls)
                    # 贝叶斯公式: P(C|X) ∝ P(X|C) * P(C)
                    log_likelihoods[i, j] = log_likelihood + np.log(self.priors[cls])
                else:
                    log_likelihoods[i, j] = -np.inf

        # 处理数值问题：从每一行减去最大值以防溢出
        max_log_probs = np.max(log_likelihoods, axis=1, keepdims=True)
        log_probs_stable = log_likelihoods - max_log_probs

        # 取指数并归一化
        probs = np.exp(log_probs_stable)
        row_sums = np.sum(probs, axis=1, keepdims=True)

        # 处理零概率行
        zero_rows = (row_sums == 0).ravel()
        if np.any(zero_rows):
            probs[zero_rows, :] = 1.0 / n_classes
            row_sums[zero_rows] = 1.0

        # 归一化概率
        probs = probs / row_sums

        # 检查并处理NaN
        if np.any(np.isnan(probs)):
            nan_rows = np.any(np.isnan(probs), axis=1)
            probs[nan_rows, :] = 1.0 / n_classes

        return probs

    def predict(self, X):
        """预测类别"""
        probs = self.predict_proba(X)
        return self.classes_[np.argmax(probs, axis=1)]

# 按年级分别训练模型
for grade in df['Grade'].unique():
    print(f"\n开始训练年级 {grade} 的模型")
    grade_df = df[df['Grade'] == grade]

    X_grade = create_advanced_features(grade_df)
    y_grade = grade_df['Programme']

    # 检查样本量
    programme_counts = y_grade.value_counts()
    valid_programmes = programme_counts[programme_counts >= 5].index
    if len(valid_programmes) < 2:
        print(f"年级 {grade} 的有效系别数量不足，跳过")
        continue

    # 过滤有效系别
    mask = y_grade.isin(valid_programmes)
    X_grade = X_grade[mask]
    y_grade = y_grade[mask]

    # 为每个(系别,特征)组合确定最佳分布类型
    feature_distribution_map = {}

    # 这里可以根据你之前的分析结果填充feature_distribution_map
    # 例如:
    if grade == 3 and 'C' in valid_programmes:
        feature_distribution_map[('C', 'Q1')] = 'bimodal_gaussian'
        feature_distribution_map[('C', 'Q2')] = 'bimodal_gaussian'
        feature_distribution_map[('C', 'Q3')] = 'log_normal'  # 右偏双峰对数正态
        feature_distribution_map[('C', 'Q4')] = 'bimodal_gaussian'
        feature_distribution_map[('C', 'Q5')] = 'bimodal_gaussian'

    if grade == 2 and 'B' in valid_programmes:
        feature_distribution_map[('B', 'Total')] = 'log_normal'  # 右偏双峰对数正态

    # 交叉验证
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 创建灵活贝叶斯模型
    bayes_model = FlexibleBayesClassifier(
        feature_distribution_map=feature_distribution_map,
        random_state=42
    )

    # 使用Pipeline进行特征标准化和模型训练
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', bayes_model)
    ])

    # 训练模型
    pipeline.fit(X_grade, y_grade)

    # 进���交叉验证评估
    cv_scores = []

    for train_idx, test_idx in cv.split(X_grade, y_grade):
        X_train, X_test = X_grade.iloc[train_idx], X_grade.iloc[test_idx]
        y_train, y_test = y_grade.iloc[train_idx], y_grade.iloc[test_idx]

        # 训练模型
        pipeline.fit(X_train, y_train)

        # 预测并评估
        y_pred = pipeline.predict(X_test)
        accuracy = np.mean(y_pred == y_test)
        cv_scores.append(accuracy)

    # 保存最佳模型
    grade_models["bayes_models"][grade] = {
        'model': pipeline,
        'features': X_grade.columns,
        'accuracy': np.mean(cv_scores),
        'feature_distribution_map': feature_distribution_map
    }

    print(f"年级 {grade} 模型训练完成，交叉验证平均准确率: {np.mean(cv_scores):.4f}")


开始训练年级 3 的模型
年级 3 的有效系别数量不足，跳过

开始训练年级 2 的模型
年级 2 模型训练完成，交叉验证平均准确率: 0.5057


接下来训练效果在大数据集上更好的随机森林模型，

In [12]:

# ============ 随机森林模型（按年级训练） ============
print("\n开始按年级训练随机森林模型...")



# 定义高级特征工程函数
def create_advanced_features(df_input):
    eps = 0.0000001
    # 基础特征
    X = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Gender']].copy()

    # 题目得分比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            X[f'Q{i}_to_Q{j}'] = df_input[f'Q{i}'] / (df_input[f'Q{j}'] + eps)

    # 统计特征
    X['mean_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean(axis=1)
    X['std_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].std(axis=1)
    X['range_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].max(axis=1) - df_input[
        ['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].min(axis=1)
    X['cv_score'] = X['std_score'] / (X['mean_score'] + eps)  # 变异系数

    # 归一化题目分数
    max_scores = [8, 8, 14, 10, 6]  # 每题满分
    for i in range(1, 6):
        X[f'Q{i}_norm'] = df_input[f'Q{i}'] / max_scores[i - 1]

    return X


# 按年级分别训练模型
for grade in df['Grade'].unique():
    print(f"开始训练年级 {grade} 的模型")
    grade_df = df[df['Grade'] == grade]

    X_grade = create_advanced_features(grade_df)
    y_grade = grade_df['Programme']

    # 检查样本量
    programme_counts = y_grade.value_counts()
    valid_programmes = programme_counts[programme_counts >= 5].index
    if len(valid_programmes) < 2:
        print(f"年级 {grade} 的有效系别数量不足，将使用PCA全局模型")
        continue

    # 过滤有效系别
    mask = y_grade.isin(valid_programmes)
    X_grade = X_grade[mask]
    y_grade = y_grade[mask]

    # 交叉验证
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 创建模型
    rf = RandomForestClassifier(random_state=42)

    # 超参数网格
    rf_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 15],
        'min_samples_split': [2, 5]
    }

    # 网格搜索
    grid_rf = GridSearchCV(rf, rf_param_grid, cv=cv, scoring='accuracy')
    grid_rf.fit(X_grade, y_grade)

    # 保存最佳模型
    grade_models["random_forest_models"][grade] = {
        'model': grid_rf.best_estimator_,
        'features': list(X_grade.columns),
        'accuracy': grid_rf.best_score_,
        'best_params': grid_rf.best_params_,
        'valid_programmes': list(valid_programmes)
    }

    print(f"年级 {grade} 模型准确率: {grid_rf.best_score_:.4f}")



开始按年级训练随机森林模型...
开始训练年级 3 的模型
年级 3 的有效系别数量不足，将使用PCA全局模型
开始训练年级 2 的模型
年级 2 模型准确率: 0.5357


融合模型，仅在出现KNN预测出C programme时允许使用knn模型进行分类

In [13]:
# =========== 融合预测函数 ===========
def hybrid_predict(scores, grade, gender):
    """
    结合随机森林、贝叶斯和PCA模型进行预测

    参数:
        scores: 学生的各科成绩
        grade: 学生年级
        gender: 学生性别

    返回:
        预测的系别
        各系别概率字典
        使用的模型类型 ('bayes', 'rf', 'pca', 'ensemble')
    """
    rf_probs = {}
    bayes_probs = {}

    # 创建基础特征字典
    eps = 0.0000001
    feature_dict = {
        'Q1': scores[0], 'Q2': scores[1], 'Q3': scores[2],
        'Q4': scores[3], 'Q5': scores[4], 'Gender': gender
    }

    # 计算比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            feature_dict[f'Q{i}_to_Q{j}'] = scores[i - 1] / (scores[j - 1] + eps)

    # 计算统计特征
    feature_dict['mean_score'] = np.mean(scores)
    feature_dict['std_score'] = np.std(scores)
    feature_dict['range_score'] = max(scores) - min(scores)
    feature_dict['cv_score'] = feature_dict['std_score'] / (feature_dict['mean_score'] + eps)

    # 归一化分数
    max_scores = [8, 8, 14, 10, 6]
    for i in range(1, 6):
        feature_dict[f'Q{i}_norm'] = scores[i - 1] / max_scores[i - 1]

    # 创建完整特征DataFrame
    X_full = pd.DataFrame([feature_dict])

    # 1. 尝试使用随机森林模型
    if grade in grade_models['random_forest_models']:
        model_info = grade_models['random_forest_models'][grade]
        rf_model = model_info['model']

        # 选择模型需要的特征列
        feature_cols = model_info['features']
        # 确保所有需要的特征列都存在
        missing_cols = [col for col in feature_cols if col not in X_full.columns]
        if missing_cols:
            print(f"警告：随机森林模型缺少特征列: {missing_cols}")
        else:
            X_input = X_full[feature_cols]
            # 预测概率
            probs_array = rf_model.predict_proba(X_input)
            classes = rf_model.classes_
            rf_probs = {cls: prob for cls, prob in zip(classes, probs_array[0])}

    # 2. 尝试使用贝叶斯模型
    if grade in grade_models['bayes_models']:
        model_info = grade_models['bayes_models'][grade]
        bayes_model = model_info['model']

        # 选择模型需要的特征列
        feature_cols = model_info['features']
        # 确保所有需��的特征列都存在
        missing_cols = [col for col in feature_cols if col not in X_full.columns]
        if missing_cols:
            print(f"警告：贝叶斯模型缺少特征列: {missing_cols}")
        else:
            X_input = X_full[feature_cols]
            # 预测概率
            probs_array = bayes_model.predict_proba(X_input)
            classes = bayes_model.classes_
            bayes_probs = {cls: prob for cls, prob in zip(classes, probs_array[0])}

    # 3. 使用PCA模型进行预测
    pca_model = grade_models['pca_model']['model']['model']
    pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4],
                             grade, gender]],
                           columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])

    pca_probs_array = pca_model.predict_proba(pca_input)
    pca_classes = pca_model.classes_
    pca_probs = {cls: prob for cls, prob in zip(pca_classes, pca_probs_array[0])}

    # 4. 融合模型预测结果
    # 如果只有PCA模型可用
    if not rf_probs and not bayes_probs:
        final_probs = pca_probs
        model_type = 'pca'
    else:
        # 融合多个模型的结果
        final_probs = {}
        all_classes = set()

        # 收集所有可能的类别
        if rf_probs:
            all_classes.update(rf_probs.keys())
        if bayes_probs:
            all_classes.update(bayes_probs.keys())
        all_classes.update(pca_probs.keys())

        # 设置模型权重
        weights = {
            'rf': 0.6,
            'bayes': 0.1,
            'pca': 0.3
        }

        # 对每个类别进行加权融合
        for cls in all_classes:
            weighted_sum = 0
            total_weight = 0

            # 对C类别的特殊处理
            if cls == 'C':
                weights['pca'] = 0.7  # 提高PCA权重
                weights['rf'] = 0.2
                weights['bayes'] = 0.1
            else:
                weights['pca'] = 0.3
                weights['rf'] = 0.6
                weights['bayes'] = 0.1

            # 随机森林概率
            if rf_probs and cls in rf_probs:
                weighted_sum += weights['rf'] * rf_probs[cls]
                total_weight += weights['rf']

            # 贝叶斯概率
            if bayes_probs and cls in bayes_probs:
                weighted_sum += weights['bayes'] * bayes_probs[cls]
                total_weight += weights['bayes']

            # PCA概率
            if cls in pca_probs:
                weighted_sum += weights['pca'] * pca_probs[cls]
                total_weight += weights['pca']

            # 计算最终概率
            if total_weight > 0:
                final_probs[cls] = weighted_sum / total_weight

        model_type = 'ensemble'

    # 获取最大概率的系别
    predicted = max(final_probs, key=final_probs.get)
    return predicted, final_probs, model_type

最后测试模型效果

In [14]:
# =========== 测试各模型效果 ===========

# 读取测试数据
test_df = pd.read_csv("./unique_test_data.csv")

# 系别映射（如果需要）
if test_df['Programme'].dtype == 'int64' or test_df['Programme'].iloc[0] in [1, 2, 3, 4]:
    test_df['Programme'] = test_df['Programme'].map(mapping)

# 提取测试集特征和标签
X_test = test_df[['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender']]
y_test = test_df['Programme']

# 1. PCA模型评估
print("\n===== PCA模型测试集结果 =====")
pca_model = grade_models['pca_model']['model']['model']
pca_predictions = pca_model.predict(X_test)
print("混淆矩阵:")
print(confusion_matrix(y_test, pca_predictions))
print("分类报告:")
print(classification_report(y_test, pca_predictions))
print(f"准确率: {accuracy_score(y_test, pca_predictions):.4f}")

# 2. 贝叶斯模型评估
print("\n===== 贝叶斯模型测试结果 =====")
bayes_predictions = []

for _, row in X_test.iterrows():
    grade = row['Grade']
    scores = [row['Q1'], row['Q2'], row['Q3'], row['Q4'], row['Q5']]
    gender = row['Gender']

    # 创建特征
    eps = 0.0000001
    feature_dict = {
        'Q1': scores[0], 'Q2': scores[1], 'Q3': scores[2],
        'Q4': scores[3], 'Q5': scores[4], 'Gender': gender
    }

    # 计算比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            feature_dict[f'Q{i}_to_Q{j}'] = scores[i-1] / (scores[j-1] + eps)

    # 计算统计特征
    feature_dict['mean_score'] = np.mean(scores)
    feature_dict['std_score'] = np.std(scores)
    feature_dict['range_score'] = max(scores) - min(scores)
    feature_dict['cv_score'] = feature_dict['std_score'] / (feature_dict['mean_score'] + eps)

    # 归一化分数
    max_scores = [8, 8, 14, 10, 6]
    for i in range(1, 6):
        feature_dict[f'Q{i}_norm'] = scores[i-1] / max_scores[i-1]

    X_input = pd.DataFrame([feature_dict])

    # 尝试使用贝叶斯模型
    if grade in grade_models['bayes_models']:
        model_info = grade_models['bayes_models'][grade]
        model = model_info['model']

        try:
            X_input = X_input[model_info['features']]
            pred = model.predict(X_input)[0]
        except Exception as e:
            # 使用PCA模型时确保使用正确的DataFrame格式
            pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], grade, gender]],
                                    columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])
            pred = pca_model.predict(pca_input)[0]
    else:
        # 使用PCA模型时确保使用正确的DataFrame格式
        pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], grade, gender]],
                                columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])
        pred = pca_model.predict(pca_input)[0]

    bayes_predictions.append(pred)

print("混淆矩阵:")
print(confusion_matrix(y_test, bayes_predictions))
print("分类报告:")
print(classification_report(y_test, bayes_predictions))
print(f"准确率: {accuracy_score(y_test, bayes_predictions):.4f}")

# 3. 随机森林模型评估
print("\n===== 随机森林模型测试结果 =====")
rf_predictions = []

for _, row in X_test.iterrows():
    grade = row['Grade']
    scores = [row['Q1'], row['Q2'], row['Q3'], row['Q4'], row['Q5']]
    gender = row['Gender']

    # 创建特征
    eps = 0.0000001
    feature_dict = {
        'Q1': scores[0], 'Q2': scores[1], 'Q3': scores[2],
        'Q4': scores[3], 'Q5': scores[4], 'Gender': gender
    }

    # 计算比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            feature_dict[f'Q{i}_to_Q{j}'] = scores[i-1] / (scores[j-1] + eps)

    # 计算统计特征
    feature_dict['mean_score'] = np.mean(scores)
    feature_dict['std_score'] = np.std(scores)
    feature_dict['range_score'] = max(scores) - min(scores)
    feature_dict['cv_score'] = feature_dict['std_score'] / (feature_dict['mean_score'] + eps)

    # 归一化分数
    max_scores = [8, 8, 14, 10, 6]
    for i in range(1, 6):
        feature_dict[f'Q{i}_norm'] = scores[i-1] / max_scores[i-1]

    X_input = pd.DataFrame([feature_dict])

    # 尝试使用随机森林模型
    if grade in grade_models['random_forest_models']:
        model_info = grade_models['random_forest_models'][grade]
        model = model_info['model']

        try:
            X_input = X_input[model_info['features']]
            pred = model.predict(X_input)[0]
        except Exception as e:
            # 使用PCA模型时确保使用正确的DataFrame格式
            pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], grade, gender]],
                                    columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])
            pred = pca_model.predict(pca_input)[0]
    else:
        # 使用PCA模型时确保使用正确的DataFrame格式
        pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], grade, gender]],
                                columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])
        pred = pca_model.predict(pca_input)[0]

    rf_predictions.append(pred)

print("混淆矩阵:")
print(confusion_matrix(y_test, rf_predictions))
print("分类报告:")
print(classification_report(y_test, rf_predictions))
print(f"准确率: {accuracy_score(y_test, rf_predictions):.4f}")


===== PCA模型测试集结果 =====
混淆矩阵:
[[26  6  0 13]
 [ 9  4  0 15]
 [ 0  0  9  0]
 [14  7  0 28]]
分类报告:
              precision    recall  f1-score   support

           A       0.53      0.58      0.55        45
           B       0.24      0.14      0.18        28
           C       1.00      1.00      1.00         9
           D       0.50      0.57      0.53        49

    accuracy                           0.51       131
   macro avg       0.57      0.57      0.57       131
weighted avg       0.49      0.51      0.50       131

准确率: 0.5115

===== 贝叶斯模型测试结果 =====
混淆矩阵:
[[28  3  0 14]
 [ 7 10  0 11]
 [ 0  0  9  0]
 [11  5  0 33]]
分类报告:
              precision    recall  f1-score   support

           A       0.61      0.62      0.62        45
           B       0.56      0.36      0.43        28
           C       1.00      1.00      1.00         9
           D       0.57      0.67      0.62        49

    accuracy                           0.61       131
   macro avg       0.68      0.66 

这个模型的准确率在60%左右，且对于特殊类别的预测效果较好。我们可以使用这个模型进行预测。

In [15]:
import joblib
import os
import pickle

# 创建模型导出函数
def export_models():
    # 创建存储目录
    os.makedirs('./exported_models', exist_ok=True)



    # 1. 保存PCA模型 - 直接保存模型对象
    joblib.dump(grade_models['pca_model']['model']['model'],
                './exported_models/pca_model.joblib')
    print("PCA模型已保存")

    # 2. 保存随机森林模型
    rf_models = {}
    for grade, model_info in grade_models['random_forest_models'].items():
        rf_models[grade] = {
            'model': model_info['model'],
            'features': model_info['features']
        }
    joblib.dump(rf_models, './exported_models/random_forest_models.joblib')
    print("随机森林模型已保存")

    # 3. 保存贝叶斯模型 - 从Pipeline中提取分类器
    bayes_data = {}
    for grade, model_info in grade_models['bayes_models'].items():
        # 从Pipeline获取真正的分类器
        classifier = model_info['model'].named_steps['classifier']
        bayes_data[grade] = {
            'models': classifier.models,
            'priors': classifier.priors,
            'classes': classifier.classes_,
            'features': model_info['features']
        }
    joblib.dump(bayes_data, './exported_models/bayes_models.joblib')
    print("贝叶斯模型已保存")

    # 导出混合预测函数
    with open('./exported_models/hybrid_predict.py', 'w', encoding='utf-8') as f:
        f.write("""
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, RBF
from scipy import stats
from flexBYS import FlexibleBayesClassifier

def load_models(models_dir='../exported_models'):
    models = {}
    models['pca'] = joblib.load(os.path.join(models_dir, 'pca_model.joblib'))
    models['random_forest'] = joblib.load(os.path.join(models_dir, 'random_forest_models.joblib'))
    models['bayes'] = joblib.load(os.path.join(models_dir, 'bayes_models.joblib'))
    return models

def create_advanced_features(df_input):
    eps = 0.0000001
    # 基础特征
    X = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Gender']].copy()

    # 题目得分比例特征
    for i in range(1, 6):
        for j in range(i + 1, 6):
            X[f'Q{i}_to_Q{j}'] = df_input[f'Q{i}'] / (df_input[f'Q{j}'] + eps)

    # 统计特征
    X['mean_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].mean(axis=1)
    X['std_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].std(axis=1)
    X['range_score'] = df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].max(axis=1) - df_input[['Q1', 'Q2', 'Q3', 'Q4', 'Q5']].min(axis=1)
    X['cv_score'] = X['std_score'] / (X['mean_score'] + eps)

    # 归一化题目分数
    max_scores = [8, 8, 14, 10, 6]
    for i in range(1, 6):
        X[f'Q{i}_norm'] = df_input[f'Q{i}'] / max_scores[i-1]

    return X

def _score_bayes_sample(x, model_dict, feature_indices):
    \"\"\"计算样本在贝叶斯模型中的对数似然\"\"\"
    log_likelihood = 0.0

    for feature_idx in feature_indices:
        if feature_idx not in model_dict:
            continue

        model = model_dict[feature_idx]
        value = x[feature_idx]

        # 根据分布类型计算似然
        if hasattr(model, 'score_samples'):  # 如果模型有score_samples方法（如GaussianMixture）
            ll = model.score_samples(np.array([[value]]))
            log_likelihood += ll[0]

        elif isinstance(model, dict) and model.get('distribution') == 'log_normal':
            # 对数正态分布
            params = model['params']
            if value <= 0:
                log_likelihood += np.log(1e-10)
            else:
                ll = stats.lognorm.logpdf(value, *params)
                log_likelihood += ll

    return log_likelihood

def hybrid_predict(scores, grade, gender, models=None):
    \"\"\"
    使用混合模型进行预测

    参数:
        scores: 列表，包含5个成绩 [Q1, Q2, Q3, Q4, Q5]
        grade: 整数，年级
        gender: 整数，性别
        models: 可选，已加载的模型字典

    返回:
        predicted: 预测的系别
        probabilities: 各系别概率
        model_info: 模型使用信息
    \"\"\"
    if models is None:
        models = load_models()

    pca_model = models['pca']
    rf_models = models['random_forest']
    bayes_models = models['bayes']

    # 模型预测结果
    predictions = {}

    # 1. PCA模型预测
    pca_input = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], grade, gender]],
                           columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Grade', 'Gender'])

    try:
        pca_probs = pca_model.predict_proba(pca_input)[0]
        pca_classes = pca_model.classes_
        pca_probs_dict = {cls: prob for cls, prob in zip(pca_classes, pca_probs)}
        predictions['pca'] = {
            'predicted': pca_model.predict(pca_input)[0],
            'probabilities': pca_probs_dict
        }
    except Exception as e:
        print(f"PCA预测错误: {e}")
        predictions['pca'] = {'predicted': None, 'probabilities': {}}

    # 2. 随机森林模型预测
    if grade in rf_models:
        model_info = rf_models[grade]
        rf_model = model_info['model']

        # 创建特征
        single_df = pd.DataFrame([[scores[0], scores[1], scores[2], scores[3], scores[4], gender]],
                               columns=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Gender'])
        X_features = create_advanced_features(single_df)

        try:
            # 确保特征列顺序一致
            X_input = X_features[model_info['features']]

            rf_probs = rf_model.predict_proba(X_input)[0]
            rf_classes = rf_model.classes_
            rf_probs_dict = {cls: prob for cls, prob in zip(rf_classes, rf_probs)}
            predictions['rf'] = {
                'predicted': rf_model.predict(X_input)[0],
                'probabilities': rf_probs_dict
            }
        except Exception as e:
            print(f"随机森林预测错误: {e}")
            predictions['rf'] = {'predicted': None, 'probabilities': {}}
    else:
        predictions['rf'] = {'predicted': None, 'probabilities': {}}

    # 3. 贝叶斯模型预测
    if grade in bayes_models:
        bayes_model_info = bayes_models[grade]

        try:
            # 创建特征向量
            features_array = np.array(scores + [gender])  # 将���有特征连接成一维数组

            # 获取类别列表
            classes = bayes_model_info.get('classes', [])

            if len(classes)>0:
                # 计算每个类别的对数似然
                log_probs = {}
                for cls in classes:
                    if cls in bayes_model_info['models']:
                        # 获取该类别的模型和先验
                        models_dict = bayes_model_info['models'][cls]
                        prior = bayes_model_info['priors'].get(cls, 1.0/len(classes))

                        # 计算对数似然
                        log_likelihood = _score_bayes_sample(
                            features_array,
                            models_dict,
                            bayes_model_info['features']
                        )

                        # 贝叶斯公式: P(C|X) ∝ P(X|C) * P(C)
                        log_probs[cls] = log_likelihood + np.log(prior)

                # 标准化概率
                if log_probs:
                    max_log_prob = max(log_probs.values())
                    probs = {
                        cls: np.exp(log_prob - max_log_prob)
                        for cls, log_prob in log_probs.items()
                    }

                    # 归一化
                    total = sum(probs.values())
                    if total > 0:
                        probs = {cls: p/total for cls, p in probs.items()}

                        # 最高概率类别
                        predicted = max(probs.items(), key=lambda x: x[1])[0]

                        predictions['bayes'] = {
                            'predicted': predicted,
                            'probabilities': probs
                        }
                    else:
                        predictions['bayes'] = {'predicted': None, 'probabilities': {}}
                else:
                    predictions['bayes'] = {'predicted': None, 'probabilities': {}}
            else:
                predictions['bayes'] = {'predicted': None, 'probabilities': {}}
        except Exception as e:
            print(f"贝叶斯预测错误: {e}")
            predictions['bayes'] = {'predicted': None, 'probabilities': {}}
    else:
        predictions['bayes'] = {'predicted': None, 'probabilities': {}}

    # 融合预测结果
    # 按系别合并概率
    all_programmes = set()
    for model_type in predictions.keys():
        all_programmes.update(predictions[model_type]['probabilities'].keys())

    # 设置模型权重
    weights = {
        'pca': 0.3,
        'rf': 0.6,
        'bayes': 0.1
    }

    # 对C类别的特殊处理
    predictions_contain_c = False
    for model_type in predictions.keys():
        if 'C' in predictions[model_type]['probabilities']:
            c_prob = predictions[model_type]['probabilities']['C']
            if c_prob > 0.4:  # 如果某个模型对C类预测概率较高
                predictions_contain_c = True

    if predictions_contain_c:
        weights['pca'] = 0.7  # 提高PCA模型权重
        weights['rf'] = 0.2
        weights['bayes'] = 0.1

    # 计算加权概率
    final_probs = {}
    for prog in all_programmes:
        weighted_sum = 0
        total_weight = 0

        for model_type, model_result in predictions.items():
            if prog in model_result['probabilities']:
                weighted_sum += model_result['probabilities'][prog] * weights[model_type]
                total_weight += weights[model_type]

        if total_weight > 0:
            final_probs[prog] = weighted_sum / total_weight

    # 获取最高概率的系别
    if final_probs:
        predicted = max(final_probs.items(), key=lambda x: x[1])[0]
    else:
        predicted = None

    return predicted, final_probs, predictions

# 测试函数
# 测试函数
if __name__ == '__main__':
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

    # 设置中文显示
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Windows黑体
    plt.rcParams['axes.unicode_minus'] = False    # 正常显示负号

    # 加载模型
    models = load_models()

    # 加载测试数据
    test_data_path = '../unique_test_data.csv'
    if not os.path.exists(test_data_path):
        test_data_path = './unique_test_data.csv'

    test_df = pd.read_csv(test_data_path)
    print(f"加载测试数���: {test_data_path}, 样本数: {len(test_df)}")

    # 检查是否需要转换系别编码
    mapping = {'1': 'A', '2': 'B', '3': 'C', '4': 'D'}
    if test_df['Programme'].dtype == 'int64' or str(test_df['Programme'].iloc[0]) in ['1', '2', '3', '4']:
        print("检测到系别使用数字编码，进行转换...")
        test_df['Programme'] = test_df['Programme'].astype(str).map(mapping)
        print(f"系别转换后的分布: \\n{test_df['Programme'].value_counts()}")

    # 初始化结果列表
    results = {
        'actual': [],
        'predicted': [],
        'correct': [],
        'probabilities': []
    }

    # 对测试数据进行预测
    print("\\n开始对测试数据进行预测...")

    for _, student in test_df.iterrows():
        scores = [student['Q1'], student['Q2'], student['Q3'], student['Q4'], student['Q5']]
        grade = student['Grade']
        gender = student['Gender']
        actual_programme = student['Programme']

        # 预测系别
        predicted, probs, _ = hybrid_predict(scores, grade, gender, models)
        is_correct = predicted == actual_programme

        # 保存结果
        results['actual'].append(actual_programme)
        results['predicted'].append(predicted)
        results['correct'].append(is_correct)
        results['probabilities'].append(probs)

    # 计算总体准确率
    correct_count = sum(results['correct'])
    total_count = len(results['correct'])
    overall_accuracy = correct_count / total_count if total_count > 0 else 0
    print(f"\\n总体准确率: {overall_accuracy:.4f} ({correct_count}/{total_count})")

    # 按年级计算准确率
    for grade in sorted(test_df['Grade'].unique()):
        grade_mask = test_df['Grade'] == grade
        grade_indices = test_df[grade_mask].index
        grade_correct = sum([results['correct'][i] for i in range(len(results['correct']))
                          if i in grade_indices])
        grade_total = len(grade_indices)
        grade_accuracy = grade_correct / grade_total if grade_total > 0 else 0
        print(f"年级 {grade} 预测准确率: {grade_accuracy:.4f} ({grade_correct}/{grade_total})")

    # 创建混淆矩阵
    all_programmes = sorted(list(set(results['actual']) | set(results['predicted'])))
    cm = confusion_matrix(results['actual'], results['predicted'], labels=all_programmes)

    # 打印混淆矩阵
    print("\\n混淆矩阵:")
    print(pd.DataFrame(cm, index=all_programmes, columns=all_programmes))

    # 打印分类报告
    print("\\n分类报告:")
    print(classification_report(results['actual'], results['predicted']))

    # 绘制混淆矩阵热图
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=all_programmes,
                yticklabels=all_programmes)
    plt.title('测试集预测混淆矩阵')
    plt.xlabel('预测系别')
    plt.ylabel('实际系别')
    plt.tight_layout()
    plt.savefig('./confusion_matrix.png')
    print("\\n混淆矩阵热图已保存至 './confusion_matrix.png'")

    # 显示图形
    try:
        plt.show()
    except Exception as e:
        print(f"无法显示图形: {e}")
""")

    print("混合预测函数已保存到 './exported_models/hybrid_predict.py'")

    # 导出README说明
    with open('./exported_models/README.md', 'w', encoding='utf-8') as f:
        f.write("""# 专业预测混合模型

## 模型文件说明
- pca_model.joblib: PCA降��+分类器模型
- random_forest_models.joblib: 按年级训练的随机森林模型
- bayes_models.joblib: 按年级训练的贝叶斯模型
- hybrid_predict.py: 模型加载与混合预测功能

## 使用方法
```python
from hybrid_predict import hybrid_predict, load_models

# 加载模型
models = load_models()

# 预测示例
scores = [7, 5, 10, 8, 4]  # 示例成绩 [Q1, Q2, Q3, Q4, Q5]
grade = 2                  # 年级
gender = 1                 # 性别

# 进行预测
predicted, probabilities, model_info = hybrid_predict(scores, grade, gender, models)
print(f"预测系别: {predicted}")
print(f"各系别概率: {probabilities}")""")

export_models()

PCA模型已保存
随机森林模型已保存
贝叶斯模型已保存
混合预测函数已保存到 './exported_models/hybrid_predict.py'
