# Task 1: Response to Data analysis and feature engineering
使用pandas导入训练数据集，并分析特征和标签的分布情况。

In [None]:
import platform
import math
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# 导入缺失的库
from sklearn.decomposition import FastICA
import seaborn as sns
# 导入聚类相关的库
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
import seaborn as sns
from sklearn.decomposition import FastICA, TruncatedSVD
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings



# Seed NumPy's global RNG so randomised steps are repeatable.
np.random.seed(42)

# Choose a font able to render the Chinese plot labels on the current OS.
# On Linux a CJK font (e.g. Noto Sans CJK SC) may have to be installed first.
_CJK_FONT_BY_SYSTEM = {
    'Darwin': 'Arial Unicode MS',
    'Windows': 'SimHei',
    'Linux': 'Noto Sans CJK SC',
}
system = platform.system()
# Unrecognised platforms fall back to SimHei.
plt.rcParams['font.sans-serif'] = [_CJK_FONT_BY_SYSTEM.get(system, 'SimHei')]
# CJK fonts may lack the Unicode minus glyph; use the ASCII hyphen instead so
# negative tick labels (common after standardisation) are drawn correctly.
plt.rcParams['axes.unicode_minus'] = False

# Load the training data.
df = pd.read_csv("./student_data.csv")

# Container for models; created up front with its full key structure.
grade_models = {
    'pca_model': {},
    'bayes_models': {},
    'random_forest_models': {}
}

# Map numeric programme codes to letters when the column is still numeric.
mapping = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
if df['Programme'].dtype == 'int64' or df['Programme'].iloc[0] in [1, 2, 3, 4]:
    df['Programme'] = df['Programme'].map(mapping)

# 数据处理和特征提取函数
def process_data(df, mode='train', preprocessors=None):
    """Build standardised feature sets from a student data frame.

    Three feature sets are produced from the numeric (float64/int64) columns:
    '考试分数' (exam-score columns), '去除年级' (gender/total/Q1-Q5 style
    columns) and '全部特征' (all numeric columns except programme columns).
    Each set is standardised with its own ``StandardScaler``.

    Args:
        df: Input data frame. An 'Index' column is dropped if present.
        mode: 'train' selects columns and fits new scalers; any other value
            is treated as test mode and reuses ``preprocessors``.
        preprocessors: Dict returned by a previous 'train' call. Ignored in
            train mode (a fresh dict is created); required otherwise.

    Returns:
        In train mode ``(feature_sets, preprocessors)``; otherwise just
        ``feature_sets``. ``feature_sets`` maps set name -> 2-D array.

    Raises:
        ValueError: If ``mode`` is not 'train' and ``preprocessors`` is None.
    """
    is_train = mode == 'train'
    if not is_train and preprocessors is None:
        raise ValueError(
            "preprocessors from a previous mode='train' call is required "
            "when mode is not 'train'"
        )

    # Drop the index column if present.
    if 'Index' in df.columns:
        df = df.drop('Index', axis=1)

    print(f"{mode}数据集形状: {df.shape}")
    print(f"\n{mode}数据集前5行:")
    print(df.head())

    # Keep numeric features only.
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    numeric_cols = numeric_df.columns

    feature_sets = {}
    if is_train:
        preprocessors = {'scalers': {}}

    # Feature set 1: exam scores.
    if is_train:
        exam_cols = [col for col in numeric_cols if 'Q' in col]
        if not exam_cols:
            # No column name contains 'Q': fall back to the last five columns.
            exam_cols = numeric_cols[-5:].tolist()
        preprocessors['exam_cols'] = exam_cols
    else:
        # Always reuse the training columns so the data matches the fitted
        # scaler, instead of re-deriving them from the test frame.
        exam_cols = preprocessors['exam_cols']

    available_exam_cols = [col for col in exam_cols if col in numeric_cols]
    if len(available_exam_cols) != len(exam_cols):
        print(f"警告: 测试数据缺少一些考试分数列，使用可用的{len(available_exam_cols)}列")
    feature_sets['考试分数'] = numeric_df[available_exam_cols].values

    # Feature set 2: basic student information (grade excluded).
    basic_patterns = ['性别', 'Gender', 'sex', 'Total', '总分', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5']
    if is_train:
        matched = [
            col
            for pat in basic_patterns
            for col in numeric_cols
            if pat.lower() in col.lower()
        ]
        # De-duplicate while keeping first-seen order.
        basic_cols = list(dict.fromkeys(matched))
        if not basic_cols:
            basic_cols = numeric_cols[:2].tolist()
        preprocessors['basic_cols'] = basic_cols
    else:
        basic_cols = preprocessors['basic_cols']

    available_basic_cols = [col for col in basic_cols if col in numeric_cols]
    feature_sets['去除年级'] = numeric_df[available_basic_cols].values

    # Feature set 3: every numeric feature except programme columns.
    if is_train:
        # 'program' is a substring of 'programme', so one check covers both.
        programme_cols = [col for col in numeric_cols if 'program' in col.lower()]
        preprocessors['programme_cols'] = programme_cols
        all_cols = [col for col in numeric_cols if col not in programme_cols]
        # Remember the exact training columns so test data is aligned to them.
        preprocessors['all_cols'] = all_cols
    else:
        programme_cols = preprocessors['programme_cols']
        all_cols = preprocessors.get('all_cols')
        if all_cols is None:
            # Preprocessors created before 'all_cols' was stored: fall back to
            # excluding the saved programme columns from the test frame.
            all_cols = [col for col in numeric_cols if col not in programme_cols]

    available_all_cols = [col for col in all_cols if col in numeric_cols]
    feature_sets['全部特征'] = numeric_df[available_all_cols].values

    # Standardise each feature set.
    for name, data in list(feature_sets.items()):
        if is_train:
            # Fit a new scaler and keep it for later test-mode calls.
            scaler = StandardScaler()
            feature_sets[name] = scaler.fit_transform(data)
            preprocessors['scalers'][name] = scaler
        elif name in preprocessors['scalers']:
            feature_sets[name] = preprocessors['scalers'][name].transform(data)
        else:
            # Best effort: report the problem and leave this set unscaled.
            print(f"错误: 没有找到特征集{name}的预处理器")

    if is_train:
        return feature_sets, preprocessors
    return feature_sets

# 训练阶段


# 测试阶段(示例) - 预测时使用
# test_feature_sets = process_data(test_df, mode='test', preprocessors=preprocessors)


导入数据后，查看数据集的基本信息，并检查缺失值情况。
## 特征转换
应用三种不同的数据转换方法：标准化缩放、PCA降维和独立成分分析(ICA)

In [None]:

# Drop the index column if present.
if 'Index' in df.columns:
    df = df.drop('Index', axis=1)

print(f"数据集形状: {df.shape}")
print("\n数据集前5行:")
display(df.head())
print("\n数据集信息:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the output.
df.info()
print(f"\n缺失值情况:")
display(df.isnull().sum())

# Keep numeric features only.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
feature_sets, preprocessors = process_data(df, mode='train')

print("创建的特征集:")
for name, features in feature_sets.items():
    print(f"- {name}: 形状 {features.shape}")

# Inspect each feature set. process_data() has already standardised the data
# and stored the fitted scalers in `preprocessors`, so no second scaler is
# fitted here; that would detach the data from the saved scalers.
for name, data in feature_sets.items():
    print(f"\n特征集: {name}")
    print(f"数据集形状: {data.shape}")
    print("前5行:")
    print(pd.DataFrame(data[:5]))
print("\n数据集统计描述:")
display(df.describe())

## 特征转换
先对每个特征集进行缩放（归一化、标准化等），再对缩放后的数据分别应用PCA降维、独立成分分析(ICA)和t-SNE。

In [None]:


from sklearn.manifold import TSNE

# Re-key the feature sets: every input set yields one entry per scaler.
feature_set = feature_sets
feature_sets = {}
for name, X in feature_set.items():
    # Transform 1: min-max normalisation.
    min_max_scaler = MinMaxScaler()
    X_minmax = min_max_scaler.fit_transform(X)
    feature_sets['归一化_' + name] = X_minmax
    print(f"{name} 已完成归一化转换")

    # Transform 2: standardisation.
    std_scaler = StandardScaler()
    X_std = std_scaler.fit_transform(X)
    feature_sets['标准化_' + name] = X_std
    print(f"{name} 已完成标准化转换")

    # Transform 3: robust scaling (less sensitive to outliers). This step
    # previously ran MinMaxScaler a second time, which only duplicated
    # transform 1 under another key.
    robust_scaler = RobustScaler()
    X_robust = robust_scaler.fit_transform(X)
    feature_sets['鲁棒缩放_' + name] = X_robust
    print(f"{name} 已完成鲁棒缩放转换")

# Hand the scaled sets to the next stage and start a fresh result dict.
feature_set = feature_sets
feature_sets = {}

# For every scaled feature set, store a PCA, a FastICA and a t-SNE projection.
for name, X_scaled in feature_set.items():
    # PCA and ICA both keep at most 10 components.
    n_keep = min(X_scaled.shape[1], 10)

    # PCA projection plus a report of the variance it explains.
    pca = PCA(n_components=n_keep)
    X_pca = pca.fit_transform(X_scaled)
    feature_sets['PCA_' + name] = X_pca
    variance_ratio = pca.explained_variance_ratio_
    print(f"{name} PCA解释方差比: {variance_ratio}")
    print(f"{name} PCA累计方差占比: {np.sum(variance_ratio):.4f}")

    # Independent component analysis.
    ica = FastICA(n_components=n_keep, random_state=42)
    X_ica = ica.fit_transform(X_scaled)
    feature_sets['ICA_' + name] = X_ica
    print(f"{name} 已完成ICA转换")

    # Two-dimensional t-SNE embedding.
    tsne = TSNE(n_components=2, random_state=42, init='random', learning_rate='auto')
    X_tsne = tsne.fit_transform(X_scaled)
    feature_sets['TSNE_' + name] = X_tsne
    print(f"{name} 已完成t-SNE转换")

# Draw every feature set in a grid that is three panels wide.
cols = 3
n = len(feature_sets)
rows = math.ceil(n / cols)
plt.figure(figsize=(5 * cols, 5 * rows))
for panel, (fname, data) in enumerate(feature_sets.items(), start=1):
    plt.subplot(rows, cols, panel)
    if data.shape[1] <= 1:
        # A single dimension: show its histogram.
        plt.hist(data[:, 0], bins=20)
        plt.title(f"{fname}分布")
        continue
    # Two or more dimensions: scatter the first two.
    plt.scatter(data[:, 0], data[:, 1], alpha=0.5)
    plt.title(f"{fname} (前两个维度)")
plt.tight_layout()
plt.show()

## 聚类评估函数
定义用于评估聚类结果的性能指标函数

In [None]:
# 第四个代码单元格 - 评估函数
def evaluate_clustering(X, labels, name):
    """Compute internal clustering quality metrics for one labelling.

    Each metric is computed independently. If a metric call raises, a
    sentinel is stored instead so one failure does not abort the evaluation.

    Args:
        X: Feature matrix the clustering was run on.
        labels: Cluster label for each row of ``X``.
        name: Description of the clustering run, returned under 'method'.

    Returns:
        Dict with keys 'silhouette_score' (higher is better, -1 on failure),
        'davies_bouldin_score' (lower is better, inf on failure),
        'calinski_harabasz_score' (higher is better, -1 on failure) and
        'method'.
    """
    # `except Exception` keeps the best-effort behaviour but, unlike a bare
    # `except:`, lets KeyboardInterrupt and SystemExit propagate.
    try:
        silhouette = silhouette_score(X, labels)
    except Exception:
        silhouette = -1

    try:
        db_score = davies_bouldin_score(X, labels)
    except Exception:
        db_score = float('inf')

    try:
        ch_score = calinski_harabasz_score(X, labels)
    except Exception:
        ch_score = -1

    return {
        'silhouette_score': silhouette,  # higher is better
        'davies_bouldin_score': db_score,  # lower is better
        'calinski_harabasz_score': ch_score,  # higher is better
        'method': name
    }

## 聚类算法实现
实现三种聚类算法及其不同参数设置：
1. K-means聚类
2. 高斯混合模型(GMM)
3. 层次聚类(Hierarchical Clustering)

In [None]:
# 第五个代码单元格 - K-means聚类
def run_kmeans(X, n_clusters_range=[4], init_methods=['k-means++', 'random']):
    """Fit K-means for every (cluster count, init method) combination.

    Args:
        X: Feature matrix to cluster.
        n_clusters_range: Cluster counts to try.
        init_methods: Values passed as KMeans ``init``.

    Returns:
        List with one dict per successful run: the metrics from
        ``evaluate_clustering`` plus 'labels', 'model', 'n_clusters' and
        'init'. Runs that raise are reported on stdout and skipped.
    """
    results = []
    settings = [(k, init) for k in n_clusters_range for init in init_methods]

    for n_clusters, init in settings:
        name = f"KMeans(n_clusters={n_clusters}, init={init})"
        try:
            model = KMeans(n_clusters=n_clusters, init=init, random_state=42)
            labels = model.fit_predict(X)

            # Score the run and attach the artefacts and parameters.
            result = evaluate_clustering(X, labels, name)
            result.update(labels=labels, model=model, n_clusters=n_clusters, init=init)

            results.append(result)
            print(f"完成: {name}")
        except Exception as e:
            print(f"错误 {name}: {str(e)}")

    return results

In [None]:
# 第六个代码单元格 - 高斯混合模型
def run_gmm(X, n_components_range=[2, 3, 4, 5, 6, 7, 8], covariance_types=['full', 'tied', 'diag', 'spherical']):
    """Fit a Gaussian mixture for every (components, covariance) combination.

    Args:
        X: Feature matrix to cluster.
        n_components_range: Numbers of mixture components to try.
        covariance_types: Values passed as ``covariance_type``.

    Returns:
        List with one dict per successful run: the metrics from
        ``evaluate_clustering`` plus 'labels', 'model', 'n_components' and
        'covariance_type'. Runs that raise are reported on stdout and skipped.
    """
    results = []
    settings = [(k, cov) for k in n_components_range for cov in covariance_types]

    for n_components, cov_type in settings:
        name = f"GMM(n_components={n_components}, covariance_type={cov_type})"
        try:
            model = GaussianMixture(n_components=n_components, covariance_type=cov_type, random_state=42)
            labels = model.fit_predict(X)

            # Score the run and attach the artefacts and parameters.
            result = evaluate_clustering(X, labels, name)
            result.update(
                labels=labels,
                model=model,
                n_components=n_components,
                covariance_type=cov_type,
            )

            results.append(result)
            print(f"完成: {name}")
        except Exception as e:
            print(f"错误 {name}: {str(e)}")

    return results

In [None]:
# 第七个代码单元格 - 层次聚类
def run_hierarchical(X, n_clusters_range=[4], linkage_methods=['ward', 'complete', 'average', 'single']):
    """Run agglomerative clustering for every (cluster count, linkage) pair.

    Args:
        X: Feature matrix to cluster.
        n_clusters_range: Cluster counts to try.
        linkage_methods: Values passed as ``linkage``.

    Returns:
        List with one dict per successful run: the metrics from
        ``evaluate_clustering`` plus 'labels', 'model', 'n_clusters' and
        'linkage'. Runs that raise are reported on stdout and skipped.
    """
    results = []

    for n_clusters in n_clusters_range:
        for linkage in linkage_methods:
            name = f"HC(n_clusters={n_clusters}, linkage={linkage})"
            try:
                # The distance is left at the estimator's default (Euclidean,
                # the only one 'ward' supports). The former `affinity=`
                # keyword was deprecated in scikit-learn 1.2 and removed in
                # 1.4, where passing it raises TypeError and every run here
                # would be skipped by the handler below.
                model = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
                labels = model.fit_predict(X)

                # Score the run and attach the artefacts and parameters.
                result = evaluate_clustering(X, labels, name)
                result['labels'] = labels
                result['model'] = model
                result['n_clusters'] = n_clusters
                result['linkage'] = linkage

                results.append(result)
                print(f"完成: {name}")
            except Exception as e:
                print(f"错误 {name}: {str(e)}")

    return results

## 运行聚类实验
对每种特征集运行三种聚类算法，并尝试不同的参数设置

In [None]:
# 第八个代码单元格 - 运行所有实验
# 为了限制运行时间，可以减少参数组合
# Parameter grids, kept small to limit run time.
n_clusters_range = [4]                  # cluster counts to try
init_methods = ['k-means++']            # K-means initialisation methods
covariance_types = ['full', 'tied']     # GMM covariance types
linkage_methods = ['ward', 'complete']  # hierarchical linkage methods

# Run the three algorithms on every feature set and collect the results.
all_results = {}

for feature_name, X_transformed in feature_sets.items():
    print(f"\n处理特征集: {feature_name}")

    kmeans_results = run_kmeans(X_transformed, n_clusters_range, init_methods)
    gmm_results = run_gmm(X_transformed, n_clusters_range, covariance_types)
    hc_results = run_hierarchical(X_transformed, n_clusters_range, linkage_methods)

    # Store the per-algorithm result lists under the feature-set name.
    all_results[feature_name] = dict(
        kmeans=kmeans_results,
        gmm=gmm_results,
        hierarchical=hc_results,
    )

## 结果分析
找出每种特征集和每种聚类方法的最佳结果

In [None]:
# 第九个代码单元格 - 找出最佳结果
# 5. 找出每种特征集和每种聚类方法的最佳结果
# For each feature set and clustering method keep the run with the highest
# silhouette score.
best_results = {}

for feature_name, methods in all_results.items():
    best_for_feature = best_results.setdefault(feature_name, {})

    for method_name, results in methods.items():
        # Skip unknown method keys and methods without any successful run.
        if method_name not in ('kmeans', 'gmm', 'hierarchical') or not results:
            continue
        # Sort descending by silhouette score (higher is better).
        sorted_results = sorted(results, key=lambda run: run['silhouette_score'], reverse=True)
        best_for_feature[method_name] = sorted_results[0]

In [None]:
# 第十个代码单元格 - 创建结果表格
# 6. 创建结果表格
# One table row per (feature set, method) best result.
results_table = []

# Method-specific (column label, result key) pairs appended to each row.
extra_columns = {
    'kmeans': (('聚类数', 'n_clusters'), ('初始化方法', 'init')),
    'gmm': (('聚类数/组件数', 'n_components'), ('协方差类型', 'covariance_type')),
    'hierarchical': (('聚类数', 'n_clusters'), ('链接方法', 'linkage')),
}

for feature_name, methods in best_results.items():
    for method_name, result in methods.items():
        # Columns shared by every method.
        row = {
            '特征集': feature_name,
            '聚类方法': method_name,
            '轮廓系数': result['silhouette_score'],
            'Davies-Bouldin': result['davies_bouldin_score'],
            'Calinski-Harabasz': result['calinski_harabasz_score']
        }
        for column_label, result_key in extra_columns.get(method_name, ()):
            row[column_label] = result[result_key]

        results_table.append(row)

# Convert the rows to a DataFrame and show it.
results_df = pd.DataFrame(results_table)
print("\n聚类结果表:")
display(results_df)

## 可视化聚类结果
可视化展示每种特征集和聚类方法的最佳聚类结果

In [None]:
# 第十一个代码单元格 - 可视化最佳结果
# 8. 可视化每种特征集和聚类方法的最佳结果
# Plot the best clustering of every method for every feature set.
for feature_name, methods in best_results.items():
    if not methods:
        continue

    # The 2-D view depends only on the feature set, so it is computed once
    # here rather than being refitted for every clustering method.
    X_features = feature_sets[feature_name]
    if X_features.shape[1] > 2:
        # More than two dimensions: project to 2-D with PCA for plotting.
        vis_pca = PCA(n_components=2)
        X_vis = vis_pca.fit_transform(X_features)
    else:
        X_vis = X_features

    for method_name, result in methods.items():
        plt.figure(figsize=(8, 6))
        plt.scatter(X_vis[:, 0], X_vis[:, 1], c=result['labels'], cmap='viridis', alpha=0.8, s=50)
        plt.title(f'特征集: {feature_name}, 聚类方法: {method_name}\n轮廓系数: {result["silhouette_score"]:.4f}')
        plt.colorbar(label='聚类标签')
        plt.tight_layout()
        plt.show()

In [None]:
# 第十二个代码单元格 - 汇总表格
# 9. 生成最佳结果的汇总表格
# Summary of the best results.
summary_df = pd.DataFrame(results_table)

# Rank by silhouette score, then keep the first entry of every
# (feature set, method) group.
ranked_df = summary_df.sort_values('轮廓系数', ascending=False)
grouped = ranked_df.groupby(['特征集', '聚类方法'])
best_by_feature = grouped.first().reset_index()

print("最佳聚类结果汇总表:")
display(best_by_feature)

# Heat map of silhouette scores: feature sets as rows, methods as columns.
pivot_table = best_by_feature.pivot(index='特征集', columns='聚类方法', values='轮廓系数')
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='.4f')
plt.title('不同特征集和聚类方法的轮廓系数')
plt.tight_layout()
plt.show()

In [None]:
import warnings
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd

# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

# Encode the ground-truth programme labels once.
y_true = LabelEncoder().fit_transform(df['Programme'])

# Rows of the results table.
results_table = []

# Method-specific (column label, result key) pairs appended to each row.
method_columns = {
    'kmeans': (('聚类数', 'n_clusters'), ('初始化方法', 'init')),
    'gmm': (('组件数', 'n_components'), ('协方差类型', 'covariance_type')),
    'hierarchical': (('聚类数', 'n_clusters'), ('链接方法', 'linkage')),
}

for feature_name, methods in best_results.items():
    for method_name, result in methods.items():
        # Record the internal clustering metrics.
        row = {
            '特征集': feature_name,
            '聚类方法': method_name,
            '轮廓系数': result['silhouette_score'],
            'Davies-Bouldin': result['davies_bouldin_score'],
            'Calinski-Harabasz': result['calinski_harabasz_score']
        }
        row.update(
            (column_label, result[result_key])
            for column_label, result_key in method_columns.get(method_name, ())
        )
        results_table.append(row)

        # Scatter plot; data with more than two dimensions is projected to
        # 2-D with PCA first.
        X_plot = feature_sets[feature_name]
        X_vis = PCA(n_components=2).fit_transform(X_plot) if X_plot.shape[1] > 2 else X_plot
        plt.figure(figsize=(8, 6))
        plt.scatter(X_vis[:, 0], X_vis[:, 1], c=result['labels'], cmap='viridis', alpha=0.8, s=50)
        plt.title(f'特征集: {feature_name}, 聚类方法: {method_name}\n轮廓系数: {result["silhouette_score"]:.4f}')
        plt.colorbar(label='聚类标签')
        plt.tight_layout()
        plt.show()

        # Compare the cluster assignment with the true labels.
        labels = result['labels']
        ari = adjusted_rand_score(y_true, labels)
        nmi = normalized_mutual_info_score(y_true, labels)
        print(f"\n特征集: {feature_name}, 聚类方法: {method_name}")
        print(f"调整兰德指数(ARI): {ari:.4f}")
        print(f"归一化互信息(NMI): {nmi:.4f}")
        print("混淆矩阵:")
        print(confusion_matrix(y_true, labels))

# Show the results table.
results_df = pd.DataFrame(results_table)
print("\n聚类结果表:")
print(results_df)

In [None]:
from preproduce import best_model
# 导入必要的库
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, adjusted_rand_score
from evaluate import evaluate_clustering

# 1. Load the test data.
test_df = pd.read_csv('test_data.csv')
print(f"测试数据形状: {test_df.shape}")

# 2. Bring the test features to the width the model was fitted with.
X_test = test_df.select_dtypes(include=['float64', 'int64']).values
print(f"原始测试特征维度: {X_test.shape}")

# Number of features the imported model expects.
n_features_expected = best_model.n_features_in_
print(f"模型期望的特征数量: {n_features_expected}")

# NOTE(review): truncating or zero-padding only fixes the width; it does not
# check that the columns are the ones used in training. Confirm the intended
# column alignment.
n_missing = n_features_expected - X_test.shape[1]
if n_missing < 0:
    # Too many features: keep the first n_features_expected columns.
    X_test = X_test[:, :n_features_expected]
elif n_missing > 0:
    # Too few features: append zero columns.
    padding = np.zeros((X_test.shape[0], n_missing))
    X_test = np.hstack((X_test, padding))

print(f"调整后测试特征维度: {X_test.shape}")

# 3. Standardise.
# NOTE(review): the scaler is fitted on the test data itself rather than
# reusing a scaler fitted on the training data. Confirm this is intended.
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)

# 4. Predict cluster labels with the imported model.
test_labels = best_model.predict(X_test_scaled)
print(f"测试数据聚类完成，标签分布: {np.unique(test_labels, return_counts=True)}")

# 5. Evaluate with evaluate_clustering as imported from the `evaluate` module.
# The result is formatted as a float below, so it is presumably a single
# number. TODO confirm against that module.
ratio = evaluate_clustering(X=X_test_scaled, labels=test_labels)
print(f"簇内距离与簇间距离比率: {ratio:.4f}")