In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix, r2_score
from scipy.stats import pearsonr
import joblib
import os
import sys

In [9]:
# -------------------
# 0. Logging setup
# -------------------
class Logger(object):
    """Tee stream that duplicates everything written to it.

    Each message is forwarded to the stream that was installed as
    ``sys.stdout`` when the instance was created and is also written to a
    UTF-8 log file.

    Args:
        filename: Path of the log file. It is opened in ``'w'`` mode, so an
            existing file is truncated on every instantiation; switch to
            ``'a'`` if appending is wanted.
    """

    def __init__(self, filename="run_log_sex_age_baseline.txt"):
        # If sys.stdout is already a tee of this kind (this cell was re-run,
        # or another logging cell ran first), unwrap it down to the real
        # stream. Otherwise loggers nest: output meant for this log would also
        # be appended to every earlier log, and their handles would stay open.
        stream = sys.stdout
        while hasattr(stream, "terminal") and hasattr(stream, "log"):
            superseded_log = stream.log
            stream = stream.terminal
            try:
                superseded_log.close()
            except OSError:
                # Best effort: a failed flush-on-close of an old log must not
                # prevent the new logger from being installed.
                pass
        self.terminal = stream
        self.log = open(filename, "w", encoding='utf-8')

    def write(self, message):
        """Write ``message`` to the wrapped stream and, while it is open, to the log file."""
        self.terminal.write(message)
        # A superseded logger has had its log closed; it degrades to a plain
        # pass-through instead of raising on a closed file.
        if not self.log.closed:
            self.log.write(message)

    def flush(self):
        """Flush both targets (required by the file-like protocol ``print`` relies on)."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

# Route every subsequent print() through the tee logger so the output reaches
# both the notebook/console and the log file.
log_filename = "run_log_sex_age_baseline.txt"
tee_stream = Logger(log_filename)
sys.stdout = tee_stream

# Announce where the log is going, followed by a 60-character separator rule.
print(f"所有输出将被记录到文件: {log_filename}")
print("=" * 60)


In [10]:
# -------------------
# 1. Data preparation
# -------------------
# Load the input table from CSV. The absolute path below is specific to the
# original author's machine; replace it with the location of your own copy.
df = pd.read_csv('/Users/xym/mark/FD/CQC/ChiHope/expdata_chihope_baseline_744.csv')

# -------------------
# 2. Preprocessing
# -------------------

# Fix 1: force every column label to str to avoid mixed-type label errors.
df.columns = df.columns.astype(str)
print("已将所有列名转换为字符串类型。")

# Fix 2: encode the 'Sex' target numerically ('M' -> 1, 'F' -> 0).
# The forward map and its inverse (used to decode before saving) are defined
# unconditionally so they always belong to this run.
sex_map = {'M': 1, 'F': 0}
reverse_sex_map = {v: k for k, v in sex_map.items()}

if 'Sex' not in df.columns:
    # Every later step indexes df['Sex'], so fail here with the real cause
    # instead of printing and running into a KeyError further down. The
    # message is still printed so that it reaches the log file.
    missing_sex_message = "错误: 数据中未找到 'Sex' 列，请检查列名。"
    print(missing_sex_message)
    raise KeyError(missing_sex_message)

raw_sex = df['Sex']
df['Sex'] = raw_sex.map(sex_map)
print("已将 'Sex' 列的 'M'/'F' 转换为 1/0。")

# Series.map turns any label that is not a key of sex_map into NaN; report
# how many values were affected rather than losing them silently.
unmapped_sex_count = int((raw_sex.notna() & df['Sex'].isna()).sum())
if unmapped_sex_count:
    print(f"警告: 'Sex' 列中有 {unmapped_sex_count} 个值不是 'M'/'F'，已被置为 NaN。")

# Placeholder columns that the training loop fills with test-set predictions.
df['prediction_sex'] = np.nan
df['prediction_age'] = np.nan

# Distinct values of 'correct_method'; the pipeline is run once per value.
methods = df['correct_method'].unique()
print(f"\n在数据中找到的方法: {methods}")
print("-" * 50)

In [12]:
# Echo the array of distinct 'correct_method' values collected from `df`.
methods

array(['combat_loess', 'harmony_loess', 'loess', 'log', 'median_loess',
       'ratio_loess', 'ruv_loess'], dtype=object)

In [13]:
# -------------------
# 3. Loop through each method
# -------------------
# For every distinct 'correct_method' value: fit a sex classifier and an age
# regressor on the rows with Training.Validation == 1, evaluate them on the
# rows with Training.Validation == 2, save both models with joblib, and write
# the test-set predictions back into `df`.

# Identifier, split-flag, target and prediction columns are never model inputs.
# Both tasks use the same list, and every per-method slice has the same
# columns as `df`, so the feature list is built once, outside the loop.
# NOTE(review): if the input file carries a saved index column (pandas would
# name it 'Unnamed: 0'), it is not excluded here and would be used as a
# feature -- confirm against the data.
non_feature_columns_sex = non_feature_columns_age = ['run_id', 'Training.Validation', 'correct_method', 'Sex', 'Age', 'prediction_sex', 'prediction_age']
feature_columns_sex = feature_columns_age = [col for col in df.columns if col not in non_feature_columns_sex]

for method in methods:
    print(f"\n{'='*25} 开始处理方法: {method} {'='*25}")

    # Rows that belong to the current method.
    method_df = df[df['correct_method'] == method].copy()

    # 3.1 Split on the 'Training.Validation' flag: 1 = training, 2 = test.
    train_df = method_df[method_df['Training.Validation'] == 1]
    test_df = method_df[method_df['Training.Validation'] == 2]

    # Nothing can be trained or evaluated without both partitions.
    if train_df.empty or test_df.empty:
        print(f"方法 '{method}' 的训练集或测试集为空，跳过此方法。")
        continue

    print(f"方法 '{method}' 的完整训练集大小: {train_df.shape[0]}")
    print(f"方法 '{method}' 的测试集大小: {test_df.shape[0]}")

    # =================================================================
    #  Task 1: sex prediction (classification)
    # =================================================================
    print("\n--- [任务一: 性别预测 (分类)] ---")
    target_sex = 'Sex'

    X_train_sex = train_df[feature_columns_sex]
    y_train_sex = train_df[target_sex]
    X_test_sex = test_df[feature_columns_sex]
    y_test_sex = test_df[target_sex]

    if len(y_train_sex.unique()) < 2:
        # A classifier cannot be fitted on a single class.
        print("性别预测任务跳过：训练集中只包含一个性别类别。")
    else:
        # 5-fold cross-validation on the training rows, scored with MCC.
        model_cv_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mcc_scores = cross_val_score(model_cv_sex, X_train_sex, y_train_sex, cv=kf, scoring='matthews_corrcoef')
        print(f"交叉验证 (MCC): 平均值 = {np.mean(mcc_scores):.4f}, 标准差 = {np.std(mcc_scores):.4f}")

        # Refit on the full training partition and persist the model.
        # NOTE(review): the filename depends only on `method`. The other
        # dataset runs in this notebook save under the same pattern, so models
        # for shared method names (e.g. 'log') are overwritten by whichever
        # run executes last -- consider adding a dataset tag.
        final_model_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        final_model_sex.fit(X_train_sex, y_train_sex)
        model_filename_sex = f'model_sex_{method}.joblib'
        joblib.dump(final_model_sex, model_filename_sex)
        print(f"性别预测模型已保存至: {model_filename_sex}")

        # Predict the test partition and store the result in the main frame.
        y_pred_sex = final_model_sex.predict(X_test_sex)
        df.loc[test_df.index, 'prediction_sex'] = y_pred_sex
        test_mcc = matthews_corrcoef(y_test_sex, y_pred_sex)
        print(f"测试集评估 (MCC): {test_mcc:.4f}")
        # labels=[0, 1] pins the two rows of the report to F/M. Without it,
        # classification_report raises when only one class occurs in the test
        # labels and predictions, because target_names always has two entries.
        print("测试集分类报告:\n", classification_report(y_test_sex, y_pred_sex, labels=[0, 1], target_names=['F', 'M'], zero_division=0, digits=4))

    # =================================================================
    #  Task 2: age prediction (regression)
    # =================================================================
    print("\n--- [任务二: 年龄预测 (回归)] ---")
    target_age = 'Age'

    X_train_age = train_df[feature_columns_age]
    y_train_age = train_df[target_age]
    X_test_age = test_df[feature_columns_age]
    y_test_age = test_df[target_age]

    # 5-fold cross-validation on the training rows, scored with R^2.
    model_cv_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    kf_age = KFold(n_splits=5, shuffle=True, random_state=42)
    r2_scores = cross_val_score(model_cv_age, X_train_age, y_train_age, cv=kf_age, scoring='r2')
    print(f"交叉验证 (R-squared): 平均值 = {np.mean(r2_scores):.4f}, 标准差 = {np.std(r2_scores):.4f}")

    # Refit on the full training partition and persist the model
    # (same filename-collision caveat as for the sex model above).
    final_model_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    final_model_age.fit(X_train_age, y_train_age)
    model_filename_age = f'model_age_{method}.joblib'
    joblib.dump(final_model_age, model_filename_age)
    print(f"年龄预测模型已保存至: {model_filename_age}")

    # Predict the test partition and store the result in the main frame.
    y_pred_age = final_model_age.predict(X_test_age)
    df.loc[test_df.index, 'prediction_age'] = y_pred_age

    # Pearson correlation between true and predicted age. pearsonr needs at
    # least two observations, so a one-row test set is reported, not raised.
    if len(y_test_age) < 2:
        print("测试集样本数少于 2，无法计算皮尔逊相关系数，已跳过。")
    else:
        correlation, p_value = pearsonr(y_test_age, y_pred_age)
        print(f"测试集评估 (Pearson Correlation): 相关系数 = {correlation:.4f}, P-value = {p_value:.4f}")


print(f"\n{'='*25} 所有方法处理完毕 {'='*25}")


In [7]:
# Display the main DataFrame, including the two prediction columns.
# NOTE(review): this cell's execution count (7) is lower than that of the
# import cell (8), and the recorded output lists methods such as 'combat' and
# 'ruv_no_loess' that are absent from the `methods` array echoed earlier. The
# stored output looks stale; re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,correct_method,run_id,A1BG,A2M,A2ML1,AACS,AAMP,AARS,AARS2,ABCA3,...,ZFYVE19,ZG16B,ZNF148,ZNF512B,ZNF638,Age,Sex,Training.Validation,prediction_sex,prediction_age
0,combat,ExpA2,5.631370,7.164701,1.386102,3.402572,6.391766,4.334899,6.287632,4.404783,...,6.510300,3.164904,3.501054,3.381032,2.581922,58,1,2,1.0,47.39
1,combat,ExpA4,5.701895,7.057196,3.876599,3.506697,5.853177,4.837424,6.317011,4.181760,...,6.392424,3.759607,-2.891136,2.505462,4.060356,44,1,2,1.0,46.66
2,combat,ExpA6,5.921228,7.139597,4.270468,3.418220,6.568171,5.050315,6.504222,4.312902,...,6.630246,3.006391,3.885525,3.616006,3.400437,48,0,2,1.0,44.48
3,combat,ExpA8,5.761202,7.635785,4.061504,3.834170,6.098096,5.271315,6.387598,4.466988,...,7.212654,2.708469,3.767373,2.448414,3.876694,57,0,1,,
4,combat,ExpA9,5.570354,7.352956,4.092112,3.179111,8.019935,5.150036,6.893158,4.455822,...,6.822486,2.858963,4.095910,3.162631,2.694769,50,0,2,1.0,46.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2227,ruv_no_loess,ExpQ44,5.396423,7.813172,4.941434,2.567883,3.395625,5.744657,7.083699,2.882057,...,7.854679,2.953124,2.812159,2.158440,1.145704,57,0,2,0.0,54.31
2228,ruv_no_loess,ExpQ46,5.211801,7.586945,4.465600,2.768261,3.395920,4.659942,7.076960,2.867311,...,7.578833,3.237512,,2.680234,1.344783,55,1,2,1.0,49.41
2229,ruv_no_loess,ExpQ48,5.412799,7.801064,4.790246,2.385159,3.201817,4.704933,6.921073,3.109077,...,7.895327,2.481427,3.112389,1.721349,,67,0,2,1.0,50.25
2230,ruv_no_loess,ExpQ51,5.073990,8.087292,4.749989,2.467874,3.515737,5.735831,7.264649,2.657742,...,8.268554,2.934368,0.120528,2.297571,0.580684,65,1,2,0.0,54.21


In [14]:
# -------------------
# 4. Save final results
# -------------------
# Decode the numeric sex labels back to 'M'/'F' before exporting.
print("\n正在将数值标签转换回 'M'/'F' 以便输出...")
# .get(code, code) decodes 1/0 and passes every other value through unchanged
# (NaN and labels that are already 'M'/'F'). Mapping with the bare dict would
# turn already-decoded labels into NaN, so re-running this cell used to wipe
# both columns and export the blanked data.
df['Sex'] = df['Sex'].map(lambda code: reverse_sex_map.get(code, code))
df['prediction_sex'] = df['prediction_sex'].map(lambda code: reverse_sex_map.get(code, code))

# Write the full frame, predictions included, to an Excel workbook.
output_filename = 'expdata_chihope_baseline_744_with_predictions.xlsx'
df.to_excel(output_filename, index=False)
print(f"\n处理完成，带有预测结果的完整数据已保存至: {output_filename}")


In [15]:
## Run on the full data file

# -------------------
# 0. Logging setup
# -------------------
class Logger(object):
    """Tee stream that duplicates everything written to it.

    Each message is forwarded to the stream that was installed as
    ``sys.stdout`` when the instance was created and is also written to a
    UTF-8 log file.

    Args:
        filename: Path of the log file. It is opened in ``'w'`` mode, so an
            existing file is truncated on every instantiation; switch to
            ``'a'`` if appending is wanted.
    """

    def __init__(self, filename="run_log_sex_age_all.txt"):
        # If sys.stdout is already a tee of this kind (this cell was re-run,
        # or another logging cell ran first), unwrap it down to the real
        # stream. Otherwise loggers nest: output meant for this log would also
        # be appended to every earlier log, and their handles would stay open.
        stream = sys.stdout
        while hasattr(stream, "terminal") and hasattr(stream, "log"):
            superseded_log = stream.log
            stream = stream.terminal
            try:
                superseded_log.close()
            except OSError:
                # Best effort: a failed flush-on-close of an old log must not
                # prevent the new logger from being installed.
                pass
        self.terminal = stream
        self.log = open(filename, "w", encoding='utf-8')

    def write(self, message):
        """Write ``message`` to the wrapped stream and, while it is open, to the log file."""
        self.terminal.write(message)
        # A superseded logger has had its log closed; it degrades to a plain
        # pass-through instead of raising on a closed file.
        if not self.log.closed:
            self.log.write(message)

    def flush(self):
        """Flush both targets (required by the file-like protocol ``print`` relies on)."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

# Route every subsequent print() through the tee logger so the output reaches
# both the notebook/console and the log file.
log_filename = "run_log_sex_age_all.txt"
tee_stream = Logger(log_filename)
sys.stdout = tee_stream

# Announce where the log is going, followed by a 60-character separator rule.
print(f"所有输出将被记录到文件: {log_filename}")
print("=" * 60)


In [16]:
# -------------------
# 1. Data preparation
# -------------------
# Load the input table from CSV. The absolute path below is specific to the
# original author's machine; replace it with the location of your own copy.
df = pd.read_csv('/Users/xym/mark/FD/CQC/ChiHope/expdata_chihope_1431.csv')

# -------------------
# 2. Preprocessing
# -------------------

# Fix 1: force every column label to str to avoid mixed-type label errors.
df.columns = df.columns.astype(str)
print("已将所有列名转换为字符串类型。")

# Fix 2: encode the 'Sex' target numerically ('M' -> 1, 'F' -> 0).
# The forward map and its inverse (used to decode before saving) are defined
# unconditionally so they always belong to this run.
sex_map = {'M': 1, 'F': 0}
reverse_sex_map = {v: k for k, v in sex_map.items()}

if 'Sex' not in df.columns:
    # Every later step indexes df['Sex'], so fail here with the real cause
    # instead of printing and running into a KeyError further down. The
    # message is still printed so that it reaches the log file.
    missing_sex_message = "错误: 数据中未找到 'Sex' 列，请检查列名。"
    print(missing_sex_message)
    raise KeyError(missing_sex_message)

raw_sex = df['Sex']
df['Sex'] = raw_sex.map(sex_map)
print("已将 'Sex' 列的 'M'/'F' 转换为 1/0。")

# Series.map turns any label that is not a key of sex_map into NaN; report
# how many values were affected rather than losing them silently.
unmapped_sex_count = int((raw_sex.notna() & df['Sex'].isna()).sum())
if unmapped_sex_count:
    print(f"警告: 'Sex' 列中有 {unmapped_sex_count} 个值不是 'M'/'F'，已被置为 NaN。")

# Placeholder columns that the training loop fills with test-set predictions.
df['prediction_sex'] = np.nan
df['prediction_age'] = np.nan

# Distinct values of 'correct_method'; the pipeline is run once per value.
methods = df['correct_method'].unique()
print(f"\n在数据中找到的方法: {methods}")
print("-" * 50)

In [18]:
# -------------------
# 3. Loop through each method
# -------------------
# For every distinct 'correct_method' value: fit a sex classifier and an age
# regressor on the rows with Training.Validation == 1, evaluate them on the
# rows with Training.Validation == 2, save both models with joblib, and write
# the test-set predictions back into `df`.

# Identifier, split-flag, target and prediction columns are never model inputs.
# Both tasks use the same list, and every per-method slice has the same
# columns as `df`, so the feature list is built once, outside the loop.
# NOTE(review): if the input file carries a saved index column (pandas would
# name it 'Unnamed: 0'), it is not excluded here and would be used as a
# feature -- confirm against the data.
non_feature_columns_sex = non_feature_columns_age = ['run_id', 'Training.Validation', 'correct_method', 'Sex', 'Age', 'prediction_sex', 'prediction_age']
feature_columns_sex = feature_columns_age = [col for col in df.columns if col not in non_feature_columns_sex]

for method in methods:
    print(f"\n{'='*25} 开始处理方法: {method} {'='*25}")

    # Rows that belong to the current method.
    method_df = df[df['correct_method'] == method].copy()

    # 3.1 Split on the 'Training.Validation' flag: 1 = training, 2 = test.
    train_df = method_df[method_df['Training.Validation'] == 1]
    test_df = method_df[method_df['Training.Validation'] == 2]

    # Nothing can be trained or evaluated without both partitions.
    if train_df.empty or test_df.empty:
        print(f"方法 '{method}' 的训练集或测试集为空，跳过此方法。")
        continue

    print(f"方法 '{method}' 的完整训练集大小: {train_df.shape[0]}")
    print(f"方法 '{method}' 的测试集大小: {test_df.shape[0]}")

    # =================================================================
    #  Task 1: sex prediction (classification)
    # =================================================================
    print("\n--- [任务一: 性别预测 (分类)] ---")
    target_sex = 'Sex'

    X_train_sex = train_df[feature_columns_sex]
    y_train_sex = train_df[target_sex]
    X_test_sex = test_df[feature_columns_sex]
    y_test_sex = test_df[target_sex]

    if len(y_train_sex.unique()) < 2:
        # A classifier cannot be fitted on a single class.
        print("性别预测任务跳过：训练集中只包含一个性别类别。")
    else:
        # 5-fold cross-validation on the training rows, scored with MCC.
        model_cv_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mcc_scores = cross_val_score(model_cv_sex, X_train_sex, y_train_sex, cv=kf, scoring='matthews_corrcoef')
        print(f"交叉验证 (MCC): 平均值 = {np.mean(mcc_scores):.4f}, 标准差 = {np.std(mcc_scores):.4f}")

        # Refit on the full training partition and persist the model.
        # NOTE(review): the filename depends only on `method`. The other
        # dataset runs in this notebook save under the same pattern, so models
        # for shared method names (e.g. 'log') are overwritten by whichever
        # run executes last -- consider adding a dataset tag.
        final_model_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        final_model_sex.fit(X_train_sex, y_train_sex)
        model_filename_sex = f'model_sex_{method}.joblib'
        joblib.dump(final_model_sex, model_filename_sex)
        print(f"性别预测模型已保存至: {model_filename_sex}")

        # Predict the test partition and store the result in the main frame.
        y_pred_sex = final_model_sex.predict(X_test_sex)
        df.loc[test_df.index, 'prediction_sex'] = y_pred_sex
        test_mcc = matthews_corrcoef(y_test_sex, y_pred_sex)
        print(f"测试集评估 (MCC): {test_mcc:.4f}")
        # labels=[0, 1] pins the two rows of the report to F/M. Without it,
        # classification_report raises when only one class occurs in the test
        # labels and predictions, because target_names always has two entries.
        print("测试集分类报告:\n", classification_report(y_test_sex, y_pred_sex, labels=[0, 1], target_names=['F', 'M'], zero_division=0, digits=4))

    # =================================================================
    #  Task 2: age prediction (regression)
    # =================================================================
    print("\n--- [任务二: 年龄预测 (回归)] ---")
    target_age = 'Age'

    X_train_age = train_df[feature_columns_age]
    y_train_age = train_df[target_age]
    X_test_age = test_df[feature_columns_age]
    y_test_age = test_df[target_age]

    # 5-fold cross-validation on the training rows, scored with R^2.
    model_cv_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    kf_age = KFold(n_splits=5, shuffle=True, random_state=42)
    r2_scores = cross_val_score(model_cv_age, X_train_age, y_train_age, cv=kf_age, scoring='r2')
    print(f"交叉验证 (R-squared): 平均值 = {np.mean(r2_scores):.4f}, 标准差 = {np.std(r2_scores):.4f}")

    # Refit on the full training partition and persist the model
    # (same filename-collision caveat as for the sex model above).
    final_model_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    final_model_age.fit(X_train_age, y_train_age)
    model_filename_age = f'model_age_{method}.joblib'
    joblib.dump(final_model_age, model_filename_age)
    print(f"年龄预测模型已保存至: {model_filename_age}")

    # Predict the test partition and store the result in the main frame.
    y_pred_age = final_model_age.predict(X_test_age)
    df.loc[test_df.index, 'prediction_age'] = y_pred_age

    # Pearson correlation between true and predicted age. pearsonr needs at
    # least two observations, so a one-row test set is reported, not raised.
    if len(y_test_age) < 2:
        print("测试集样本数少于 2，无法计算皮尔逊相关系数，已跳过。")
    else:
        correlation, p_value = pearsonr(y_test_age, y_pred_age)
        print(f"测试集评估 (Pearson Correlation): 相关系数 = {correlation:.4f}, P-value = {p_value:.4f}")


print(f"\n{'='*25} 所有方法处理完毕 {'='*25}")


In [19]:
# Display the main DataFrame, including the two prediction columns filled in
# by the training loop.
df

Unnamed: 0,correct_method,run_id,A1BG,A2M,A2ML1,AACS,AAMP,AARS,AARS2,ABCA3,...,ZFYVE19,ZG16B,ZNF148,ZNF512B,ZNF638,Age,Sex,Training.Validation,prediction_sex,prediction_age
0,combat_loess,ExpA2,5.631370,7.164701,1.386102,3.402572,6.391766,4.334899,6.287632,4.404783,...,6.510300,3.164904,3.501054,3.381032,2.581922,58,1,2,1.0,46.29
1,combat_loess,ExpA3,5.868512,7.365557,4.095377,3.443147,6.162394,5.177449,6.366564,4.438152,...,6.796363,3.278597,3.528209,3.422229,4.150686,58,1,2,1.0,51.20
2,combat_loess,ExpA4,5.701895,7.057196,3.876599,3.506697,5.853177,4.837424,6.317011,4.181760,...,6.392424,3.759607,-2.891136,2.505462,4.060356,44,1,2,1.0,42.43
3,combat_loess,ExpA5,5.818974,7.046358,4.020845,3.168691,6.120512,4.740684,6.361593,4.368293,...,6.436529,3.492254,3.182743,2.430062,3.671002,44,1,2,1.0,43.66
4,combat_loess,ExpA6,5.921228,7.139597,4.270468,3.418220,6.568171,5.050315,6.504222,4.312902,...,6.630246,3.006391,3.885525,3.616006,3.400437,48,0,2,0.0,43.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10012,ruv_loess,ExpQ50,4.985625,7.451085,3.581059,0.483646,7.260544,7.188196,4.363194,3.058346,...,9.751542,3.705991,4.737530,-2.739934,-0.311568,67,0,2,0.0,53.36
10013,ruv_loess,ExpQ51,4.563926,7.468974,3.323261,0.025297,7.752680,7.322760,4.482586,3.034892,...,10.102656,3.799080,1.984601,-3.258707,-1.193413,65,1,2,1.0,57.49
10014,ruv_loess,ExpQ52,4.668953,7.598232,3.212252,-0.012094,7.944819,7.368172,4.499934,3.022390,...,10.304843,4.311394,,-2.898897,-0.591543,65,1,2,1.0,55.60
10015,ruv_loess,ExpQ53,4.717569,6.916047,2.915152,-0.069940,7.525003,6.918785,3.915385,3.451905,...,9.416029,4.316223,5.031772,-3.239274,-0.800093,52,1,2,1.0,45.46


In [20]:
# -------------------
# 4. Save final results
# -------------------
# Decode the numeric sex labels back to 'M'/'F' before exporting.
print("\n正在将数值标签转换回 'M'/'F' 以便输出...")
# .get(code, code) decodes 1/0 and passes every other value through unchanged
# (NaN and labels that are already 'M'/'F'). Mapping with the bare dict would
# turn already-decoded labels into NaN, so re-running this cell used to wipe
# both columns and export the blanked data.
df['Sex'] = df['Sex'].map(lambda code: reverse_sex_map.get(code, code))
df['prediction_sex'] = df['prediction_sex'].map(lambda code: reverse_sex_map.get(code, code))

# Write the full frame, predictions included, to an Excel workbook.
output_filename = 'expdata_chihope_1431_with_predictions.xlsx'
df.to_excel(output_filename, index=False)
print(f"\n处理完成，带有预测结果的完整数据已保存至: {output_filename}")

In [21]:
# -------------------
# 0. Logging setup
# -------------------
class Logger(object):
    """Tee stream that duplicates everything written to it.

    Each message is forwarded to the stream that was installed as
    ``sys.stdout`` when the instance was created and is also written to a
    UTF-8 log file.

    Args:
        filename: Path of the log file. It is opened in ``'w'`` mode, so an
            existing file is truncated on every instantiation; switch to
            ``'a'`` if appending is wanted.
    """

    def __init__(self, filename="run_log_sex_age_neg.txt"):
        # If sys.stdout is already a tee of this kind (this cell was re-run,
        # or another logging cell ran first), unwrap it down to the real
        # stream. Otherwise loggers nest: output meant for this log would also
        # be appended to every earlier log, and their handles would stay open.
        stream = sys.stdout
        while hasattr(stream, "terminal") and hasattr(stream, "log"):
            superseded_log = stream.log
            stream = stream.terminal
            try:
                superseded_log.close()
            except OSError:
                # Best effort: a failed flush-on-close of an old log must not
                # prevent the new logger from being installed.
                pass
        self.terminal = stream
        self.log = open(filename, "w", encoding='utf-8')

    def write(self, message):
        """Write ``message`` to the wrapped stream and, while it is open, to the log file."""
        self.terminal.write(message)
        # A superseded logger has had its log closed; it degrades to a plain
        # pass-through instead of raising on a closed file.
        if not self.log.closed:
            self.log.write(message)

    def flush(self):
        """Flush both targets (required by the file-like protocol ``print`` relies on)."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

# Route every subsequent print() through the tee logger so the output reaches
# both the notebook/console and the log file.
log_filename = "run_log_sex_age_neg.txt"
tee_stream = Logger(log_filename)
sys.stdout = tee_stream

# Announce where the log is going, followed by a 60-character separator rule.
print(f"所有输出将被记录到文件: {log_filename}")
print("=" * 60)


In [22]:
# -------------------
# 1. Data preparation
# -------------------
# Load the input table from an Excel workbook. The absolute path below is
# specific to the original author's machine; replace it with the location of
# your own copy.
df = pd.read_excel('/Users/xym/mark/FD/CQC/ChiHope/expdata_chihope_1431_negctrl.xlsx')

# -------------------
# 2. Preprocessing
# -------------------

# Fix 1: force every column label to str to avoid mixed-type label errors.
df.columns = df.columns.astype(str)
print("已将所有列名转换为字符串类型。")

# Fix 2: encode the 'Sex' target numerically ('M' -> 1, 'F' -> 0).
# The forward map and its inverse (used to decode before saving) are defined
# unconditionally so they always belong to this run.
sex_map = {'M': 1, 'F': 0}
reverse_sex_map = {v: k for k, v in sex_map.items()}

if 'Sex' not in df.columns:
    # Every later step indexes df['Sex'], so fail here with the real cause
    # instead of printing and running into a KeyError further down. The
    # message is still printed so that it reaches the log file.
    missing_sex_message = "错误: 数据中未找到 'Sex' 列，请检查列名。"
    print(missing_sex_message)
    raise KeyError(missing_sex_message)

raw_sex = df['Sex']
df['Sex'] = raw_sex.map(sex_map)
print("已将 'Sex' 列的 'M'/'F' 转换为 1/0。")

# Series.map turns any label that is not a key of sex_map into NaN; report
# how many values were affected rather than losing them silently.
unmapped_sex_count = int((raw_sex.notna() & df['Sex'].isna()).sum())
if unmapped_sex_count:
    print(f"警告: 'Sex' 列中有 {unmapped_sex_count} 个值不是 'M'/'F'，已被置为 NaN。")

# Placeholder columns that the training loop fills with test-set predictions.
df['prediction_sex'] = np.nan
df['prediction_age'] = np.nan

# Distinct values of 'correct_method'; the pipeline is run once per value.
methods = df['correct_method'].unique()
print(f"\n在数据中找到的方法: {methods}")
print("-" * 50)

In [23]:
# Display the freshly loaded DataFrame. The training loop has not run yet at
# this point, so both prediction columns still hold only NaN.
df

Unnamed: 0,correct_method,run_id,A1BG,A2M,A2ML1,AACS,AAMP,AARS,AARS2,ABCA3,...,ZFYVE19,ZG16B,ZNF148,ZNF512B,ZNF638,Age,Sex,Training.Validation,prediction_sex,prediction_age
0,log,ExpA2,18.098910,19.638806,,15.987535,18.670162,16.723185,18.501698,17.060643,...,19.287601,15.433026,16.423755,16.843425,15.128722,40,0,2,,
1,log,ExpA3,18.256032,19.771687,16.630234,15.953287,18.525860,17.309189,18.526085,17.018298,...,19.479755,15.455679,16.374659,16.810277,16.258210,56,0,2,,
2,log,ExpA4,18.172756,19.536170,16.371951,16.084385,18.499520,17.121159,18.539814,16.898173,...,19.179086,15.909352,,15.913029,16.267580,32,1,2,,
3,log,ExpA5,18.527940,19.768262,16.830763,16.030626,18.837158,17.289779,18.835277,17.286368,...,19.460265,15.948144,16.474609,16.069774,16.216426,46,0,2,,
4,log,ExpA6,18.296131,19.532490,16.876280,15.919630,18.663392,17.201448,18.676849,16.915571,...,19.306966,15.244031,16.604237,16.982623,15.683017,36,0,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,log,ExpQ50,18.453726,20.304608,16.595173,16.034439,18.800000,17.606768,18.828503,16.752718,...,19.602083,15.319597,17.065614,15.545968,16.431887,45,0,2,,
1427,log,ExpQ51,18.169267,20.513318,16.599181,15.859290,19.087761,17.803684,19.154696,16.721291,...,19.921268,15.418900,14.092665,15.326373,15.721203,43,1,2,,
1428,log,ExpQ52,18.061157,20.450459,16.423968,15.658210,18.975445,17.560255,19.006772,16.516375,...,19.878635,15.645360,,15.629847,16.112043,56,1,2,,
1429,log,ExpQ53,18.250182,19.894954,16.179257,15.761935,18.661255,17.249217,18.564322,17.066602,...,19.088556,15.775522,16.934349,15.438533,16.085313,51,1,2,,


In [24]:
# -------------------
# 3. Loop through each method
# -------------------
# For every distinct 'correct_method' value: fit a sex classifier and an age
# regressor on the rows with Training.Validation == 1, evaluate them on the
# rows with Training.Validation == 2, save both models with joblib, and write
# the test-set predictions back into `df`.

# Identifier, split-flag, target and prediction columns are never model inputs.
# Both tasks use the same list, and every per-method slice has the same
# columns as `df`, so the feature list is built once, outside the loop.
# NOTE(review): if the input file carries a saved index column (pandas would
# name it 'Unnamed: 0'), it is not excluded here and would be used as a
# feature -- confirm against the data.
non_feature_columns_sex = non_feature_columns_age = ['run_id', 'Training.Validation', 'correct_method', 'Sex', 'Age', 'prediction_sex', 'prediction_age']
feature_columns_sex = feature_columns_age = [col for col in df.columns if col not in non_feature_columns_sex]

for method in methods:
    print(f"\n{'='*25} 开始处理方法: {method} {'='*25}")

    # Rows that belong to the current method.
    method_df = df[df['correct_method'] == method].copy()

    # 3.1 Split on the 'Training.Validation' flag: 1 = training, 2 = test.
    train_df = method_df[method_df['Training.Validation'] == 1]
    test_df = method_df[method_df['Training.Validation'] == 2]

    # Nothing can be trained or evaluated without both partitions.
    if train_df.empty or test_df.empty:
        print(f"方法 '{method}' 的训练集或测试集为空，跳过此方法。")
        continue

    print(f"方法 '{method}' 的完整训练集大小: {train_df.shape[0]}")
    print(f"方法 '{method}' 的测试集大小: {test_df.shape[0]}")

    # =================================================================
    #  Task 1: sex prediction (classification)
    # =================================================================
    print("\n--- [任务一: 性别预测 (分类)] ---")
    target_sex = 'Sex'

    X_train_sex = train_df[feature_columns_sex]
    y_train_sex = train_df[target_sex]
    X_test_sex = test_df[feature_columns_sex]
    y_test_sex = test_df[target_sex]

    if len(y_train_sex.unique()) < 2:
        # A classifier cannot be fitted on a single class.
        print("性别预测任务跳过：训练集中只包含一个性别类别。")
    else:
        # 5-fold cross-validation on the training rows, scored with MCC.
        model_cv_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mcc_scores = cross_val_score(model_cv_sex, X_train_sex, y_train_sex, cv=kf, scoring='matthews_corrcoef')
        print(f"交叉验证 (MCC): 平均值 = {np.mean(mcc_scores):.4f}, 标准差 = {np.std(mcc_scores):.4f}")

        # Refit on the full training partition and persist the model.
        # NOTE(review): the filename depends only on `method`. The other
        # dataset runs in this notebook save under the same pattern, so models
        # for shared method names (e.g. 'log') are overwritten by whichever
        # run executes last -- consider adding a dataset tag.
        final_model_sex = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        final_model_sex.fit(X_train_sex, y_train_sex)
        model_filename_sex = f'model_sex_{method}.joblib'
        joblib.dump(final_model_sex, model_filename_sex)
        print(f"性别预测模型已保存至: {model_filename_sex}")

        # Predict the test partition and store the result in the main frame.
        y_pred_sex = final_model_sex.predict(X_test_sex)
        df.loc[test_df.index, 'prediction_sex'] = y_pred_sex
        test_mcc = matthews_corrcoef(y_test_sex, y_pred_sex)
        print(f"测试集评估 (MCC): {test_mcc:.4f}")
        # labels=[0, 1] pins the two rows of the report to F/M. Without it,
        # classification_report raises when only one class occurs in the test
        # labels and predictions, because target_names always has two entries.
        print("测试集分类报告:\n", classification_report(y_test_sex, y_pred_sex, labels=[0, 1], target_names=['F', 'M'], zero_division=0, digits=4))

    # =================================================================
    #  Task 2: age prediction (regression)
    # =================================================================
    print("\n--- [任务二: 年龄预测 (回归)] ---")
    target_age = 'Age'

    X_train_age = train_df[feature_columns_age]
    y_train_age = train_df[target_age]
    X_test_age = test_df[feature_columns_age]
    y_test_age = test_df[target_age]

    # 5-fold cross-validation on the training rows, scored with R^2.
    model_cv_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    kf_age = KFold(n_splits=5, shuffle=True, random_state=42)
    r2_scores = cross_val_score(model_cv_age, X_train_age, y_train_age, cv=kf_age, scoring='r2')
    print(f"交叉验证 (R-squared): 平均值 = {np.mean(r2_scores):.4f}, 标准差 = {np.std(r2_scores):.4f}")

    # Refit on the full training partition and persist the model
    # (same filename-collision caveat as for the sex model above).
    final_model_age = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    final_model_age.fit(X_train_age, y_train_age)
    model_filename_age = f'model_age_{method}.joblib'
    joblib.dump(final_model_age, model_filename_age)
    print(f"年龄预测模型已保存至: {model_filename_age}")

    # Predict the test partition and store the result in the main frame.
    y_pred_age = final_model_age.predict(X_test_age)
    df.loc[test_df.index, 'prediction_age'] = y_pred_age

    # Pearson correlation between true and predicted age. pearsonr needs at
    # least two observations, so a one-row test set is reported, not raised.
    if len(y_test_age) < 2:
        print("测试集样本数少于 2，无法计算皮尔逊相关系数，已跳过。")
    else:
        correlation, p_value = pearsonr(y_test_age, y_pred_age)
        print(f"测试集评估 (Pearson Correlation): 相关系数 = {correlation:.4f}, P-value = {p_value:.4f}")


print(f"\n{'='*25} 所有方法处理完毕 {'='*25}")


In [25]:
# -------------------
# 4. Save final results
# -------------------
# Decode the numeric sex labels back to 'M'/'F' before exporting.
print("\n正在将数值标签转换回 'M'/'F' 以便输出...")
# .get(code, code) decodes 1/0 and passes every other value through unchanged
# (NaN and labels that are already 'M'/'F'). Mapping with the bare dict would
# turn already-decoded labels into NaN, so re-running this cell used to wipe
# both columns and export the blanked data.
df['Sex'] = df['Sex'].map(lambda code: reverse_sex_map.get(code, code))
df['prediction_sex'] = df['prediction_sex'].map(lambda code: reverse_sex_map.get(code, code))

# Write the full frame, predictions included, to an Excel workbook.
output_filename = 'expdata_chihope_1431_neg_with_predictions.xlsx'
df.to_excel(output_filename, index=False)
print(f"\n处理完成，带有预测结果的完整数据已保存至: {output_filename}")