In [3]:
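# Metric extraction for well-known large repositories. NOTE: this set contains too few repositories for the Spearman correlations to differentiate between methods, so it is not used for the reported tables for now.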
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LinearRegression

# ================= Configuration =================
# Repositories to evaluate; each name is substituted into the dataset and
# prediction paths built by the main loop.
repoName = [
    'vue', 'tensorflow', 'autogpt', 'kubernetes', 'terminal',
    'flutter', 'vscode', 'react-naive', 'electron', 'transformers',
]

# Per-feature weights, listed in the order in which they are packed into WEIGHT.
(
    ISSUECOMMENT_WEIGHT,
    OPENISSUE_WEIGHT,
    OPENPR_WEIGHT,
    PRREVIEWCOMMENT_WEIGHT,
    MERGEPR_WEIGHT,
) = 0.5252, 2.2235, 4.0679, 0.7427, 2.0339
WEIGHT = np.array([
    ISSUECOMMENT_WEIGHT,
    OPENISSUE_WEIGHT,
    OPENPR_WEIGHT,
    PRREVIEWCOMMENT_WEIGHT,
    MERGEPR_WEIGHT,
])

# Window lengths: forecast horizon and history look-back.
PRED_LEN = 84
SEQ_LEN = 84

# Score bounds of the sigmoid-weighted sum.
WEIGHT_SUM = WEIGHT.sum()          # approximately 9.5932
MIN_SCORE_STD = WEIGHT_SUM * 0.5   # every sigmoid at sigmoid(0) = 0.5
MAX_SCORE_STD = WEIGHT_SUM * 1.0   # every sigmoid saturated at 1.0

# ================= 工具函数 =================
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    :param x: scalar or NumPy array.
    :return: value(s) in [0, 1], same shape as ``x``.

    For large negative inputs ``exp(-x)`` overflows to ``inf``; the quotient
    is then exactly 0.0, which is the correct limit. The overflow is therefore
    silenced locally so it neither emits a RuntimeWarning nor raises when the
    caller runs with ``np.seterr(over='raise')``. Results are unchanged.
    """
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(-x))

def calculate_lte(history_data):
    """
    Linear Trend Extrapolation (LTE) baseline.

    For every feature column a straight line is fitted over the time index
    0..n_steps-1, extrapolated over the next n_steps time steps
    (n_steps..2*n_steps-1), and the forecasts are averaged.

    :param history_data: 2-D array-like of shape (n_steps, n_features), for
        example a DataFrame slice or an ndarray.
    :return: mean over features of each feature's mean extrapolated value
    :raises ValueError: if ``history_data`` is not two-dimensional
    """
    # np.asarray accepts DataFrames as well as plain arrays; the previous
    # `.iloc` access only worked for DataFrames.
    values = np.asarray(history_data)
    if values.ndim != 2:
        raise ValueError(
            f"history_data must be 2-D (n_steps, n_features), got shape {values.shape}"
        )

    n_steps, n_features = values.shape
    X = np.arange(n_steps).reshape(-1, 1)                      # [0, 1, ..., n_steps - 1]
    X_future = np.arange(n_steps, n_steps * 2).reshape(-1, 1)  # [n_steps, ..., 2 * n_steps - 1]

    predictions = []
    model = LinearRegression()

    # Fit each feature separately; the estimator is refitted on every pass.
    for i in range(n_features):
        model.fit(X, values[:, i])
        predictions.append(model.predict(X_future).mean())

    return np.mean(predictions)

# ================= Main loop =================
# For every repository: load the model forecast and the normalised series,
# cut out the forecast window at the start of the test split (last 20% of
# rows) plus the history window right before it, then compute the DTA score
# with its ablations, ground-truth statistics and history-only baselines.
# One dict per processed repository is appended to `results`.
results = []

print(f"{'Repository':<15} | {'Status':<10} | {'Info'}")
print("-" * 50)

for name in repoName:
    # 1. Build the file paths.
    # Assumes the layout dataset/{name}/{name}_all_roll_ewma_span28_normalize.csv
    csv_path = f'dataset/{name}/{name}_all_roll_ewma_span28_normalize.csv'
    # NOTE(review): this directory name spells the model "Dlinear", while the
    # 100-dataset cell of this notebook uses "DLinear". Confirm the spelling
    # matches the directories on disk if repositories are reported MISSING.
    pred_path = f'results/stacking_attention_fusion_PatchTST_Dlinear_custom_{name}_ftM_sl84_ll84_pl84_dm512_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/pred.npy'

    # 2. Skip repositories whose input files are missing.
    if not os.path.exists(pred_path):
        print(f"{name:<15} | MISSING    | Pred file not found, skipping...")
        continue

    if not os.path.exists(csv_path):
        print(f"{name:<15} | MISSING    | CSV file not found, skipping...")
        continue

    try:
        # 3. Load the data.
        pred = np.load(pred_path)
        # Only the forecast of the first test sample (index 0) is analysed.
        # Assumes pred has shape [batch, 84, 5] — TODO confirm; a shorter
        # array would be truncated silently by this slice.
        pred_data = pred[0, :PRED_LEN, :5]

        df = pd.read_csv(csv_path)
        if 'date' in df.columns:
            df = df.drop(columns=['date'])

        # 4. Locate the time windows.
        dataset_len = len(df)
        num_test = int(dataset_len * 0.2)

        # Forecast window (future / ground truth)
        start_index = dataset_len - num_test
        end_index = start_index + PRED_LEN

        # History window (baseline input)
        hist_start = start_index - SEQ_LEN
        hist_end = start_index

        # Bounds checks
        if hist_start < 0:
            print(f"{name:<15} | WARNING    | History window out of bounds, skipping...")
            continue
        # Without this check `iloc` would silently return fewer than PRED_LEN
        # rows, and the ground truth would cover a shorter horizon than the
        # forecast it is compared with.
        if end_index > dataset_len:
            print(f"{name:<15} | WARNING    | Future window out of bounds, skipping...")
            continue

        # Extract the two series
        true_data = df.iloc[start_index:end_index]  # actual future values
        hist_data = df.iloc[hist_start:hist_end]    # history fed to the baselines

        # ================= Metrics =================

        # --- A. Our metric (Ours) ---

        # 1. DTA Standard
        # Pipeline: mean over time -> sigmoid -> weighted sum -> rescale
        mean_values = np.mean(pred_data, axis=0)
        sigmoid_values = sigmoid(mean_values)
        weighted_sum_std = np.dot(sigmoid_values, WEIGHT)
        # Rescaling: (x - min) / min * 100, where min = sum(WEIGHT) * 0.5 is
        # the weighted sum obtained when every sigmoid equals 0.5.
        dta_standard = ((weighted_sum_std - MIN_SCORE_STD) / MIN_SCORE_STD) * 100

        # 2. DTA Linear (ablation: sigmoid removed)
        # Pipeline: divide by 6 -> mean over time -> weighted sum -> rescale
        # Assumes the data lies in [0, 6], per the original author's note — TODO confirm.
        norm_values = np.mean(pred_data / 6.0, axis=0)
        weighted_sum_lin = np.dot(norm_values, WEIGHT)
        # Divide by WEIGHT_SUM and express as a percentage.
        dta_linear = (weighted_sum_lin / WEIGHT_SUM) * 100

        # 3. DTA NoWeight (ablation: weights removed)
        # Pipeline: mean over time -> sigmoid -> plain average -> rescale
        simple_avg_sigmoid = np.mean(sigmoid_values)
        # Map 0.5 -> 0 and 1.0 -> 100.
        dta_noweight = ((simple_avg_sigmoid - 0.5) / 0.5) * 100

        # --- B. External ground truth ---

        # 1. GT Intensity: mean of all actual future values (activity level)
        gt_intensity = true_data.values.mean()

        # 2. GT Slope: per-feature (last - first) / (rows - 1), averaged over features
        total_changes = true_data.iloc[-1] - true_data.iloc[0]
        gt_slope = (total_changes / (len(true_data) - 1)).mean()

        # 3. GT Stability: inverse of the mean coefficient of variation.
        # The 1e-6 terms guard against division by zero.
        means = true_data.mean()
        stds = true_data.std()
        cvs = stds / (means + 1e-6)
        avg_cv = cvs.mean()
        gt_stability = 1 / (avg_cv + 1e-6)

        # CHAOSS review intensity: summed review comments / summed merged PRs.
        # Column positions follow the original comments (index 3 =
        # ReviewComment, index 4 = MergePR) — verify against the CSV header.
        total_reviews = true_data.iloc[:, 3].sum()
        total_merges = true_data.iloc[:, 4].sum()
        chaoss_review_intensity = total_reviews / (total_merges + 1e-9)  # epsilon avoids division by zero

        # --- C. Baselines (computed from history only) ---

        # 1. Base HAA: mean of the whole history window
        base_haa = hist_data.values.mean()

        # 2. Base RM: recent momentum, mean of the last 28 history rows
        # (the earlier comment said "2 weeks", which did not match the slice)
        base_rm = hist_data.iloc[-28:].values.mean()

        # 3. Base LTE: linear trend extrapolation
        base_lte = calculate_lte(hist_data)

        # ================= Store the result =================
        result_dict = {
            'Repo_Name': name,
            # Ours
            'DTA_Standard': dta_standard,
            'DTA_Linear': dta_linear,
            'DTA_NoWeight': dta_noweight,
            # Ground truths
            'GT_Intensity_Mean': gt_intensity,
            'GT_Trend_Slope': gt_slope,
            'GT_Stability_InvCV': gt_stability,
            'CHAOSS_Review_Intensity': chaoss_review_intensity,
            # Baselines
            'Base_HAA': base_haa,
            'Base_RM': base_rm,
            'Base_LTE': base_lte
        }

        results.append(result_dict)
        print(f"{name:<15} | SUCCESS    | DTA: {dta_standard:.2f}")

    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next repository.
        print(f"{name:<15} | ERROR      | {str(e)}")

# ================= Save results =================
# Writes the collected per-repository metrics to a single CSV, sorted by the
# standard DTA score.
if results:
    output_dir = 'workresult/stacking/big_repo_DTA_verification'
    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    results_df = pd.DataFrame(results)

    # Sort by DTA score so the ranking is easy to inspect.
    results_df = results_df.sort_values(by='DTA_Standard', ascending=False)

    # Fix the column order.
    cols = ['Repo_Name', 'DTA_Standard', 'DTA_Linear', 'DTA_NoWeight',
            'GT_Intensity_Mean', 'GT_Trend_Slope', 'GT_Stability_InvCV', 'CHAOSS_Review_Intensity',
            'Base_HAA', 'Base_RM', 'Base_LTE']
    results_df = results_df[cols]

    save_path = os.path.join(output_dir, 'experiment_master_data.csv')
    results_df.to_csv(save_path, index=False)
    print("\n" + "="*50)
    print(f"所有数据处理完成！结果已保存至: {save_path}")
    print(f"成功处理仓库数量: {len(results_df)}")
else:
    print("\n未生成任何结果，请检查文件路径配置。")

Repository      | Status     | Info
--------------------------------------------------
vue             | SUCCESS    | DTA: 52.37
tensorflow      | MISSING    | Pred file not found, skipping...
autogpt         | MISSING    | Pred file not found, skipping...
kubernetes      | SUCCESS    | DTA: 77.24
terminal        | MISSING    | Pred file not found, skipping...
flutter         | MISSING    | Pred file not found, skipping...
vscode          | SUCCESS    | DTA: 92.29
react-naive     | SUCCESS    | DTA: 54.48
electron        | MISSING    | Pred file not found, skipping...
transformers    | SUCCESS    | DTA: 96.53

所有数据处理完成！结果已保存至: workresult/stacking/big_repo_DTA_verification/experiment_master_data.csv
成功处理仓库数量: 5


In [None]:
# Comparison across the 100 randomly sampled datasets

import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
import re

# ================= 1. Configuration =================
folder_path = "dataset/random100_from_21-23-dataset"
# Captures everything that precedes the "_all_roll_ewma_span28_normalize"
# suffix of a file name; that prefix is used as the repository name.
pattern = r"^(.*?)(?=(_all_roll_ewma_span28_normalize))"
repoName = []

# Scan the dataset folder, guarding against a missing folder first.
if not os.path.exists(folder_path):
    print(f"Error: Folder {folder_path} not found.")
else:
    for entry in os.listdir(folder_path):
        # Only regular files are considered; sub-directories are ignored.
        if not os.path.isfile(os.path.join(folder_path, entry)):
            continue
        hit = re.search(pattern, entry)
        if hit:
            repoName.append(hit.group(1))

# Per-feature weights. Per the original note they correspond to the column
# order [IssueComment, OpenIssue, OpenPR, ReviewComment, MergePR].
(
    ISSUECOMMENT_WEIGHT,
    OPENISSUE_WEIGHT,
    OPENPR_WEIGHT,
    PRREVIEWCOMMENT_WEIGHT,
    MERGEPR_WEIGHT,
) = 0.5252, 2.2235, 4.0679, 0.7427, 2.0339
WEIGHT = np.array([
    ISSUECOMMENT_WEIGHT,
    OPENISSUE_WEIGHT,
    OPENPR_WEIGHT,
    PRREVIEWCOMMENT_WEIGHT,
    MERGEPR_WEIGHT,
])

# Window lengths: forecast horizon and history look-back.
PRED_LEN = 84
SEQ_LEN = 84

# Score bounds of the sigmoid-weighted sum.
WEIGHT_SUM = WEIGHT.sum()
MIN_SCORE_STD = WEIGHT_SUM * 0.5   # every sigmoid at sigmoid(0) = 0.5
MAX_SCORE_STD = WEIGHT_SUM * 1.0   # every sigmoid saturated at 1.0

# ================= 2. 工具函数 =================
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    :param x: scalar or NumPy array.
    :return: value(s) in [0, 1], same shape as ``x``.

    For large negative inputs ``exp(-x)`` overflows to ``inf``; the quotient
    is then exactly 0.0, which is the correct limit. The overflow is therefore
    silenced locally so it neither emits a RuntimeWarning nor raises when the
    caller runs with ``np.seterr(over='raise')``. Results are unchanged.
    """
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(-x))

def calculate_lte(history_data):
    """
    Linear Trend Extrapolation (LTE) baseline.

    For every feature column a straight line is fitted over the time index
    0..n_steps-1, extrapolated over the next n_steps time steps
    (n_steps..2*n_steps-1), and the forecasts are averaged.

    :param history_data: 2-D array-like of shape (n_steps, n_features), for
        example a DataFrame slice or an ndarray.
    :return: mean over features of each feature's mean extrapolated value
    :raises ValueError: if ``history_data`` is not two-dimensional
    """
    # np.asarray accepts DataFrames as well as plain arrays; the previous
    # `.iloc` access only worked for DataFrames.
    values = np.asarray(history_data)
    if values.ndim != 2:
        raise ValueError(
            f"history_data must be 2-D (n_steps, n_features), got shape {values.shape}"
        )

    n_steps, n_features = values.shape
    X = np.arange(n_steps).reshape(-1, 1)
    X_future = np.arange(n_steps, n_steps * 2).reshape(-1, 1)

    predictions = []
    model = LinearRegression()

    # Fit each feature separately; the estimator is refitted on every pass.
    for i in range(n_features):
        model.fit(X, values[:, i])
        predictions.append(model.predict(X_future).mean())

    return np.mean(predictions)

# ================= 3. Main loop =================
# For every discovered repository: load the model forecast and the
# normalised series, cut out the forecast window at the start of the test
# split (last 20% of rows) plus the history window right before it, then
# compute the DTA variants, CHAOSS ratios, ground-truth statistics and
# history-only baselines. One dict per processed repository is appended to
# `results`.
results = []

print(f"{'Repository':<20} | {'Status':<10} | {'Info'}")
print("-" * 60)

for name in repoName:
    # --- Build the file paths ---
    csv_path = f'{folder_path}/{name}_all_roll_ewma_span28_normalize.csv'
    pred_path = f'results/stacking_attention_fusion_PatchTST_DLinear_custom_{name}_ftM_sl84_ll84_pl84_dm512_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/pred.npy'

    # --- Skip repositories whose input files are missing ---
    # Every skip is reported so that a shrinking sample is visible in the log.
    if not os.path.exists(pred_path):
        print(f"{name:<20} | MISSING    | Pred file not found")
        continue
    if not os.path.exists(csv_path):
        print(f"{name:<20} | MISSING    | CSV file not found")
        continue

    try:
        # --- Load the data ---
        pred = np.load(pred_path)
        # Forecast of the first test sample, used for the DTA scores.
        # Assumes pred has shape [batch, 84, 5] — TODO confirm; a shorter
        # array would be truncated silently by this slice.
        pred_data = pred[0, :PRED_LEN, :5]

        df = pd.read_csv(csv_path)
        if 'date' in df.columns:
            df = df.drop(columns=['date'])

        # --- Locate the time windows ---
        dataset_len = len(df)
        num_test = int(dataset_len * 0.2)

        start_index = dataset_len - num_test
        end_index = start_index + PRED_LEN
        hist_start = start_index - SEQ_LEN
        hist_end = start_index

        if hist_start < 0:
            print(f"{name:<20} | WARNING    | History window out of bounds")
            continue
        # Without this check `iloc` would silently return fewer than PRED_LEN
        # rows, and the ground truth would cover a shorter horizon than the
        # forecast it is compared with.
        if end_index > dataset_len:
            print(f"{name:<20} | WARNING    | Future window out of bounds")
            continue

        true_data = df.iloc[start_index:end_index]  # actual future values (ground truth and CHAOSS)
        hist_data = df.iloc[hist_start:hist_end]    # history fed to the baselines

        # ================= A. DTA and its variants (Ours) =================
        mean_values = np.mean(pred_data, axis=0)  # one mean per feature

        # 1. DTA Standard (sigmoid + expert weights)
        sigmoid_vals = sigmoid(mean_values)
        w_sum_std = np.dot(sigmoid_vals, WEIGHT)
        dta_standard = ((w_sum_std - MIN_SCORE_STD) / MIN_SCORE_STD) * 100

        # 2. DTA Linear (no non-linearity)
        # Assumes the data lies in [0, 6], hence the division by 6 — TODO confirm.
        norm_vals = np.mean(pred_data / 6.0, axis=0)
        dta_linear = (np.dot(norm_vals, WEIGHT) / WEIGHT_SUM) * 100

        # 3. DTA NoWeight (sigmoid + equal weights)
        dta_noweight = ((np.mean(sigmoid_vals) - 0.5) / 0.5) * 100

        # 4. DTA Log (logarithmic non-linearity)
        # Unlike the other variants this one is not rescaled to a percentage.
        # NOTE(review): log1p is NaN/-inf for means <= -1; confirm the
        # forecasts cannot go that low.
        dta_log = np.dot(np.log1p(mean_values), WEIGHT)

        # 5. DTA Tanh (hyperbolic-tangent non-linearity)
        dta_tanh = (np.dot(np.tanh(mean_values), WEIGHT) / WEIGHT_SUM) * 100

        # ================= B. CHAOSS gold-standard ratios =================
        # Computed from true_data (what actually happened in the future window).
        # Column positions follow the WEIGHT order:
        # 0:IssueComment, 1:OpenIssue, 2:OpenPR, 3:ReviewComment, 4:MergePR
        # — verify against the CSV header.

        # 1. Code Review Intensity = ReviewComment / MergePR
        # Average review effort per merged change.
        total_reviews = true_data.iloc[:, 3].sum()
        total_merges = true_data.iloc[:, 4].sum()
        gold_review_intensity = total_reviews / (total_merges + 1e-9)

        # 2. Issue Response Density = IssueComment / OpenIssue
        # Average attention each opened issue receives.
        mean_issue_comments = true_data.iloc[:, 0].mean()
        mean_open_issues = true_data.iloc[:, 1].mean()
        gold_issue_density = mean_issue_comments / (mean_open_issues + 1e-9)

        # ================= C. Traditional ground truths =================

        # 1. GT Intensity: mean of all actual future values
        gt_intensity = true_data.values.mean()

        # 2. GT Slope: per-feature (last - first) / (rows - 1), averaged over features
        total_changes = true_data.iloc[-1] - true_data.iloc[0]
        gt_slope = (total_changes / (len(true_data) - 1)).mean()

        # 3. GT Stability: inverse of the mean coefficient of variation
        # (the 1e-9 terms guard against division by zero)
        means = true_data.mean()
        stds = true_data.std()
        cvs = stds / (means + 1e-9)
        gt_stability = 1 / (cvs.mean() + 1e-9)

        # ================= D. Baselines (history only) =================

        # 1. Base HAA: mean of the whole history window
        base_haa = hist_data.values.mean()

        # 2. Base RM: recent momentum, mean of the last 28 history rows
        base_rm = hist_data.iloc[-28:].values.mean()

        # 3. Base LTE: linear trend extrapolation
        base_lte = calculate_lte(hist_data)

        # ================= Store the result =================
        result_dict = {
            'Repo_Name': name,
            # --- Ours ---
            'DTA_Standard': dta_standard,
            'DTA_Linear': dta_linear,
            'DTA_NoWeight': dta_noweight,
            'DTA_Log': dta_log,
            'DTA_Tanh': dta_tanh,
            # --- CHAOSS gold standards ---
            'CHAOSS_Review_Intensity': gold_review_intensity,
            'CHAOSS_Issue_Density': gold_issue_density,
            # --- Traditional GT ---
            'GT_Intensity_Mean': gt_intensity,
            'GT_Trend_Slope': gt_slope,
            'GT_Stability_InvCV': gt_stability,
            # --- Baselines ---
            'Base_HAA': base_haa,
            'Base_RM': base_rm,
            'Base_LTE': base_lte
        }

        results.append(result_dict)

    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next repository.
        print(f"{name:<20} | ERROR      | {str(e)}")

# ================= 4. Save results =================
# Writes the collected per-repository metrics to a single CSV and prints a
# short preview.
if results:
    output_dir = 'workresult/stacking/100dataset_DTA_verification'
    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    results_df = pd.DataFrame(results)

    # Explicit column order for readability.
    cols = [
        'Repo_Name',
        # DTA variants
        'DTA_Standard', 'DTA_Linear', 'DTA_NoWeight', 'DTA_Log', 'DTA_Tanh',
        # CHAOSS metrics
        'CHAOSS_Review_Intensity', 'CHAOSS_Issue_Density',
        # Traditional GT
        'GT_Intensity_Mean', 'GT_Trend_Slope', 'GT_Stability_InvCV',
        # Baselines
        'Base_HAA', 'Base_RM', 'Base_LTE'
    ]
    # Keep only the columns that are actually present, to avoid a KeyError.
    cols = [c for c in cols if c in results_df.columns]
    results_df = results_df[cols]

    # Sort by the standard score.
    results_df = results_df.sort_values(by='DTA_Standard', ascending=False)

    # Replace infinities (e.g. from a zero denominator) with NaN.
    results_df = results_df.replace([np.inf, -np.inf], np.nan)

    save_path = os.path.join(output_dir, 'experiment_master_data_chaoss.csv')
    results_df.to_csv(save_path, index=False)

    print("\n" + "="*60)
    print(f"数据处理完成！")
    print(f"有效样本数: {len(results_df)}")
    print(f"文件已保存至: {save_path}")

    # Preview the key columns.
    print("\n前5行预览 (包含 CHAOSS 指标):")
    print(results_df[['Repo_Name', 'DTA_Standard', 'CHAOSS_Review_Intensity', 'CHAOSS_Issue_Density']].head())
else:
    print("\n没有生成任何结果。请检查 dataset 和 results 路径是否匹配。")

Repository           | Status     | Info
------------------------------------------------------------

数据处理完成！
有效样本数: 100
文件已保存至: workresult/stacking/100dataset_DTA_verification/experiment_master_data_chaoss.csv

前5行预览 (包含 CHAOSS 指标):
     Repo_Name  DTA_Standard  CHAOSS_Review_Intensity  CHAOSS_Issue_Density
26       ImHex     97.814601                 0.427706              0.933574
85     airflow     96.797583                 0.840759              0.518676
16    TDengine     96.359352                 0.742231              0.866947
72        node     96.160136                 0.703533              1.574659
15  localstack     95.870200                 0.974540              1.477044


In [6]:
# Comparison of the DTA validation results
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# 1. Load the data
# NOTE(review): the generation cell of this notebook saves
# 'experiment_master_data_chaoss.csv' in the same folder; confirm that this
# file name is still the intended input and not a stale earlier export.
file_path = 'workresult/stacking/100dataset_DTA_verification/experiment_master_data.csv'


def calc_corr(series_a, series_b):
    """Return ``(rho, p_value)`` of the Spearman correlation of two Series.

    Rows where either Series is NaN are dropped first. When fewer than two
    valid pairs remain, ``(0.0, 1.0)`` is returned as a neutral placeholder.
    """
    valid_idx = series_a.notna() & series_b.notna()
    if valid_idx.sum() < 2:
        return 0.0, 1.0  # too few samples
    corr, p_value = spearmanr(series_a[valid_idx], series_b[valid_idx])
    return corr, p_value


try:
    df = pd.read_csv(file_path)
    print(f"成功加载数据，共 {len(df)} 个有效仓库样本。")
except FileNotFoundError:
    print(f"错误：找不到文件 {file_path}，请确保上一步的数据生成代码已成功运行。")
    # Without this assignment `df` would be undefined here (NameError below),
    # or worse, a stale `df` left over from an earlier cell would be analysed.
    df = None

if df is not None and not df.empty:
    # ==========================================
    # Experiment 1: external validity vs. baselines (main result)
    # ==========================================
    # Compared methods: our DTA and the three baselines.
    # Targets: GT_Intensity_Mean (future activity level) and
    # GT_Trend_Slope (future growth trend).

    methods = {
        'Baseline: HAA (History Avg)': df['Base_HAA'],
        'Baseline: RM (Recent Momentum)': df['Base_RM'],
        'Baseline: LTE (Linear Trend)': df['Base_LTE'],
        'Ours: DTA (Standard)': df['DTA_Standard']
    }

    results_validity = []
    for name, series in methods.items():
        # Correlation with intensity
        corr_int, p_int = calc_corr(series, df['GT_Intensity_Mean'])
        # Correlation with slope
        corr_slope, p_slope = calc_corr(series, df['GT_Trend_Slope'])

        results_validity.append({
            'Method': name,
            'Corr w/ Future Intensity': corr_int,
            'P-val (Intensity)': p_int,
            'Corr w/ Future Slope': corr_slope,
            'P-val (Slope)': p_slope
        })

    df_validity = pd.DataFrame(results_validity)

    # ==========================================
    # Experiment 2: ablation study
    # ==========================================
    # Goal: check whether the sigmoid and the weights are necessary.

    ablations = {
        'Ours: DTA (Standard)': df['DTA_Standard'],
        'Variant: w/o Non-linearity': df['DTA_Linear'],
        'Variant: w/o Expert Weights': df['DTA_NoWeight']
    }

    # Reference correlation of the standard variant; it does not change
    # inside the loop, so it is looked up once.
    std_corr = df_validity[df_validity['Method'] == 'Ours: DTA (Standard)']['Corr w/ Future Intensity'].values[0]

    results_ablation = []
    for name, series in ablations.items():
        corr_int, p_int = calc_corr(series, df['GT_Intensity_Mean'])
        corr_slope, p_slope = calc_corr(series, df['GT_Trend_Slope'])

        # Relative drop (in percent) compared with the standard variant.
        drop = (std_corr - corr_int) / std_corr * 100 if std_corr != 0 else 0

        results_ablation.append({
            'Variant': name,
            'Corr w/ Future Intensity': corr_int,
            'P-value': p_int,
            # Only actual drops are shown; equal or better variants get "-".
            'Performance Drop': f"{drop:.1f}%" if drop > 0 else "-",
            'Corr w/ Future Slope': corr_slope,
            'P-value (Slope)': p_slope
        })

    df_ablation = pd.DataFrame(results_ablation)

    # ==========================================
    # Experiment 3: stability check
    # ==========================================
    # Goal: does DTA favour more stable repositories
    # (positive correlation with GT_Stability_InvCV)?

    corr_stab, p_stab = calc_corr(df['DTA_Standard'], df['GT_Stability_InvCV'])

    # ==========================================
    # Output and saving
    # ==========================================

    print("\n" + "="*20 + " TABLE 1: 外部有效性与基准对比 " + "="*20)
    print(df_validity.round(4).to_string(index=False))

    print("\n" + "="*20 + " TABLE 2: 消融实验 (公式组件分析) " + "="*20)
    print(df_ablation.round(4).to_string(index=False))

    print("\n" + "="*20 + " 稳定性验证结论 " + "="*20)
    print(f"DTA Standard 与 Future Stability (1/CV) 的相关系数: {corr_stab:.4f} (P-value: {p_stab:.4e})")

    # Auto-generated conclusion text
    print("\n" + "="*20 + " 自动分析结论 (Copy to Paper) " + "="*20)
    best_baseline_corr = df_validity[df_validity['Method'].str.contains('Baseline')]['Corr w/ Future Intensity'].max()
    dta_corr = std_corr

    # NOTE(review): this branch only compares two correlation coefficients;
    # no significance test of the difference is performed even though the
    # printed text calls the improvement significant.
    if dta_corr > best_baseline_corr:
        print(f"[优越性] 本文提出的 DTA 指标在预测未来活跃强度方面表现最优 (rho={dta_corr:.3f})，")
        print(f"         显著优于最佳基准方法 ({best_baseline_corr:.3f})，相对提升了 {((dta_corr-best_baseline_corr)/best_baseline_corr*100):.1f}%。")
    else:
        print("[注意] DTA 指标未超过基准方法，请检查模型预测结果或数据分布。")

    if corr_stab > 0.3:  # empirical threshold
        print(f"[稳定性] 此外，DTA 得分与仓库未来的稳定性呈现显著正相关 (rho={corr_stab:.3f})，")
        print("         表明该指标倾向于给予那些长期稳定发展的仓库更高的评分，而非短期爆发项目。")

    # Save the results
    output_path = 'workresult/stacking/100dataset_DTA_verification/final_correlation_analysis.csv'

    # Both tables and the stability figures go into one text file, each
    # section preceded by a title line.
    with open(output_path, 'w') as f:
        f.write("TABLE 1: Validity & Superiority\n")
        df_validity.to_csv(f, index=False)
        f.write("\nTABLE 2: Ablation Study\n")
        df_ablation.to_csv(f, index=False)
        f.write(f"\nStability Check\nCorrelation,{corr_stab}\nP-value,{p_stab}\n")

    print(f"\n详细统计表格已保存至: {output_path}")

else:
    print("无法进行分析，因为数据为空。")

成功加载数据，共 100 个有效仓库样本。

Corr w/ Future Intensity  Corr w/ Future Slope                          Method  P-val (Intensity)  P-val (Slope)
                  0.8715               -0.0447     Baseline: HAA (History Avg)                0.0         0.6588
                  0.8769               -0.1385  Baseline: RM (Recent Momentum)                0.0         0.1695
                  0.7675               -0.2291    Baseline: LTE (Linear Trend)                0.0         0.0219
                  0.9004               -0.0624            Ours: DTA (Standard)                0.0         0.5372

Corr w/ Future Intensity  Corr w/ Future Slope  P-value  P-value (Slope) Performance Drop                      Variant
                  0.9004               -0.0624      0.0           0.5372                -         Ours: DTA (Standard)
                  0.8904               -0.0667      0.0           0.5094             1.1%   Variant: w/o Non-linearity
                  0.9128               -0.0892      0.

In [8]:
# Spearman correlation analysis between the DTA variants (and baselines) and each target variable
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Load the CSV file
file_path = 'workresult/stacking/100dataset_DTA_verification/experiment_master_data_chaoss.csv'
data = pd.read_csv(file_path)

# Variable groups to analyse
dta_columns = ['DTA_Standard', 'DTA_Linear', 'DTA_NoWeight', 'DTA_Log', 'DTA_Tanh',
               'Base_HAA', 'Base_RM', 'Base_LTE']
target_columns = ['CHAOSS_Review_Intensity', 'CHAOSS_Issue_Density', 'GT_Intensity_Mean',
                  'GT_Trend_Slope', 'GT_Stability_InvCV']


def _format_with_stars(corr_val, p_val):
    """Format a correlation with significance stars (* p<0.05, ** p<0.01)."""
    if np.isnan(corr_val):
        return "N/A"
    if p_val < 0.01:
        return f"{corr_val:.4f}**"
    if p_val < 0.05:
        return f"{corr_val:.4f}*"
    return f"{corr_val:.4f}"


# One record per (metric, target) pair
results = []

# Spearman correlation for every combination
for dta_col in dta_columns:
    for target_col in target_columns:
        # Drop rows where either value is NaN
        mask = data[dta_col].notna() & data[target_col].notna()
        if mask.sum() > 2:  # at least 3 points are required
            corr, p_value = spearmanr(data.loc[mask, dta_col], data.loc[mask, target_col])
        else:
            corr, p_value = np.nan, np.nan
        results.append({
            'Metric': dta_col,
            'Target': target_col,
            'Correlation': corr,
            'P_Value': p_value
        })

# Convert to a DataFrame
results_df = pd.DataFrame(results)

# Pivot into metric x target matrices for easier reading
correlation_matrix = results_df.pivot(index='Metric', columns='Target', values='Correlation')
pvalue_matrix = results_df.pivot(index='Metric', columns='Target', values='P_Value')

# Print the correlation matrix
print("Spearman 相关系数矩阵:")
print(correlation_matrix.round(4))
print("\n" + "="*80 + "\n")

# Print the p-value matrix (< 0.05 means statistically significant)
print("P值矩阵 (值<0.05表示统计显著):")
print(pvalue_matrix.round(4))
print("\n" + "="*80 + "\n")

# Combined table with significance stars. It is built once, as an
# object-dtype frame: writing strings into a float64 copy of
# correlation_matrix is deprecated by pandas (incompatible-dtype setitem).
formatted_summary = pd.DataFrame(index=correlation_matrix.index, columns=correlation_matrix.columns)
for i in correlation_matrix.index:
    for j in correlation_matrix.columns:
        formatted_summary.loc[i, j] = _format_with_stars(
            correlation_matrix.loc[i, j], pvalue_matrix.loc[i, j]
        )
combined_table = formatted_summary.copy()

print("综合结果表格 (相关系数 + 显著性标记):")
print("(注: *表示p<0.05, **表示p<0.01)")
print(combined_table)
print("\n" + "="*80 + "\n")

# Save the results as CSV files
output_dir = 'workresult/stacking/100dataset_DTA_verification'

# Detailed long-format results with significance flags
detailed_results = results_df.copy()
detailed_results['Significant_0.05'] = detailed_results['P_Value'] < 0.05
detailed_results['Significant_0.01'] = detailed_results['P_Value'] < 0.01
detailed_results = detailed_results.sort_values(['Metric', 'Target'])

# Correlation matrix, p-value matrix and the formatted summary
detailed_results.to_csv(f'{output_dir}/correlation_analysis_detailed.csv', index=False)
correlation_matrix.to_csv(f'{output_dir}/correlation_matrix.csv')
pvalue_matrix.to_csv(f'{output_dir}/pvalue_matrix.csv')
formatted_summary.to_csv(f'{output_dir}/formatted_summary.csv')

print(f"详细统计结果已保存至:")
print(f"  - {output_dir}/correlation_analysis_detailed.csv")
print(f"  - {output_dir}/correlation_matrix.csv")
print(f"  - {output_dir}/pvalue_matrix.csv")
print(f"  - {output_dir}/formatted_summary.csv")

# Summary of the main findings
print("\n" + "="*80)
print("主要发现摘要:")
print("="*80)

# Metric with the largest absolute correlation with GT_Intensity_Mean
intensity_corr = correlation_matrix['GT_Intensity_Mean'].abs().sort_values(ascending=False)
top_intensity_metric = intensity_corr.index[0]
top_intensity_corr = correlation_matrix.loc[top_intensity_metric, 'GT_Intensity_Mean']
print(f"与未来活跃强度(GT_Intensity_Mean)相关性最高的指标: {top_intensity_metric} (ρ={top_intensity_corr:.4f})")

# Metric with the largest absolute correlation with GT_Stability_InvCV
stability_corr = correlation_matrix['GT_Stability_InvCV'].abs().sort_values(ascending=False)
top_stability_metric = stability_corr.index[0]
top_stability_corr = correlation_matrix.loc[top_stability_metric, 'GT_Stability_InvCV']
print(f"与稳定性(GT_Stability_InvCV)相关性最高的指标: {top_stability_metric} (ρ={top_stability_corr:.4f})")

# How the proposed method (DTA_Standard) performs on every target
dta_standard_row = correlation_matrix.loc['DTA_Standard']
print(f"\n本文提出的方法(DTA_Standard)表现:")
for col in target_columns:
    print(f"  与{col}: ρ={dta_standard_row[col]:.4f}")

Spearman 相关系数矩阵:
Target        CHAOSS_Issue_Density  CHAOSS_Review_Intensity  \
Metric                                                        
Base_HAA                   -0.0246                   0.3197   
Base_LTE                   -0.0241                   0.2803   
Base_RM                    -0.0419                   0.3266   
DTA_Linear                 -0.0958                   0.2969   
DTA_Log                    -0.0906                   0.3028   
DTA_NoWeight               -0.0370                   0.3378   
DTA_Standard               -0.0831                   0.2928   
DTA_Tanh                   -0.0781                   0.2866   

Target        GT_Intensity_Mean  GT_Stability_InvCV  GT_Trend_Slope  
Metric                                                               
Base_HAA                 0.8715              0.8063         -0.0447  
Base_LTE                 0.7675              0.7174         -0.2291  
Base_RM                  0.8769              0.8134         -0.1385  
DT