In [None]:
# 著名仓库对比数据提取 //大型仓库数据量少，在算spearman相关系数的时候无法有差异化结果，所以暂不以其作为表格结果
# 该文件主要是针对openrank比赛需要的代码
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LinearRegression

# ================= Configuration =================
# Repositories to process; each name is substituted into the input file paths.
repoName = ['vue', 'react-naive']

# Per-feature weights. The order of WEIGHT must match the feature (column)
# order of the prediction array it is dotted with.
ISSUECOMMENT_WEIGHT = 0.5252
OPENISSUE_WEIGHT = 2.2235
OPENPR_WEIGHT = 4.0679
PRREVIEWCOMMENT_WEIGHT = 0.7427
MERGEPR_WEIGHT = 2.0339
WEIGHT = np.array([
    ISSUECOMMENT_WEIGHT,
    OPENISSUE_WEIGHT,
    OPENPR_WEIGHT,
    PRREVIEWCOMMENT_WEIGHT,
    MERGEPR_WEIGHT,
])

# Window lengths (in time steps).
SEQ_LEN = 84   # look-back (history) length
PRED_LEN = 84  # prediction length

# Bounds of the weighted score when every feature passes through a sigmoid.
WEIGHT_SUM = WEIGHT.sum()          # approximately 9.5932
MIN_SCORE_STD = 0.5 * WEIGHT_SUM   # all sigmoid outputs at sigmoid(0) = 0.5 -> ~4.7966
MAX_SCORE_STD = 1.0 * WEIGHT_SUM   # all sigmoid outputs at their limit 1.0 -> ~9.5932

# ================= 工具函数 =================
def sigmoid(x):
    """
    Logistic sigmoid, 1 / (1 + e^(-x)).

    :param x: a number or a NumPy array; arrays are processed element-wise
    :return: value(s) in the range [0, 1], same shape as ``x``
    """
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator

def calculate_lte(history_data):
    """
    Linear Trend Extrapolation (LTE) baseline.

    Fits an independent least-squares line (value vs. time-step index) to each
    feature column of the history window, extrapolates every line over the
    next ``n_steps`` steps, and averages the extrapolated values.

    :param history_data: 2-D history window of shape (n_steps, n_features),
        e.g. (84, 5). May be a pandas DataFrame or anything accepted by
        ``np.asarray`` (such as a NumPy array).
    :return: scalar; the mean over all features of each feature's mean
        predicted value across the future ``n_steps`` steps
    :raises ValueError: if ``history_data`` is not 2-D
    """
    # Work on a plain array so both DataFrames and ndarrays are accepted;
    # positional ``.iloc`` access only exists on pandas objects.
    values = np.asarray(history_data)
    if values.ndim != 2:
        raise ValueError(
            f"history_data must be 2-D (n_steps, n_features), got shape {values.shape}"
        )

    n_steps, n_features = values.shape
    X = np.arange(n_steps).reshape(-1, 1)                      # [0, 1, ..., n_steps - 1]
    X_future = np.arange(n_steps, n_steps * 2).reshape(-1, 1)  # [n_steps, ..., 2 * n_steps - 1]

    predictions = []
    model = LinearRegression()

    # Fit one line per feature; the estimator is re-fitted on each iteration.
    for i in range(n_features):
        y = values[:, i]
        model.fit(X, y)
        pred_future = model.predict(X_future)
        predictions.append(pred_future.mean())

    return np.mean(predictions)

# ================= Main logic =================
# For every repository: load the model prediction and the ground-truth series,
# compute the DTA score from the prediction and several reference metrics from
# the ground truth, then export one row per repository to a CSV file.
results = []

print(f"{'Repository':<15} | {'Status':<10} | {'Info'}")
print("-" * 50)

for name in repoName:
    # 1. Build the input paths.
    # The ground-truth series is expected at dataset/{name}/{name}_to2017end.csv
    csv_path = f'dataset/{name}/{name}_to2017end.csv'
    pred_path = f'results/stacking_attention_fusion_PatchTST_Dlinear_custom_{name}_ftM_sl84_ll84_pl84_dm512_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/pred.npy'

    # 2. Skip repositories whose input files are missing.
    if not os.path.exists(pred_path):
        print(f"{name:<15} | MISSING    | Pred file not found, skipping...")
        continue

    if not os.path.exists(csv_path):
        print(f"{name:<15} | MISSING    | CSV file not found, skipping...")
        continue

    try:
        # 3. Load the data.
        pred = np.load(pred_path)
        # Use the prediction of the first test sample only
        # (assumes pred has shape [batch, 84, 5] -- TODO confirm).
        pred_data = pred[0, :PRED_LEN, :5]

        df = pd.read_csv(csv_path)
        if 'date' in df.columns:
            df = df.drop(columns=['date'])

        # 4. Determine the window indices; the last 20% of rows form the test split.
        dataset_len = len(df)
        num_test = int(dataset_len * 0.2)

        # Prediction window (future / ground truth).
        start_index = dataset_len - num_test
        end_index = start_index + PRED_LEN

        # History window (baseline input).
        hist_start = start_index - SEQ_LEN
        hist_end = start_index

        # Bounds checks.
        if hist_start < 0:
            print(f"{name:<15} | WARNING    | History window out of bounds, skipping...")
            continue

        # DataFrame.iloc slicing silently truncates past the end, which would
        # compare a PRED_LEN-step prediction against a shorter ground-truth
        # window (and divide by zero in the slope when only one row remains).
        if end_index > dataset_len:
            print(f"{name:<15} | WARNING    | Future window out of bounds, skipping...")
            continue

        # Extract the sequences.
        true_data = df.iloc[start_index:end_index]  # future ground truth
        hist_data = df.iloc[hist_start:hist_end]    # history input (not used further in this block)

        # ================= Metrics =================

        # --- A. Our metric ---

        # DTA Standard: mean over time -> sigmoid -> weighted sum -> rescale.
        mean_values = np.mean(pred_data, axis=0)
        sigmoid_values = sigmoid(mean_values)
        weighted_sum_std = np.dot(sigmoid_values, WEIGHT)
        # Relative distance above the minimum score, in percent:
        # (x - min) / min * 100, where min = sum(WEIGHT) * 0.5.
        dta_standard = ((weighted_sum_std - MIN_SCORE_STD) / MIN_SCORE_STD) * 100

        # --- B. Ground-truth reference metrics ---

        # 1. GT intensity: mean of all future ground-truth values (activity level).
        gt_intensity = true_data.values.mean()

        # 2. GT slope: per-step change between the first and last row of the
        #    window, averaged over the features (growth trend).
        total_changes = true_data.iloc[-1] - true_data.iloc[0]
        gt_slope = (total_changes / (len(true_data) - 1)).mean()

        # 3. GT stability: inverse of the mean coefficient of variation.
        #    1e-6 is added to the denominators to avoid division by zero.
        means = true_data.mean()
        stds = true_data.std()
        cvs = stds / (means + 1e-6)
        avg_cv = cvs.mean()
        gt_stability = 1 / (avg_cv + 1e-6)

        # 4. Review intensity: total review comments per merged PR.
        #    Assumes column index 3 is review comments and index 4 is merged
        #    PRs (same order as WEIGHT) -- TODO confirm against the CSV header.
        total_reviews = true_data.iloc[:, 3].sum()
        total_merges = true_data.iloc[:, 4].sum()
        chaoss_review_intensity = total_reviews / (total_merges + 1e-9)  # epsilon avoids division by zero

        # ================= Store the result =================
        result_dict = {
            'Repo_Name': name,
            # Ours
            'DTA_Standard': dta_standard,
            # Ground truths
            'GT_Intensity_Mean': gt_intensity,
            'GT_Trend_Slope': gt_slope,
            'GT_Stability_InvCV': gt_stability,
            'CHAOSS_Review_Intensity': chaoss_review_intensity
        }

        results.append(result_dict)
        print(f"{name:<15} | SUCCESS    | DTA: {dta_standard:.2f}")

    except Exception as e:
        # Top-level boundary of the batch run: report the failure for this
        # repository and continue with the next one.
        print(f"{name:<15} | ERROR      | {str(e)}")

# ================= Save the results =================
if results:
    output_dir = 'workresult/stacking/openrank_DTA'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    results_df = pd.DataFrame(results)

    # Sort by DTA score so the ranking is easy to inspect.
    results_df = results_df.sort_values(by='DTA_Standard', ascending=False)

    # Fix the column order of the exported file.
    cols = ['Repo_Name', 'DTA_Standard',
            'GT_Intensity_Mean', 'GT_Trend_Slope', 'GT_Stability_InvCV', 'CHAOSS_Review_Intensity']
    results_df = results_df[cols]

    save_path = os.path.join(output_dir, 'experiment_master_data.csv')
    results_df.to_csv(save_path, index=False)
    print("\n" + "="*50)
    print(f"所有数据处理完成！结果已保存至: {save_path}")
    print(f"成功处理仓库数量: {len(results_df)}")
else:
    print("\n未生成任何结果，请检查文件路径配置。")

Repository      | Status     | Info
--------------------------------------------------
vue             | SUCCESS    | DTA: 95.71
react-naive     | SUCCESS    | DTA: 62.79

所有数据处理完成！结果已保存至: workresult/stacking/openrank_DTA/experiment_master_data.csv
成功处理仓库数量: 2
