In [2]:
import pandas as pd

# Load the first experiment table (big-repo verification run)
df1 = pd.read_csv('workresult/stacking/big_repo_DTA_verification/experiment_master_data.csv')

# Keep only Repo_Name and DTA_Standard, exposing DTA_Standard under the name "score".
# .loc with a column list yields a new frame and .rename returns a new frame as well,
# so df1 itself is left untouched.
result_df1 = df1.loc[:, ['Repo_Name', 'DTA_Standard']].rename(
    columns={'DTA_Standard': 'score'}
)

# Load the second experiment table (100-dataset verification run)
df2 = pd.read_csv('workresult/stacking/100dataset_DTA_verification/experiment_master_data.csv')

# Same projection and rename for the second table
result_df2 = df2.loc[:, ['Repo_Name', 'DTA_Standard']].rename(
    columns={'DTA_Standard': 'score'}
)

# Stack both tables into one frame with a fresh 0..n-1 index
combined_df = pd.concat([result_df1, result_df2], ignore_index=True)

# 定义分档函数
def get_score_description(score):
    """Map a numeric activity score to its descriptive text band.

    Bands are half-open on the right, except the last one which includes 100:
    [0, 40), [40, 60), [60, 80), [80, 90), [90, 100].

    Args:
        score: Numeric score to classify.

    Returns:
        The description string of the band containing ``score``, or the
        out-of-range message when ``score`` is not within [0, 100]
        (NaN fails every comparison and therefore also lands there).
    """
    # Anything outside the closed interval [0, 100] has no band.
    if not (0 <= score <= 100):
        return '分数超出范围'

    # (exclusive upper bound, description) pairs in ascending order.
    bands = (
        (40, '活跃度较低，仓库当前活跃度相较历史表现偏低，开发与社区活动不够频繁，维护状态有待提升。'),
        (60, '活跃度约处于历史最佳表现的中间水平，存在一定开发或互动行为，但整体活跃程度有限。'),
        (80, '仓库活跃度处于中等水平，具备较稳定的维护与社区互动，有持续但不密集的开发活动。'),
        (90, '仓库整体活跃状态良好，开发节奏稳定，社区互动较为活跃。维护及时，issue / 讨论 / 提交较为频繁。'),
    )
    for upper_bound, description in bands:
        if score < upper_bound:
            return description

    # Remaining case: 90 <= score <= 100.
    return '仓库活跃度高，开发与社区参与持续活跃，整体运行状态健康。具备良好的持续发展能力。'

# Attach the descriptive band for every score, then order rows from the highest
# score to the lowest. Done as one chain so no frame is modified in place.
combined_df = (
    combined_df
    .assign(score_description=combined_df['score'].apply(get_score_description))
    .sort_values(by='score', ascending=False)
)

# Persist the ranked table (without the index column)
combined_df.to_csv('Frontend-system/data/processed_repo_data.csv', index=False)

# Show the ranked table
print(combined_df)

                                             Repo_Name      score  \
5                                                ImHex  97.992507   
6                                              airflow  96.808508   
0                                         transformers  96.531745   
7                                             TDengine  96.411442   
8                                                 node  96.140598   
9                                           localstack  95.866535   
10                                               gitea  95.345547   
11                                                grpc  95.049114   
1                                               vscode  92.288880   
12                                           PaddleOCR  92.229429   
13                                             next.js  92.149088   
14                                           openpilot  91.769781   
15                                     ionic-framework  91.723296   
16                                

In [None]:
# 找数据的npy文件组织成echart需要的json文件
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime

# --- Configuration ---
folder_path = "dataset/random100_from_21-23-dataset"
PRED_LEN = 84  # number of forecast steps read from each pred.npy
SEQ_LEN = 84  # length of the look-back (history) window

# Load the ranking table written by the previous cell; only Repo_Name is used here
df = pd.read_csv('Frontend-system/data/processed_repo_data.csv')

# Repository names, in the order they appear in the ranking table
repoName = df['Repo_Name'].tolist()

# Root directory for the generated ECharts JSON files
output_dir = 'Frontend-system/data/echart_data'
os.makedirs(output_dir, exist_ok=True)

# Metric names. Position i is paired with column i of the per-repo CSV (after the
# date column is removed) and with channel i of pred.npy.
# NOTE(review): this relies on the CSV column order matching this list - confirm.
metric_names = ['IssueComment', 'openIssue', 'openPR', 'ReviewComment', 'mergePR']

# Repositories whose result directory spells the model name "Dlinear" rather than "DLinear"
special_repos = ['transformers', 'vscode', 'kubernetes', 'react-naive', 'vue']

for name in repoName:
    # --- Build input paths ---
    csv_path = f'{folder_path}/{name}_all_roll_ewma_span28_normalize.csv'

    # Only the capitalisation of the model tag differs between the two path variants
    model_tag = 'Dlinear' if name in special_repos else 'DLinear'
    pred_path = (
        f'results/stacking_attention_fusion_PatchTST_{model_tag}_custom_{name}'
        '_ftM_sl84_ll84_pl84_dm512_nh8_el2_dl1_df2048_expand2_dc4_fc3_ebtimeF_dtTrue_Exp_0/pred.npy'
    )

    # --- Skip repositories with missing inputs ---
    if not os.path.exists(pred_path):
        print(f"{name:<20} | MISSING    | Prediction file not found at {pred_path}")
        continue
    if not os.path.exists(csv_path):
        print(f"{name:<20} | MISSING    | CSV file not found")
        continue

    try:
        # --- Load data ---
        pred = np.load(pred_path)
        # First sample of the prediction array, first PRED_LEN steps, first 5 channels
        pred_data = pred[0, :PRED_LEN, :5]

        df_original = pd.read_csv(csv_path)
        # Bind a per-repository frame in BOTH branches. The original only assigned it
        # when a 'date' column existed, so a CSV without one was sliced from a stale
        # frame (the ranking table or the previous repository's data).
        if 'date' in df_original.columns:
            dates = df_original['date'].tolist()
            metrics_df = df_original.drop(columns=['date'])
        else:
            # No date column: generate placeholder date labels, one per row
            dates = [f"2023-{(i%12)+1:02d}-{(i%28)+1:02d}" for i in range(len(df_original))]
            metrics_df = df_original

        # --- Determine the time windows ---
        dataset_len = len(metrics_df)
        num_test = int(dataset_len * 0.2)  # the last 20% of rows form the test split

        start_index = dataset_len - num_test
        end_index = start_index + PRED_LEN
        hist_start = start_index - SEQ_LEN
        hist_end = start_index

        if hist_start < 0:
            # Fewer than SEQ_LEN rows precede the test split; report it instead of
            # dropping the repository silently.
            print(f"{name:<20} | SKIPPED    | Not enough history before the test split")
            continue

        true_data = metrics_df.iloc[start_index:end_index]  # actual values of the forecast window
        hist_data = metrics_df.iloc[hist_start:hist_end]    # look-back window preceding it

        # Date labels matching the two windows
        hist_dates = dates[hist_start:hist_end]
        true_dates = dates[start_index:end_index]

        # Prediction dates continue day by day after the last actual date.
        # NOTE(review): pred[0] looks like the forecast for the same window as
        # true_data; if so, these dates should be true_dates - confirm with the
        # chart's intended layout before changing.
        last_date_str = true_dates[-1]
        try:
            last_date = datetime.strptime(last_date_str, "%Y-%m-%d")
        except ValueError:
            # Date string is not in YYYY-MM-DD form: fall back to a fixed date
            last_date = datetime(2023, 12, 31)

        pred_dates = [
            (last_date + pd.Timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(1, PRED_LEN + 1)
        ]

        # One sub-directory per repository; created once rather than once per metric
        repo_output_dir = os.path.join(output_dir, name)
        os.makedirs(repo_output_dir, exist_ok=True)

        # --- Write one JSON file per metric ---
        for col_idx, metric_name in enumerate(metric_names):
            metric_data = {
                "repo_name": name,
                "metric_name": metric_name,
                # Look-back window values
                "history": [
                    {"date": date, "value": float(hist_data.iloc[i, col_idx])}
                    for i, date in enumerate(hist_dates)
                ],
                # Actual values of the forecast window
                "actual": [
                    {"date": date, "value": float(true_data.iloc[i, col_idx])}
                    for i, date in enumerate(true_dates)
                ],
                # Model predictions
                "prediction": [
                    {"date": date, "value": float(pred_data[i, col_idx])}
                    for i, date in enumerate(pred_dates)
                ],
            }

            output_file = os.path.join(repo_output_dir, f"{metric_name}_data.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(metric_data, f, ensure_ascii=False, indent=2)

        print(f"{name:<20} | SUCCESS    | All metric JSON files saved to {repo_output_dir}")

    except Exception as e:
        # Best effort: report the failure and move on to the next repository
        print(f"{name:<20} | ERROR      | {str(e)}")

print("数据处理完成，所有JSON文件已保存到", output_dir)

ImHex                | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/ImHex
airflow              | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/airflow
transformers         | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/transformers
TDengine             | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/TDengine
node                 | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/node
localstack           | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/localstack
gitea                | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/gitea
grpc                 | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/grpc
vscode               | SUCCESS    | All metric JSON files saved to Frontend-system/data/echart_data/vscode
PaddleOCR            | SUCCESS