In [1]:
# Load the similarity results, keep only the best-scoring row for every Index,
# and write the de-duplicated table (ordered by Index) back to the same CSV.
import ast

import pandas as pd

RESULTS_CSV = 'chain_similarity_results.csv'


def _final_score(cell):
    """Return the ranking score stored in one 'Static Similarity' cell.

    String cells are parsed as a Python literal and their ``'final_score'``
    entry is returned. Any non-string cell (for example the NaN pandas
    produces for an empty CSV field) is ranked as 0.
    """
    if isinstance(cell, str):
        # literal_eval accepts only Python literals; eval() would execute any
        # expression that happens to be stored in the CSV.
        return ast.literal_eval(cell)['final_score']
    return 0


df = pd.read_csv(RESULTS_CSV)

df = (
    df.sort_values(
        'Static Similarity',
        key=lambda column: column.map(_final_score),
        ascending=False,
        # Stable sort: rows with equal scores keep their file order, so the
        # row retained by drop_duplicates is deterministic.
        kind='stable',
    )
    # After the descending sort the first row of each Index is its best one.
    .drop_duplicates(subset=['Index'], keep='first')
    .sort_values('Index', kind='stable')
)

df.to_csv(RESULTS_CSV, index=False)


In [2]:
# List the distinct values found in the Index column.
distinct_indices = df["Index"].unique()
print(distinct_indices)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210]


In [3]:
import os

# Directory scanned for files named cfg_<Index>.json.
CFG_DIR = 'cfg'
# Safety switch: while True, orphaned files are only reported, never removed.
DRY_RUN = True

# Index values that still have a row in the results table.
existing_indices = set(df['Index'].unique())

# Only files following the cfg_<Index>.json naming scheme are considered.
cfg_files = [f for f in os.listdir(CFG_DIR) if f.startswith('cfg_') and f.endswith('.json')]

for cfg_file in cfg_files:
    # Strip the fixed prefix and suffix by position, so a name that merely
    # contains 'cfg_' or '.json' somewhere in the middle is not mis-parsed.
    stem = cfg_file[len('cfg_'):-len('.json')]
    try:
        index = int(stem)
    except ValueError:
        print(f"无法从文件名解析index: {cfg_file}")
        continue

    if index in existing_indices:
        continue

    # The file has no matching row in the DataFrame.
    if DRY_RUN:
        print(f"[dry-run] 将删除不存在index的文件: {cfg_file}")
    else:
        os.remove(os.path.join(CFG_DIR, cfg_file))
        print(f"删除了不存在index的文件: {cfg_file}")


## 清理静态相似度过低的数据

In [4]:
import ast
import os

# Directory holding the cfg files; the name pattern cfg_<Index>.json matches
# the one used when the directory is scanned for orphaned files.
CFG_DIR = 'cfg'
# Rows whose rounded static final_score is below this value are reported.
MIN_STATIC_SCORE = 60
# Safety switch: while True, matching files are only reported, never removed.
DRY_RUN = True


def _static_score(cell):
    """Return the ``final_score`` of a 'Static Similarity' cell, rounded to 1 decimal.

    String cells are parsed as a Python literal; non-string cells are returned
    unchanged (a NaN compares False against the threshold and is skipped).
    """
    if isinstance(cell, str):
        # literal_eval accepts only Python literals; eval() would execute any
        # expression that happens to be stored in the CSV.
        return round(ast.literal_eval(cell)['final_score'], 1)
    return cell


static_similarities = df['Static Similarity'].apply(_static_score)

low_static = df[static_similarities < MIN_STATIC_SCORE]
# This cell only inspects the rows; df and the CSV on disk are not modified.
for _, row in low_static.iterrows():
    index = row['Index']
    cfg_file = f'{CFG_DIR}/cfg_{index}.json'
    print(cfg_file)
    print(row['Static Similarity'])
    if os.path.exists(cfg_file):
        if DRY_RUN:
            print(f"[dry-run] 将删除 {cfg_file}")
        else:
            os.remove(cfg_file)
            print(f"删除 {cfg_file}")

In [4]:
## 重新计算LLM相似度
from multiprocessing import cpu_count
from llm import get_llm_answers

def compare_code_similarity(original_code: str, code: str):
    """Ask the LLM to rate how similar two code snippets are.

    A prompt embedding both snippets is sent through ``get_llm_answers`` with
    model ``"gpt-4o-2024-11-20"`` and ``require_json=True``. The prompt asks
    for a JSON object holding a 0-100 ``score`` and a textual ``analysis``.

    Args:
        original_code: The reference snippet.
        code: The generated snippet to compare against the reference.

    Returns:
        Whatever ``get_llm_answers`` returns, passed through unchanged.
    """
    intro = (
        "Please compare the similarity between these two code snippets and provide:\n"
        "1. A similarity score from 0-100\n"
        "2. A detailed analysis explaining the score\n"
        "\n"
        "Original code:\n"
    )
    generated_header = "\n\nGenerated code:\n"
    # Trailing spaces inside these literals are part of the prompt text.
    instructions = (
        "\n"
        "\n"
        "Your response must be in JSON format like this:\n"
        "{\n"
        '    "score": 80, \n'
        '    "analysis": ""\n'
        "}\n"
        "\n"
        "The score is based on the following criteria:\n"
        "- Implementation details\n"
        "- Code structure and organization  \n"
        "- Variable naming and coding style\n"
        "- Error handling and edge cases\n"
        "- Overall functionality and behavior\n"
    )
    prompt = intro + original_code + generated_header + code + instructions
    return get_llm_answers(prompt, model_name="gpt-4o-2024-11-20", require_json=True)

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def process_row(args):
    """Run the LLM similarity comparison for a single row.

    Args:
        args: An ``(index, row)`` pair such as the items yielded by
            ``DataFrame.iterrows()``; ``row`` must provide the keys
            'Original Code' and 'Generated Code'.

    Returns:
        ``(index, response)`` where ``response`` is the value returned by
        ``compare_code_similarity`` for the two snippets.
    """
    row_label, record = args
    reference_src, candidate_src = record['Original Code'], record['Generated Code']
    print(f"Processing row {row_label}")
    return row_label, compare_code_similarity(reference_src, candidate_src)

# Recompute the LLM similarity for every row using a pool of worker threads.
tasks = list(df.iterrows())

with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    # executor.map yields results in submission order; wrapping it in tqdm
    # shows a progress bar while list() drains the iterator.
    progress = tqdm(executor.map(process_row, tasks), total=len(tasks))
    results = list(progress)

# Store each response in the row it was computed from.
for row_label, llm_response in results:
    df.at[row_label, 'LLM Similarity'] = llm_response

df.to_csv('similarity_results.csv', index=False)

Processing row 6
Processing row 93
Processing row 88
Processing row 168
Processing row 102
Processing row 163
Processing row 84
Processing row 26
Processing row 29
Processing row 38
Processing row 187
Processing row 2
Processing row 15
Processing row 12
Processing row 70
Processing row 35
Processing row 58
Processing row 66
Processing row 0
Processing row 73
Processing row 120
Processing row 19
Processing row 8
Processing row 3
Processing row 14
Processing row 110
Processing row 65
Processing row 193
Processing row 144
Processing row 172
Processing row 21
Processing row 96
Processing row 76
Processing row 60
Processing row 20
Processing row 179
Processing row 43
Processing row 9
Processing row 86
Processing row 77
Processing row 183
Processing row 94
Processing row 114
Processing row 194
Processing row 87
Processing row 46
Processing row 181
Processing row 104
Processing row 105
Processing row 132
Processing row 27
Processing row 57
Processing row 180
Processing row 198
Processing row 

  0%|          | 1/200 [00:04<16:09,  4.87s/it]

Processing row 32
Processing row 61
Processing row 74
Processing row 169
Processing row 34
Processing row 55
Processing row 42
Processing row 56
Processing row 37
Processing row 71
Processing row 49
Processing row 196Processing row 166

Processing row 136
Processing row 101
Processing row 154
Processing row 178
Processing row 82
Processing row 63
Processing row 148
Processing row 128


  1%|          | 2/200 [00:21<39:34, 11.99s/it]

Processing row 75
Processing row 176
Processing row 92
Processing row 52
Processing row 103
Processing row 145
Processing row 80
Processing row 68
Processing row 137
Processing row 141
Processing row 67Processing row 165

Processing row 151
Processing row 149
Processing row 189
Processing row 171
Processing row 118
Processing row 162


  2%|▏         | 3/200 [00:27<29:58,  9.13s/it]

Processing row 62
Processing row 150
Processing row 130
Processing row 48
Processing row 161
Processing row 139
Processing row 109
Processing row 184
Processing row 158
Processing row 115
Processing row 186
Processing row 111
Processing row 116
Processing row 143
Processing row 97
Processing row 138
Processing row 142
Processing row 106
Processing row 174
Processing row 155
Processing row 152
Processing row 197
Processing row 167
Processing row 146
Processing row 99
Processing row 157
Processing row 190
Processing row 164
Processing row 156
Processing row 95


  2%|▏         | 4/200 [00:36<29:26,  9.01s/it]

Processing row 131Processing row 173
Processing row 133



  2%|▎         | 5/200 [00:37<20:13,  6.22s/it]

Processing row 117
Processing row 147
Processing row 159
Processing row 170
Processing row 119
Processing row 126
Processing row 140
Processing row 153
Processing row 127
Processing row 123
Processing row 160
Processing row 182
Processing row 175
Processing row 185
Processing row 177
Processing row 121


100%|██████████| 200/200 [01:04<00:00,  3.11it/s]
