In [None]:
%pip install openpyxl

In [None]:
import json,os
from collections import Counter, defaultdict
from openpyxl import Workbook

In [2]:
# Projects under study: project name -> directory path (presumably the local
# checkout of each repository -- confirm).
# The loading cell uses only the names (to locate "raw data\<name>.json"); the
# paths are machine-specific and are not read anywhere in the visible cells.
projects = {
    "dubbo": "C:\\java tool\\Apache\\dubbo",
    "cloudstack": "C:\\java tool\\Apache\\cloudstack",
    "druid": "C:\\java tool\\Apache\\druid",
    "kiota-java": "C:\\java tool\\Microsoft\\kiota-java",
    "spring-integration": "C:\\java tool\\Spring\\spring-integration",
    "spring-security": "C:\\java tool\\Spring\\spring-security",
}

In [None]:
# Load each project's extracted mock records and rank the distinct mock
# patterns by how often they occur across all projects.
#
# Names produced by this cell:
#   raw_data[project]  -> list of record dicts exactly as stored in the JSON file
#   data[project]      -> stripped 'mockPattern' string of every record
#   data['all']        -> the same strings pooled over all projects
#   pattern_counts     -> pattern -> occurrences, most frequent first
#   pattern_ids        -> pattern -> 1-based frequency rank (1 = most frequent)
#
# NOTE: pattern_ids is keyed by the *stripped* pattern text, while raw_data keeps
# the records untouched -- strip a record's 'mockPattern' before looking it up.
data = {'all': []}
raw_data = {}
for project in projects:
    # Build the path with os.path.join instead of a hard-coded "\\" so the
    # cell is not tied to Windows.
    with open(os.path.join("raw data", f"{project}.json"), 'r', errors='replace') as file:
        result = json.load(file)
    raw_data[project] = result
    project_patterns = [s['mockPattern'].strip() for s in result]
    data[project] = project_patterns
    data['all'].extend(project_patterns)

target = data['all']
pattern_counter = Counter(target)
# most_common() sorts by descending count; ties keep first-seen order.
pattern_counts = dict(pattern_counter.most_common())
print(f"there are {len(pattern_counts)} patterns in total")
print("#" * 30)

# Formatted report: one entry per pattern with its share of all records.
# The position in the ranking doubles as the pattern's ID.
pattern_ids = {}
for rank, (pattern, count) in enumerate(pattern_counts.items(), start=1):
    print(f"Pattern #{rank}\n appears {count} ({round(count*100/len(target))}%) times :\n")
    print(f"{pattern}")
    print("─" * 30)
    pattern_ids[pattern] = rank

there are 77 patterns in total
##############################
Pattern #1
 appears 2866 (25%) times :

Creation:
— Local Mock Creation in Test Case

Stubbing:
— Test Case

Verification:
— None
──────────────────────────────
Pattern #2
 appears 2495 (22%) times :

Creation:
— Local Mock Creation in Test Case

Stubbing:
— None

Verification:
— None
──────────────────────────────
Pattern #3
 appears 2107 (19%) times :

Creation:
— Attribute Mock Creation

Stubbing:
— None

Verification:
— None
──────────────────────────────
Pattern #4
 appears 942 (8%) times :

Creation:
— Attribute Mock Creation

Stubbing:
— Test Case

Verification:
— None
──────────────────────────────
Pattern #5
 appears 668 (6%) times :

Creation:
— Local Mock Creation in Test Case

Stubbing:
— None

Verification:
— Test Case
──────────────────────────────
Pattern #6
 appears 548 (5%) times :

Creation:
— Local Mock Creation in Test Case

Stubbing:
— Test Case

Verification:
— Test Case
──────────────────────────────
P

In [None]:
# Root directory (relative to the working directory) under which all
# per-project folders, JSON files and Excel summaries are written.
OUTPUT_DIR = "mock_pattern_output"

# Characters reserved in Windows file names, mapped to their replacement.
# '?' becomes '-' (unchanged from the previous behaviour); all others become '_'.
_FILENAME_REPLACEMENTS = str.maketrans({
    '<': '_', '>': '_', ':': '_', '"': '_',
    '/': '_', '\\': '_', '|': '_', '*': '_',
    '?': '-',
})


def sanitize(name):
    """Return ``name`` with characters that are illegal in file names replaced.

    ``< > :`` map to ``_`` and ``?`` maps to ``-`` as before; the remaining
    reserved characters (``" / \\ | *``) are now replaced with ``_`` as well, so
    a name can never introduce an extra path component or an invalid file name.

    Args:
        name: the raw name (e.g. a project name or a variable type).

    Returns:
        The name with every reserved character substituted; other characters
        are left untouched.
    """
    return name.translate(_FILENAME_REPLACEMENTS)

def _write_summary_workbook(path, sheet_title, label_header, stats, top_n):
    """Write a one-sheet Excel summary: one row per key of ``stats`` plus a TOTAL row.

    ``stats`` maps a row label to a ``{bucket: count}`` mapping whose buckets are
    ``"pattern 1"`` .. ``"pattern <top_n>"`` and ``"other"``.
    """
    bucket_keys = [f"pattern {i}" for i in range(1, top_n + 1)] + ["other"]

    wb = Workbook()
    ws = wb.active
    ws.title = sheet_title
    ws.append([label_header, "Total"] + [f"Pattern{i}" for i in range(1, top_n + 1)] + ["Other"])

    # One running sum per numeric column: "Total", every pattern column and
    # "Other" (the last one used to be left out of the TOTAL row).
    column_totals = [0] * (len(bucket_keys) + 1)
    for label, counts in stats.items():
        values = [sum(counts.values())] + [counts.get(key, 0) for key in bucket_keys]
        ws.append([label] + values)
        column_totals = [total + value for total, value in zip(column_totals, values)]

    ws.append(["TOTAL"] + column_totals)
    wb.save(path)


def process_data(raw_data, top_n=15):
    """Group mock records by project, variable type and pattern, then export them.

    For every project in ``raw_data`` the following is written under ``OUTPUT_DIR``:

    * a project directory containing one sub-directory per ``variableType``,
      named ``<rank>-<type>-<count>`` (ranked by descending count, then name);
    * inside each, one ``<rank>-<bucket>-<count>.json`` file per pattern bucket
      holding the matching records;
    * ``<project>_summary.xlsx`` with the per-type counts of every bucket.

    Finally ``global_summary.xlsx`` with the per-project counts is written to
    ``OUTPUT_DIR``.

    A record's pattern ID is looked up in the module-level ``pattern_ids``. IDs in
    ``1..top_n`` get their own ``"pattern <id>"`` bucket; all others share ``"other"``.

    Args:
        raw_data: mapping of project name -> list of record dicts. Every record
            must provide the keys ``'variableType'`` and ``'mockPattern'``.
        top_n: how many of the top-ranked pattern IDs are reported individually.

    Side effects:
        Adds a ``'pattern_id'`` key to every record of ``raw_data`` in place; the
        exported JSON files contain that key.

    Raises:
        KeyError: if a record lacks a required key or its stripped
            ``'mockPattern'`` is not present in ``pattern_ids``.
    """
    global_stats = defaultdict(lambda: defaultdict(int))

    # The global report is saved here even when raw_data has no projects.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for project, entries in raw_data.items():
        examples = defaultdict(lambda: defaultdict(list))  # vtype -> bucket -> records
        type_counter = defaultdict(int)
        type_pattern_counter = defaultdict(lambda: defaultdict(int))

        for entry in entries:
            vtype = entry['variableType']
            # pattern_ids is keyed by the stripped pattern text, so strip before
            # the lookup; otherwise surrounding whitespace raises KeyError.
            pattern_id = pattern_ids[entry['mockPattern'].strip()]

            pattern_key = f"pattern {pattern_id}" if 0 < pattern_id <= top_n else "other"
            entry['pattern_id'] = pattern_id

            examples[vtype][pattern_key].append(entry)
            type_counter[vtype] += 1
            type_pattern_counter[vtype][pattern_key] += 1
            global_stats[project][pattern_key] += 1

        # Create the project directory.
        project_name = sanitize(project)
        project_dir = os.path.join(OUTPUT_DIR, project_name)
        os.makedirs(project_dir, exist_ok=True)

        # Rank variable types by descending count, ties broken by name.
        ranked_types = sorted(type_counter.items(), key=lambda item: (-item[1], item[0]))

        for rank, (vtype, type_count) in enumerate(ranked_types, start=1):
            type_dir = os.path.join(project_dir, f"{rank}-{sanitize(vtype)}-{type_count}")
            os.makedirs(type_dir, exist_ok=True)

            # Rank this type's pattern buckets by descending count
            # (ties keep first-seen order).
            ranked_patterns = sorted(type_pattern_counter[vtype].items(),
                                     key=lambda item: -item[1])

            # Write one JSON file per bucket with the actual records.
            for p_rank, (p_key, p_count) in enumerate(ranked_patterns, start=1):
                fname = f"{p_rank}-{p_key}-{p_count}.json"
                with open(os.path.join(type_dir, fname), 'w') as f:
                    json.dump(examples[vtype][p_key], f, indent=2)

        # Project-level Excel report. It is saved into the directory created
        # above (sanitized name), not one rebuilt from the raw project name.
        _write_summary_workbook(
            os.path.join(project_dir, f"{project_name}_summary.xlsx"),
            f"{project}_Summary",
            "Dependency",
            type_pattern_counter,
            top_n,
        )

    # Cross-project Excel report.
    _write_summary_workbook(
        os.path.join(OUTPUT_DIR, "global_summary.xlsx"),
        "Summary",
        "Project",
        global_stats,
        top_n,
    )

# Entry point: generate all reports when this cell/script is executed directly.
if __name__ == "__main__":
    # Make sure the output root exists before any report is written.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # raw_data is the project -> records mapping loaded by the earlier loading cell.
    process_data(raw_data)
