In [2]:
# Run the conversion script with ../data/taxonomy.jsonl as its input and
# ../data/label_taxonomy.jsonl as the file it writes (the script's own
# summary line reports that output path).
# NOTE(review): the loading cell that follows reads ../data/label.jsonl,
# not the file written here - confirm that is intended.
!python ../src/ConvertToJsonlMulti.py ../data/taxonomy.jsonl ../data/label_taxonomy.jsonl

Processed 1 input file(s); wrote 32 object(s) → ../data/label_taxonomy.jsonl


In [2]:
import json
from collections import Counter

# Path to the JSONL label file (one JSON object per line).
jsonl_path = "../data/label.jsonl"

# Load every record. Blank lines and `//` comment lines are skipped, matching
# the path-check cell that re-reads this same file, so both cells agree on
# which lines count as records.
records = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        # A UTF-8 BOM is not whitespace, so strip() alone would leave it on
        # the first line and make json.loads() reject an otherwise valid record.
        line = line.strip().lstrip("\ufeff")
        if not line or line.startswith("//"):
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the bad line and keep going; one malformed line should
            # not abort the whole load.
            print(f"JSONデコードエラー: {e}")
            print(f"問題の行: {line}")
        else:
            records.append(obj)

# Sanity check: record count and a pretty-printed first record.
print(f"読み込んだ件数: {len(records)}")
if records:
    # Guarded so an empty (or fully malformed) file does not raise IndexError.
    print("1件目のデータ:")
    print(json.dumps(records[0], indent=2, ensure_ascii=False))

# Example: number of records per TARGET-CWE-ID (records lacking the key are
# counted under None).
cwe_counts = Counter(rec.get("TARGET-CWE-ID") for rec in records)
print("\nTARGET-CWE-IDごとの件数:")
for cwe, count in cwe_counts.items():
    print(f"{cwe}: {count}")

読み込んだ件数: 150
1件目のデータ:
{
  "TARGET-CWE-ID": "CWE-416",
  "FLAWS": "Use After Free",
  "TEST-ID": "501114",
  "LANGUAGE": "C",
  "STATE": "bad",
  "STATUS": "accepted",
  "OCCURRENCES": [
    {
      "PATH": "../data/wireshark-1.2-buggy/epan/dissectors/packet-http.c",
      "TRIGGER_LINE": [
        2256
      ],
      "CWE-ID": "CWE-416",
      "FLAWS": "Use After Free"
    },
    {
      "PATH": "../data/wireshark-1.2-buggy/epan/dissectors/packet-http.c",
      "TRIGGER_LINE": [
        2058
      ],
      "CWE-ID": "CWE-416",
      "FLAWS": "Use After Free"
    },
    {
      "PATH": "../data/wireshark-1.2-buggy/epan/range.c",
      "TRIGGER_LINE": [
        286
      ],
      "CWE-ID": "CWE-416",
      "FLAWS": "Use After Free"
    }
  ]
}

TARGET-CWE-IDごとの件数:
CWE-416: 30
CWE-119: 30
CWE-476: 30
CWE-190: 30
CWE-78: 30


In [3]:
import json
from pathlib import Path

# ==== Settings ====
# Directory against which relative PATH entries are resolved.
base_dir = Path(".")

# NOTE: `jsonl_path` is not defined in this cell; it must already exist in the
# kernel (the JSONL loading cell assigns it).

total = 0            # occurrences that carried a PATH and were checked
ok_count = 0         # checked paths that exist on disk
ng_count = 0         # checked paths that do not exist
missing_paths = []   # one formatted "[NG] ..." message per missing path

with open(jsonl_path, "r", encoding="utf-8") as f:
    for lineno, line in enumerate(f, 1):
        # Drop surrounding whitespace and a possible UTF-8 BOM on the first line.
        s = line.strip().lstrip("\ufeff")
        if not s or s.startswith("//"):
            continue  # blank line or `//` comment line
        try:
            rec = json.loads(s)
        except json.JSONDecodeError as e:
            print(f"[WARN] JSON parse error at line {lineno}: {e}")
            continue

        test_id = rec.get("TEST-ID")
        cwe_id = rec.get("TARGET-CWE-ID")

        # dict.get's default only applies when the key is absent; `or []` also
        # covers an explicit JSON null, which would otherwise raise TypeError
        # in the loop below.
        occs = rec.get("OCCURRENCES") or []
        for occ in occs:
            path = occ.get("PATH")
            if not path:
                continue  # occurrence without a path: nothing to check
            total += 1
            file_path = Path(path)
            if not file_path.is_absolute():
                file_path = base_dir / file_path
            if file_path.exists():
                ok_count += 1
                print(f"[OK] {file_path}")
            else:
                ng_count += 1
                msg = f"[NG] {file_path} (TEST-ID={test_id}, CWE={cwe_id})"
                print(msg)
                missing_paths.append(msg)

print("\n=== Summary ===")
print(f"Checked: {total} | OK: {ok_count} | NG: {ng_count}")
if missing_paths:
    # Repeat at most the first 20 missing paths so the summary stays readable.
    print("\nMissing paths:")
    for m in missing_paths[:20]:
        print("  " + m)
    if len(missing_paths) > 20:
        print(f"  ... and {len(missing_paths)-20} more")


[OK] ..\data\wireshark-1.2-buggy\epan\dissectors\packet-http.c
[OK] ..\data\wireshark-1.2-buggy\epan\dissectors\packet-http.c
[OK] ..\data\wireshark-1.2-buggy\epan\range.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.l
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.l
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.l
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.l
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.l
[OK] ..\data\wireshark-1.2-buggy\epan\radius_dict.c
[OK] ..\data\wireshark-1.2-buggy\epan\dissectors\packet-infiniband.c
[OK] ..\data\wireshark-1.2-buggy\epan\tvbuff.c
[OK] ..\data\wireshark-1.2-buggy\epan\dissectors\packet-rsvp.c
[OK] ..\data\wireshark-1.2-buggy\epan\dissectors\packet-rsvp.c
[OK] ..\data\w