In [1]:
import sys, math, json, pandas as pd, pyarrow.parquet as pq
from pathlib import Path

# Current working directory of the Jupyter kernel
cwd = Path.cwd()

# Assumes the kernel was started in Project/notebooks, so the project root is
# the parent directory. NOTE(review): started from anywhere else, this points
# at the wrong folder and the project imports below will fail.
project_root = cwd.parent

# Put the project root at the front of sys.path so it takes priority over
# other entries; the membership check keeps re-runs of this cell from adding
# duplicates.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from processing_cpp import compile_get_error_info, clear_build_tmp, safe_extract_context
from config import TEMP_OUTPUT_DIR, PARQUETS_DIR, CTX_JSONLS_DIR

In [None]:
# Load the raw dataset; the path is relative to the kernel's working directory.
df = pd.read_parquet("../errors_cpp_codes.parquet")

In [None]:
# Take rows at positions 70000..70999; .copy() so that columns added to the
# slice later do not write into (or warn about) the parent frame.
df_slice = df[70000:71000].copy()

In [None]:
from concurrent.futures import ThreadPoolExecutor


def bulk_get_error_text_batch(codes: list[str], max_workers: int = 4) -> list[tuple[str | None, int | None]]:
    """Run ``compile_get_error_info`` over every snippet using a thread pool.

    Args:
        codes: Source snippets to process.
        max_workers: Size of the thread pool.

    Returns:
        One result per snippet, in the same order as ``codes``. Per the return
        annotation each result is an ``(error_text, error_line)`` pair.
    """
    pool = ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # Executor.map preserves input order; materialise the results before
        # the pool is shut down.
        outcomes = list(pool.map(compile_get_error_info, codes))
    return outcomes

def fill_error_text_in_batches(df : pd.DataFrame, batch_size : int = 500, max_workers : int = 4) -> pd.DataFrame:
    """Fill ``error_text`` / ``error_line`` for every row of ``df``, batch by batch.

    The frame is modified in place and also returned. Missing target columns
    are created first (``error_text`` as object, ``error_line`` as nullable
    ``Int64``).

    Rows are addressed by position rather than by index label, so the function
    also works on frames whose index is non-unique or not a RangeIndex.

    Args:
        df: Frame with a ``source_code`` column.
        batch_size: Number of rows handed to the thread pool at once.
        max_workers: Thread-pool size forwarded to ``bulk_get_error_text_batch``.

    Returns:
        The same ``df`` object with both columns populated.
    """
    if "error_text" not in df.columns:
        df["error_text"] = None
    if "error_line" not in df.columns:
        # An array is assigned positionally; a Series built on a default
        # RangeIndex would instead be aligned against df.index.
        df["error_line"] = pd.array([None] * len(df), dtype="Int64")
    n = len(df)
    if n == 0:
        return df

    text_pos = df.columns.get_loc("error_text")
    line_pos = df.columns.get_loc("error_line")
    source_codes = df["source_code"]

    for start in range(0, n, batch_size):
        stop = min(start + batch_size, n)

        codes = source_codes.iloc[start:stop].tolist()
        results = bulk_get_error_text_batch(codes, max_workers=max_workers)

        # Positional writes: exactly rows start..stop-1, whatever their labels.
        df.iloc[start:stop, text_pos] = [r[0] for r in results]
        df.iloc[start:stop, line_pos] = [r[1] for r in results]

        print(f"{stop}/{n}")

    return df

In [None]:
# Fills error_text / error_line on df_slice in place (the function returns the
# same frame it was given), 500 rows per batch on 6 worker threads.
df_slice = fill_error_text_in_batches(df_slice, batch_size=500, max_workers=6)

In [None]:
# Helper imported from processing_cpp; presumably removes temporary build
# artifacts left by the compile step — TODO confirm against its implementation.
clear_build_tmp()

In [3]:
pd.set_option("display.width", 2000)     # maximum display width, in characters
pd.set_option("display.max_colwidth", None)  # None = do not truncate long cell values

In [None]:
# Compiler error codes (C#### style) that should be kept in the dataset.
codes = {
    'C2065',
    'C3861',
    'C2143',
    'C2146',
    'C2059',
    'C1075',
    'C1083',
    'C2131',
    'C2440',
    'C2446',
    'C2676',
    'C2678',
    'C2679',
    'C2039',
    'C2672',
    'C2144',
    'C2187',
    'C2148',
    'C2064',
    'C2181',
    'C2106',
}

# Sorted so the generated regex is identical between runs (set order is not).
pattern = '|'.join(sorted(codes))
# na=False: rows with a missing error_text count as "no match"; without it the
# mask contains NaN and the boolean indexing below raises.
# NOTE(review): df_correct is not defined in the visible part of the notebook.
msk = df_correct["error_text"].str.contains(pattern, regex=True, na=False)
df_correct = df_correct[msk]

In [2]:
# Load the dataset that was already filtered by error code.
need = pd.read_parquet(PARQUETS_DIR / "data_filtered_error_codes.parquet")

In [3]:
len(need)

225693

In [4]:
# msk2: source contains '#', optional whitespace, then one of the listed
# preprocessor keywords. '#include' is not in the list, so it does not match.
msk2 = need["source_code"].str.contains(r'#\s*(?:define|undef|if|ifdef|ifndef|elif|pragma)', regex=True)
# msk3: rows whose error_code is exactly "C1083".
msk3 = need["error_code"] == "C1083"

In [5]:
# Complementary subset (rows matching either mask), currently disabled:
# need_macros = need[msk2 | msk3]
# Keep only rows that match neither the preprocessor mask nor the C1083 mask.
need_without_macros = need[~msk2 & ~msk3]

In [6]:
len(need_without_macros)

154276

In [37]:
def strip_nones(obj):
    """Recursively prune ``None`` values and emptied containers from JSON-like data.

    Inside dicts and lists, entries that are ``None`` are dropped, and so are
    nested dicts/lists that end up empty after pruning. Other falsy scalars
    (``0``, ``""``, ``False``) are kept. Anything that is not a dict or a list
    is returned unchanged; the top-level container itself is returned even when
    it becomes empty.
    """
    def _is_hollow(value):
        # True for a dict/list that has nothing left in it.
        return isinstance(value, (dict, list)) and not value

    if isinstance(obj, dict):
        pruned_items = (
            (key, strip_nones(value))
            for key, value in obj.items()
            if value is not None
        )
        return {key: value for key, value in pruned_items if not _is_hollow(value)}

    if isinstance(obj, list):
        pruned_values = (strip_nones(value) for value in obj if value is not None)
        return [value for value in pruned_values if not _is_hollow(value)]

    return obj

def clean_jsonl(in_path : str | Path, out_path : str | Path) -> None:
    in_path = Path(in_path)
    out_path = Path(out_path)

    with in_path.open("r", encoding="utf-8") as fin, out_path.open("w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue

            obj = json.loads(line)
            ctx = obj.get("ctx")
            if ctx:
                clean_ctx = strip_nones(ctx)
                obj["ctx"] = clean_ctx

            fout.write(json.dumps(obj, ensure_ascii=False))
            fout.write("\n")

In [None]:
# NOTE(review): a filter_jsonl call later in this notebook writes to the same
# "third_part_without_macros_clean.jsonl" path, so whichever cell runs last
# decides the file's content.
clean_jsonl(CTX_JSONLS_DIR / "third_part_without_macros.jsonl", CTX_JSONLS_DIR / "third_part_without_macros_clean.jsonl")

In [None]:
def export_ctx_parquet_to_jsonl(src_path: str, out_jsonl: str, batch_size: int = 2000) -> None:
    """Stream the index and ``ctx`` columns of a Parquet file into a JSONL file.

    The file is read in record batches. Each batch is converted to pandas, its
    index is turned into a column and renamed from ``index`` to ``row_id``, and
    the rows are appended to ``out_jsonl`` as JSON lines. Progress is printed
    after every batch.

    Args:
        src_path: Parquet file containing ``__index_level_0__`` and ``ctx``.
        out_jsonl: Destination file (created or overwritten).
        batch_size: Maximum number of rows per batch.
    """
    parquet_file = pq.ParquetFile(src_path)
    meta = parquet_file.metadata
    total_rows = meta.num_rows if meta is not None else None

    written = 0

    with open(out_jsonl, "w", encoding="utf-8") as f_out:
        batches = parquet_file.iter_batches(
            columns=["__index_level_0__", "ctx"],
            batch_size=batch_size,
            use_pandas_metadata=True,
        )
        for batch_idx, batch in enumerate(batches, start=1):
            # The restored pandas index becomes an "index" column, then row_id.
            frame = batch.to_pandas().reset_index().rename(columns={"index": "row_id"})

            f_out.write(frame.to_json(orient="records", lines=True, force_ascii=False))
            written += len(frame)

            if total_rows is None:
                print(f"batch {batch_idx}: +{len(frame)} строк")
            else:
                print(f"batch {batch_idx}: {written}/{total_rows} строк")

    print(f"Готово, всего записано {written} строк в {out_jsonl}")

In [None]:
# Both paths are relative to the kernel's working directory.
# NOTE(review): the merge cell reads CTX_JSONLS_DIR / "third_part.jsonl" —
# confirm that it resolves to the file written here.
export_ctx_parquet_to_jsonl("third_part_normalize_source_code.parquet", "third_part.jsonl")

batch 1: 2000/48000 строк
batch 2: 4000/48000 строк
batch 3: 6000/48000 строк
batch 4: 8000/48000 строк
batch 5: 10000/48000 строк
batch 6: 12000/48000 строк
batch 7: 14000/48000 строк
batch 8: 16000/48000 строк
batch 9: 18000/48000 строк
batch 10: 20000/48000 строк
batch 11: 22000/48000 строк
batch 12: 24000/48000 строк
batch 13: 26000/48000 строк
batch 14: 28000/48000 строк
batch 15: 30000/48000 строк
batch 16: 32000/48000 строк
batch 17: 34000/48000 строк
batch 18: 36000/48000 строк
batch 19: 38000/48000 строк
batch 20: 40000/48000 строк
batch 21: 42000/48000 строк
batch 22: 44000/48000 строк
batch 23: 46000/48000 строк
batch 24: 48000/48000 строк
Готово, всего записано 48000 строк в third_part.jsonl


In [8]:
def get_row_ids(ctx_jsonl_path: str | Path) -> list[int]:
    ids = []
    ctx_jsonl_path = Path(ctx_jsonl_path)

    with open(ctx_jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            row_id = rec["row_id"]
            ids.append(row_id)
    return ids

In [11]:
def is_ctx_effectively_empty(obj) -> bool:
    """Tell whether a JSON-like value carries no data at all.

    ``None`` is empty. A dict or list is empty when it has no entries or when
    every value in it is itself empty, checked recursively. Any other value,
    including ``""`` and ``0``, counts as data.
    """
    if obj is None:
        return True

    if isinstance(obj, dict):
        children = obj.values()
    elif isinstance(obj, list):
        children = obj
    else:
        return False

    for child in children:
        if not is_ctx_effectively_empty(child):
            return False
    return True

def filter_jsonl(in_path: str | Path, out_path: str | Path) -> tuple[list[int], list[int]]:
    """Copy a JSONL file, leaving out records whose ``ctx`` is effectively empty.

    Blank lines are skipped and not counted. Each remaining record is parsed
    and its ``ctx`` tested with ``is_ctx_effectively_empty``; records that pass
    are re-serialised into ``out_path``, one per line. A summary line is
    printed at the end.

    Args:
        in_path: JSONL file to read.
        out_path: JSONL file to create or overwrite.

    Returns:
        ``(good_ids, bad_ids)``: the ``row_id`` values (``None`` when the key is
        absent) of the kept and the dropped records, in file order.
    """
    source, target = Path(in_path), Path(out_path)

    good_ids = []
    bad_ids = []
    total = 0

    with source.open("r", encoding="utf-8") as fin, target.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            payload = raw_line.strip()
            if not payload:
                continue

            total += 1
            record = json.loads(payload)

            if is_ctx_effectively_empty(record.get("ctx")):
                bad_ids.append(record.get("row_id"))
            else:
                fout.write(json.dumps(record, ensure_ascii=False))
                fout.write("\n")
                good_ids.append(record.get("row_id"))

    # Each counted record went to exactly one of the two lists.
    kept, dropped = len(good_ids), len(bad_ids)
    print(f"Всего строк: {total}, сохранено: {kept}, удалено пустых ctx: {dropped}")
    return good_ids, bad_ids

In [12]:
# NOTE(review): an earlier clean_jsonl cell targets the same
# "third_part_without_macros_clean.jsonl" path; this call overwrites it with
# records whose ctx has not been passed through strip_nones.
good_ids, bad_ids = filter_jsonl(CTX_JSONLS_DIR / "third_part_without_macros.jsonl", CTX_JSONLS_DIR / "third_part_without_macros_clean.jsonl")

Всего строк: 66277, сохранено: 66267, удалено пустых ctx: 10


In [None]:
def attach_ctx_chunked(df: pd.DataFrame, ctx_jsonl_path: str | Path, ctx_column: str = "ctx", chunksize: int = 10_000) -> pd.DataFrame:
    """
    Читает JSONL батчами и заполняет колонку df[ctx_column]
    по индексам row_id.
    """
    ctx_jsonl_path = Path(ctx_jsonl_path)

    # Если колонки пока нет — создаём
    if ctx_column not in df.columns:
        df[ctx_column] = None

    # Читаем JSONL порциями
    reader = pd.read_json(
        ctx_jsonl_path,
        lines=True,
        chunksize=chunksize,
    )

    total_chunks = 0
    for chunk in reader:
        total_chunks += 1
        if "row_id" not in chunk.columns:
            raise ValueError("В JSONL нет колонки 'row_id'")

        # Ставим row_id как индекс
        chunk = chunk.set_index("row_id")

        # Заполняем df по индексам
        # Предполагается, что индексы df совпадают с row_id
        df.loc[chunk.index, ctx_column] = chunk[ctx_column]

        print(f"Обработан chunk {total_chunks}, строк: {len(chunk)}")

    print(f"Готово, обработано chunk'ов: {total_chunks}")
    return df


In [23]:
def merge_jsonl_files(in_paths, out_path: str | Path, check_duplicates: bool = True) -> None:
    out_path = Path(out_path)
    seen_ids = set()

    total_in = 0
    total_out = 0

    with out_path.open("w", encoding="utf-8") as fout:
        for p in in_paths:
            p = Path(p)
            print(f"Обрабатываем файл: {p}")
            with p.open("r", encoding="utf-8") as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        continue

                    total_in += 1
                    obj = json.loads(line)

                    if "row_id" not in obj:
                        raise ValueError(f"В файле {p} объект без 'row_id': {obj}")
                    if "ctx" not in obj:
                        raise ValueError(f"В файле {p} объект без 'ctx': {obj}")

                    row_id = obj["row_id"]

                    if check_duplicates:
                        if row_id in seen_ids:
                            raise ValueError(
                                f"Дубликат row_id={row_id!r} в файле {p}. "
                                "Ожидалось, что row_id не повторяются."
                            )
                        seen_ids.add(row_id)

                    fout.write(json.dumps(obj, ensure_ascii=False))
                    fout.write("\n")
                    total_out += 1

    print(
        f"Готово. Прочитано строк: {total_in}, "
        f"записано в {out_path}: {total_out}. "
        f"Уникальных row_id: {len(seen_ids) if check_duplicates else 'не проверяли'}"
    )

In [32]:
# Part files to merge, in output order.
files = [
    CTX_JSONLS_DIR / "first_part.jsonl",
    CTX_JSONLS_DIR / "second_part.jsonl",
    CTX_JSONLS_DIR / "third_part.jsonl",
    CTX_JSONLS_DIR / "fourth_part.jsonl",
]
# check_duplicates defaults to True, so a row_id repeated across parts raises.
merge_jsonl_files(files, CTX_JSONLS_DIR / "data_with_macros.jsonl")

Обрабатываем файл: C:\Users\olegk\Downloads\Telegram Desktop\Лабы\Технологии проектирования ПО\Project\notebooks\ctx_jsonls\first_part.jsonl
Обрабатываем файл: C:\Users\olegk\Downloads\Telegram Desktop\Лабы\Технологии проектирования ПО\Project\notebooks\ctx_jsonls\second_part.jsonl
Обрабатываем файл: C:\Users\olegk\Downloads\Telegram Desktop\Лабы\Технологии проектирования ПО\Project\notebooks\ctx_jsonls\third_part.jsonl
Обрабатываем файл: C:\Users\olegk\Downloads\Telegram Desktop\Лабы\Технологии проектирования ПО\Project\notebooks\ctx_jsonls\fourth_part.jsonl
Готово. Прочитано строк: 71400, записано в C:\Users\olegk\Downloads\Telegram Desktop\Лабы\Технологии проектирования ПО\Project\notebooks\ctx_jsonls\data_with_macros.jsonl: 71400. Уникальных row_id: 71400


In [None]:
def ensure_utf8_text(x):
    """Return ``x`` as text that can always be encoded to UTF-8.

    * ``bytes`` / ``bytearray`` are decoded as UTF-8; undecodable bytes become
      U+FFFD.
    * ``str`` values that already encode cleanly are returned unchanged;
      characters that cannot be encoded (e.g. lone surrogates) are replaced
      with ``?``.
    * Anything else (``None``, NaN, numbers, ...) is returned untouched, so the
      function can be mapped over a column that contains missing values.
    """
    # bytes -> read as UTF-8, substituting the replacement character on errors
    if isinstance(x, (bytes, bytearray)):
        return x.decode("utf-8", errors="replace")
    # Non-text values have no .encode(); pass them through instead of crashing.
    if not isinstance(x, str):
        return x
    # str that fails to encode (lone surrogates) -> replace the offending chars
    try:
        x.encode("utf-8")
    except UnicodeEncodeError:
        return x.encode("utf-8", errors="replace").decode("utf-8")
    return x

In [15]:
# Normalise every source_code value so it can be encoded to UTF-8.
# NOTE(review): remain_need_macros is not defined in the visible part of the
# notebook — this cell relies on state created elsewhere.
remain_need_macros["source_code"] = remain_need_macros["source_code"].map(ensure_utf8_text)

In [8]:
from concurrent.futures import ThreadPoolExecutor

def compute_ctx_worker(args : tuple[str, int, int]) -> dict:
    """Extract the context for one snippet, never propagating extraction errors.

    Args:
        args: ``(source_code, error_line, radius)`` packed into one tuple so
            the function can be used with ``Executor.map``.

    Returns:
        Whatever ``safe_extract_context(source_code, error_line, False, radius)``
        returns, or ``{}`` when that call raises; the failure is printed.
    """
    source_code, error_line, radius = args
    try:
        ctx = safe_extract_context(source_code, error_line, False, radius)
    except Exception as e:
        # Best effort: report the failure and fall back to an empty context.
        print(f"error, ctx_failed: {e!r}, line: {error_line}")
        ctx = {}
    return ctx
    
    
def add_ctx_parallel_to_jsonl(df : pd.DataFrame, radius: int, out_jsonl : str | Path, max_workers : int = 4, batch_size : int = 1000) -> None:
    out_jsonl = Path(out_jsonl)
    n = len(df)
    if n == 0:
        print("DataFrame пустой")
        return
    
    row_index = df.index.to_numpy(copy=False)
    source_values = df["source_code"].to_numpy(copy=False)
    line_values = df["error_line"].to_numpy(copy=False)

    total_batches = math.ceil(n / batch_size)
    print(f"Всего строк: {n}, батчей: {total_batches}, max_workers: {max_workers}, batch_size: {batch_size}")

    with open(out_jsonl, "w", encoding="utf-8") as f_out:
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            for b_idx, start in enumerate(range(0, n, batch_size), start=1):
                stop = min(start + batch_size, n)

                batch_row_ids = row_index[start:stop]
                batch_sources = source_values[start:stop]
                batch_lines = line_values[start:stop]

                args_iter = ((s, l, radius) for s, l in zip(batch_sources, batch_lines))
                results = list(ex.map(compute_ctx_worker, args_iter))

                df_batch = pd.DataFrame({
                        "row_id": batch_row_ids,
                        "ctx": results,
                    })
                json_str = df_batch.to_json(
                    orient="records",
                    lines=True,
                    force_ascii=False,
                )
                f_out.write(json_str)
                print(f"[{b_idx}/{total_batches}] обработано строк: {stop}/{n}")

    print("Обработка всех батчей завершена")
    return


def add_ctx_sequential_to_jsonl(df: pd.DataFrame, radius: int, out_jsonl : str | Path, batch_size: int = 1000) -> None:
    out_jsonl = Path(out_jsonl)
    n = len(df)
    if n == 0:
        return

    row_index = df.index.to_numpy(copy=False)
    source_values = df["source_code"].to_numpy(copy=False)
    line_values = df["error_line"].to_numpy(copy=False)

    total_batches = math.ceil(n / batch_size)
    print(f"Всего строк: {n}, батчей: {total_batches}, batch_size: {batch_size} (SEQUENTIAL)")

    with out_jsonl.open("w", encoding="utf-8") as f_out:
        for b_idx, start in enumerate(range(0, n, batch_size), start=1):
            stop = min(start + batch_size, n)

            batch_row_ids = row_index[start:stop]
            batch_sources = source_values[start:stop]
            batch_lines = line_values[start:stop]

            results = []
            for src, line in zip(batch_sources, batch_lines):
                ctx = compute_ctx_worker((src, line, radius))
                results.append(ctx)

            df_batch = pd.DataFrame({
                    "row_id": batch_row_ids,
                    "ctx": results,
                })
            json_str = df_batch.to_json(
                orient="records",
                lines=True,
                force_ascii=False,
            )
            f_out.write(json_str)
            print(f"[{b_idx}/{total_batches}] обработано строк: {stop}/{n}")

    print("Обработка всех батчей (последовательно) завершена")
    return

In [None]:
# Positional arguments: radius=2, batch_size=50.
add_ctx_sequential_to_jsonl(need_without_macros, 2, CTX_JSONLS_DIR / "first_part_without_macros.jsonl", 50)

In [None]:
# Positional arguments: radius=2, max_workers=3, batch_size=2000.
# NOTE(review): remain_need is not defined in the visible part of the notebook
# — this cell relies on state created elsewhere.
add_ctx_parallel_to_jsonl(remain_need, 2, CTX_JSONLS_DIR / "third_part_without_macros.jsonl", 3, 2000)