In [16]:
import re
import pandas as pd


def read_sheet(ref):
    """Parse the first sheet reference found in *ref*.

    Looks for the first run of digits, optionally followed (after optional
    whitespace) by the literal marker "об.".

    Returns:
        A ``(number, has_ob)`` tuple, where ``number`` is the integer sheet
        number and ``has_ob`` tells whether the "об." marker was present,
        or ``None`` when *ref* contains no digits at all.
    """
    found = re.search(r'(\d+)\s*(об\.)?', ref)
    if found is None:
        return None
    digits, ob_marker = found.groups()
    # The optional group is None when "об." is absent.
    return int(digits), ob_marker is not None


def sheet_to_value(n, ob):
    """Map a sheet ``(n, ob)`` pair onto a single integer position.

    The plain sheet ``n`` maps to ``2 * n`` and its "об." side to
    ``2 * n + 1``, so consecutive sides get consecutive integers.
    """
    position = 2 * n
    if ob:
        position += 1
    return position


def value_to_sheet(v):
    """Convert an integer position into a ``(number, has_ob)`` sheet pair.

    Even positions are plain sheets; odd positions are the "об." side of
    the sheet ``v // 2``.
    """
    number, remainder = divmod(v, 2)
    return number, remainder == 1


def expand_range_piece(segment):
    """Expand one sheet reference, or one dash-separated range, into labels.

    A segment without a dash (en dash, em dash or hyphen) yields at most one
    label. A segment with a dash is split once at the first dash; both ends
    are parsed with ``read_sheet`` and every side between them (inclusive,
    in either direction) is listed.

    Returns:
        A list of labels of the form ``"N"`` or ``"N об."``; an empty list
        when the segment (or either end of a range) has no sheet number.
    """
    def label(number, has_ob):
        return f"{number} об." if has_ob else f"{number}"

    segment = segment.strip()

    if not re.search(r'[–—-]', segment):
        single = read_sheet(segment)
        return [label(*single)] if single else []

    first_text, last_text = re.split(r'[–—-]', segment, maxsplit=1)
    first = read_sheet(first_text.strip())
    last = read_sheet(last_text.strip())
    if not (first and last):
        return []

    low = sheet_to_value(*first)
    high = sheet_to_value(*last)
    # Walk downwards when the range is written from the larger sheet to the
    # smaller one.
    direction = 1 if high >= low else -1
    return [
        label(*value_to_sheet(position))
        for position in range(low, high + direction, direction)
    ]


def extract_sheets(block):
    """Return every sheet label mentioned in a sheet block.

    The block is first stripped of text that must not be read as sheet
    numbers, then scanned for single references and dash-separated ranges;
    each hit is expanded with ``expand_range_piece``.

    Returns:
        A flat list of labels (``"N"`` / ``"N об."``) in order of appearance.
    """
    noise_patterns = (
        r'«[^»]*»',     # quoted text in guillemets
        r'\([^()]*\)',  # parenthesised remarks (descriptions, dates, etc.)
        r'ПД\s*\d+',    # mentions of other ПД units inside the sheet block
        r'л\.\s*',      # the "л." marker itself
    )
    cleaned = block
    for noise in noise_patterns:
        cleaned = re.sub(noise, '', cleaned)

    pattern = r'\d+\s*(?:об\.)?(?:\s*[–—-]\s*\d+\s*(?:об\.)?)?'
    return [
        sheet
        for piece in re.findall(pattern, cleaned)
        for sheet in expand_range_piece(piece)
    ]


def generate_full_pd(pd_name, limit=5):
    """List both sides of sheets 1..limit for the unit named *pd_name*.

    Args:
        pd_name: Prefix placed before every reference, e.g. ``"ПД 834"``.
        limit: Highest sheet number to generate (inclusive).

    Returns:
        ``2 * limit`` references, alternating ``"<pd_name> л. N"`` and
        ``"<pd_name> л. N об."`` for N from 1 to *limit*.
    """
    return [
        ref
        for i in range(1, limit + 1)
        for ref in (f"{pd_name} л. {i}", f"{pd_name} л. {i} об.")
    ]


def expand_pd_interval(start, end, limit=5):
    """Generate full sheet lists for every ПД number from *start* to *end*.

    Both bounds are inclusive; when *end* is smaller than *start* the
    numbers are visited in descending order. Each number is expanded with
    ``generate_full_pd(f"ПД {number}", limit)``.
    """
    direction = 1 if end >= start else -1
    return [
        ref
        for number in range(start, end + direction, direction)
        for ref in generate_full_pd(f"ПД {number}", limit)
    ]


def parse_record(text):
    """Expand a free-text autograph reference into a flat list of sheet refs.

    The text is scanned left to right for three kinds of tokens:

    * an interval of units ("ПД A – ПД B"), expanded with
      ``expand_pd_interval``;
    * a single unit ("ПД N"), which becomes the current unit for the sheet
      blocks that follow it;
    * a sheet block ("л. ..."), expanded with ``extract_sheets`` and
      attributed to the current unit. Sheet blocks that appear before any
      single unit, or right after an interval, are ignored.

    A unit that never receives explicit sheets in the record is expanded
    with ``generate_full_pd``; a unit that has sheets somewhere in the record
    (or is covered by an interval) contributes nothing for its bare mentions.

    Matching is case-insensitive, and the unit numbers are taken directly
    from the token match, so lowercase spellings such as "пд 12" are handled
    like "ПД 12".

    Args:
        text: Cell value. Anything ``pd.isna`` treats as missing yields
            ``[]``; other values are converted with ``str``.

    Returns:
        A list of references such as ``"ПД 834 л. 12 об."``, in the order
        their tokens occur in the text.
    """
    if pd.isna(text):
        return []

    s = str(text)

    # "в тетр. 834" -> "в тетр. ПД 834"
    s = re.sub(r'(тетр\.\s*)(\d+)', r'\1ПД \2', s)
    # "ПД, ф. 244" -> "ПД 244"
    s = re.sub(r'ПД,\s*ф\.\s*(\d+)', r'ПД \1', s)

    # The numbers are captured by the tokenizer itself. Re-parsing the
    # matched text with a separate case-sensitive regex would disagree with
    # the IGNORECASE flag used here.
    token_pattern = re.compile(
        r'ПД\s*(?P<first>\d+)\s*[–—-]\s*ПД\s*(?P<last>\d+)|'  # ПД A – ПД B
        r'ПД\s*(?P<single>\d+)|'                              # a single ПД
        # Sheet block: runs up to the next ПД, a ';' or the end of the text,
        # and never swallows a following "ПД ...".
        r'(?P<leaf>л\.\s*(?:(?!ПД).)*?)(?=(?:ПД|;|$))',
        flags=re.IGNORECASE | re.DOTALL,
    )

    tokens = []          # (kind, payload) pairs in order of appearance
    pd_has_sheets = {}   # unit name -> whether explicit sheets were seen
    current_pd = None

    # ---------- PASS 1: tokenise ----------
    for m in token_pattern.finditer(s):
        if m.group('first') is not None:
            a, b = int(m.group('first')), int(m.group('last'))
            tokens.append(("interval", (a, b)))
            step = 1 if b >= a else -1
            for n in range(a, b + step, step):
                pd_has_sheets[f"ПД {n}"] = True
            # Sheets written after an interval belong to no single unit.
            current_pd = None
            continue

        if m.group('single') is not None:
            current_pd = f"ПД {m.group('single')}"
            pd_has_sheets.setdefault(current_pd, False)
            tokens.append(("pd", current_pd))
            continue

        block = m.group('leaf')
        if block and current_pd is not None:
            sheets = extract_sheets(block)
            if sheets:
                # Keep the expanded list so pass 2 need not parse it again.
                tokens.append(("sheets", (current_pd, sheets)))
                pd_has_sheets[current_pd] = True

    # ---------- PASS 2: assemble the result in token order ----------
    result = []
    for kind, payload in tokens:
        if kind == "interval":
            result.extend(expand_pd_interval(*payload))
        elif kind == "pd":
            # Expand fully only if the unit has no sheets in this record.
            if not pd_has_sheets.get(payload, False):
                result.extend(generate_full_pd(payload))
        else:
            pd_name, sheets = payload
            result.extend(f"{pd_name} л. {sref}" for sref in sheets)

    return result


def convert_file(path_in, path_out):
    """Parse the 'autographs' column of a CSV file and save the result.

    Reads *path_in* as UTF-8, adds an 'autographs_parsed' column holding the
    ``parse_record`` output of each 'autographs' cell joined with ``'; '``,
    and writes the table to *path_out* as UTF-8 with BOM, without the index.
    """
    def joined_refs(cell):
        return '; '.join(parse_record(cell))

    table = pd.read_csv(path_in, encoding='utf-8')
    table['autographs_parsed'] = table['autographs'].apply(joined_refs)
    table.to_csv(path_out, index=False, encoding='utf-8-sig')


In [17]:
# Run the conversion for the variant 5 data set.
convert_file(path_in="variant_5.csv", path_out="parsed_variant_5.csv")