In [1]:
# Path of the bibliography (.bib) file that the cells below open and scan line by line.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
big_bib = "/home/alebg/philosophie-ch/Dropbox/philosophie-ch/dltc-biblio/biblio.bib"

In [2]:
def extract_bibkey(line: str) -> str:
    """Pull the bibkey out of a single line of the bibliography file.

    The key is taken to be the text that follows the first ``{`` on the line,
    cut off at the next ``{`` or ``,`` (whichever comes first), or at the end
    of the line when neither occurs. Nothing is stripped from the result.

    Raises:
        IndexError: If ``line`` contains no ``{`` at all.
    """
    # Segment between the first and the second "{" (or the end of the line).
    after_first_brace = line.split("{")[1]
    # Keep only what precedes the first comma, if there is one.
    key, _, _ = after_first_brace.partition(",")
    return key


In [None]:
from typing import Literal, NamedTuple, Tuple


class Bibkey(NamedTuple):
    """Structured form of a bibkey, as built by ``parse_bibkey``.

    A bibkey is split on ``:`` into an author segment and a year segment; the
    fields below hold the pieces of those two segments.
    """

    # Author segment before the "-" (the whole author segment when it has no "-").
    first_author: str
    # Author segment after the "-"; None when the author segment has no "-".
    other_authors: str | None
    # Leading digits of the year segment as an int. When the segment has no
    # leading digits, its raw text is stored instead (e.g. "unpub", "forthcoming").
    year: int | Literal["unpub", "forthcoming"] | str
    # Text following the leading digits of the year segment; None when nothing
    # follows them or when the whole segment was stored in `year`.
    year_suffix: str | None

class BibkeyError(NamedTuple):
    """Record of a bibkey that ``parse_bibkey`` rejected."""

    # The bibkey text that failed to parse.
    text: str
    # Value found for the bibkey in the position mapping given to parse_bibkey;
    # __str__ reports it as a line number.
    position: int
    # Message of the ValueError that caused the rejection.
    error: str

    def __str__(self) -> str:
        """Format the error on one line: offending text, line number, message."""
        return f"ERROR: '{self.text}' at line {self.position} --- '{self.error}'"


def parse_bibkey(text: str, text_position_d: dict[str, int]) -> Bibkey | BibkeyError:
    """
    Parse a bibkey shaped like ``author[-others]:year[suffix]``.

    The key must contain exactly one ``:``. The part before it may contain at
    most one ``-`` separating the first author from the other authors. The part
    after it is split into its leading digits (the year) and whatever follows
    (the year suffix). When there are no leading digits, the whole part is used
    as the year (e.g. ``"unpub"``, ``"forthcoming"``) and the suffix is None.

    Args:
        text: The bibkey to parse.
        text_position_d: Maps bibkeys to their position (line number). It is
            only consulted when parsing fails, to fill ``BibkeyError.position``.

    Returns:
        A ``Bibkey`` on success, otherwise a ``BibkeyError`` carrying the
        message of the ``ValueError`` that rejected the key.

    Raises:
        KeyError: If parsing fails and ``text`` is not a key of ``text_position_d``.

    Side effects:
        Prints a ``WARNING`` line to stdout when the year suffix is longer than
        one character, is not ``"unpub"``/``"forthcoming"``, and the characters
        after its first one are not all digits. The key is still accepted.
    """
    try:
        parts = text.split(":")
        if len(parts) != 2:
            raise ValueError(f"Unexpected number of bibkey parts for '{text}': '{parts}'")

        author_parts = parts[0].split("-")
        year_parts = parts[1]

        if len(author_parts) == 1:
            first_author = author_parts[0]
            other_authors = None
        elif len(author_parts) == 2:
            first_author = author_parts[0]
            other_authors = author_parts[1]
        else:
            raise ValueError(f"Unexpected bibkey author parts for '{text}': '{author_parts}'")

        # Count the run of digits at the start of the year segment.
        digit_count = 0
        for char in year_parts:
            if not char.isdigit():
                break
            digit_count += 1

        # int() may itself raise ValueError for digit characters it cannot
        # convert; that is reported as a BibkeyError by the handler below.
        year: int | str | None = int(year_parts[:digit_count]) if digit_count else None

        # An empty remainder means "no suffix". Normalising it to None here also
        # makes an entirely empty year segment (e.g. "smith:") reach the error
        # below instead of being accepted with an empty-string year.
        year_suffix: str | None = year_parts[digit_count:] or None

        if year is None and year_suffix is None:
            raise ValueError(f"Could not parse year for '{text}': '{year_parts}'")

        if year_suffix is not None and year_suffix not in ("unpub", "forthcoming") and len(year_suffix) > 1:
            # Multi-character suffixes are expected to be one character followed
            # by digits only; anything else is reported but not rejected.
            suffix_tail = year_suffix[1:]
            if not suffix_tail.isdigit():
                print(f"WARNING: Unexpected year suffix for '{text}': '{year_suffix}'")

        if year is None:
            # No leading digits: the whole segment stands in for the year.
            year = year_suffix
            year_suffix = None

        return Bibkey(first_author, other_authors, year, year_suffix)

    except ValueError as e:
        return BibkeyError(text, text_position_d[text], str(e))

In [4]:
# Single pass over the bibliography file that builds both:
#   bibkey_linenum_d -- bibkey -> 1-based line number it was found on
#                       (if a key occurs on several lines, the last one wins)
#   big_bib_len      -- total number of lines in the file (0 for an empty file)
bibkey_linenum_d: dict[str, int] = {}
big_bib_len = 0
with open(big_bib, "r") as f:
    for line_number, line in enumerate(f, start=1):
        bibkey_linenum_d[extract_bibkey(line)] = line_number
        big_bib_len = line_number


In [5]:
# Extract one bibkey per line of the bibliography file, in file order.
with open(big_bib, "r") as f:
    all_bibkeys = tuple(map(extract_bibkey, f))

# Sanity check: as many bibkeys as lines counted earlier.
all_bibkeys_len = len(all_bibkeys)
assert all_bibkeys_len == big_bib_len, f"Expected {big_bib_len} bibkeys, found {all_bibkeys_len}"

# Sanity check: no bibkey appears on more than one line.
all_bibkeys_uniq_len = len(set(all_bibkeys))
assert all_bibkeys_len == all_bibkeys_uniq_len, f"Duplicate bibkeys found: {all_bibkeys_len} total, {all_bibkeys_uniq_len} unique"

# Parse every bibkey, then keep the failures for the report below.
parse_result = tuple(parse_bibkey(bk, bibkey_linenum_d) for bk in all_bibkeys)
error_results = tuple(filter(lambda result: isinstance(result, BibkeyError), parse_result))

if len(error_results) > 0:
    print(f"\n\nFOUND {len(error_results)} ERRORS!!!!\n\n")
    for error_result in error_results:
        print(error_result)



FOUND 3 ERRORS!!!!


ERROR: 'adorno_tw:kracauer:2020' at line 1155 --- 'Unexpected number of bibkey parts for 'adorno_tw:kracauer:2020': '['adorno_tw', 'kracauer', '2020']''
ERROR: 'lehrer_pm:etal:1986' at line 110196 --- 'Unexpected number of bibkey parts for 'lehrer_pm:etal:1986': '['lehrer_pm', 'etal', '1986']''
ERROR: 'open-science-collaboration:2015' at line 141089 --- 'Unexpected bibkey author parts for 'open-science-collaboration:2015': '['open', 'science', 'collaboration']''
