In [224]:
import mistune
import re
from typing import List
from pprint import pprint

def read_file(file_path: str) -> str:
    with open(file_path, 'r', encoding='utf-8') as f:
        md = f.read()
    return md


def remove_yaml_front_matter(content: str) -> str:
    if content.startswith('---'):
        # Split YAML front matter from the rest of the content
        yaml_delimiter = content.find('\n---', 3)  # Look for the closing '---'
        if yaml_delimiter != -1:
            return content[yaml_delimiter + 3:].lstrip()  # Strip YAML block
        elif yaml_delimiter := content.find('\n...', 3):
            if yaml_delimiter != -1:
                return content[yaml_delimiter + 3:].lstrip()
    return content

In [225]:
dltc_md = '../../data/dltc-workhouse/2020/2020-01-issue/02-conitzer/conitzer-2020.md'
md = read_file(dltc_md)
#print(repr(md[:4000]))
md_wo_yaml = remove_yaml_front_matter(md)

#print(md[:4000])
#print(md_wo_yaml[:4000])

In [226]:
def get_text_bits(md_content: str, parser: mistune.Markdown) -> tuple[str]:

    tokens = parser.parse(md_content)

    children_nested = (token.get('children') for token in tokens[0] if token.get('children') is not None)

    children = (item for sublist in children_nested for item in sublist)

    text_bits = tuple(c.get('raw') for c in children if c.get('raw') is not None)

    return text_bits


In [227]:
parser = mistune.create_markdown(renderer="ast")

text_bits = get_text_bits(md_wo_yaml, parser)

for t in text_bits[:10]:
    print(t)


In a series of unconventional but lucid works, Caspar Hare has laid out
and defended a theory of 
 (or, in his more recent
work, 
), in which a distinguished individual's
experiences are 
 in a way that the experiences of others are
not 
[
-@hare_c:2007; -@hare_c:2009; -@hare_c:2010a]. Closely related ideas


In [228]:
citation_pattern = re.compile(
    r'(?<!\w)\[?([-]?)@{?([a-zA-Z0-9_.:$/%&+?<>~#-]+)}?(?:,?\s*(pp?\.\s[^\];]+|sec\.\s[^\];]+|chap\.\s[^\];]+)?)?(?:,\s*([^\];]+))?\]?'
)

In [None]:
def get_citations(text_bits: List[str], citation_pattern: re.Pattern) -> List[dict]:
    citations = []

    # Iterate through tokens to find citations in text content
    for t in text_bits:
        matches = citation_pattern.findall(t)
        for match in matches:
            # Structure: (author_suppression, bibkey, locator, suffix)
            citation_info = {
                "suppress_author": match[0] == "-",
                "bibkey": match[1],
                "locator": match[2] or None,
                "suffix": match[3] or None,
            }
            citations.append(citation_info)

    return citations

citations = get_citations(text_bits)


def get_keys(citations: List[dict]) -> tuple[set[str], set[str], set[str]]:

    all_keys = {c.get('bibkey') for c in citations}

    non_keys = {n for n in all_keys if not ":" in n or n.startswith("sec")}

    apparent_keys = all_keys - non_keys

    return all_keys, non_keys, apparent_keys


def biblio_keys(keys: set[str], all_bibkeys: tuple[str]) -> tuple[set[str], set[str]]:

    bibkeys = {bk for bk in keys if bk in all_bibkeys}

    non_bibkeys = keys - bibkeys

    return bibkeys, non_bibkeys


In [230]:
all_keys, non_keys, apparent_keys = get_keys(citations)

print("NON KEYS\n")
pprint(non_keys)
print("\n\nAPPARENT KEYS\n")
pprint(list(apparent_keys)[:10])

NON KEYS

{'s-1',
 's-2',
 'sec:appropriateness',
 'sec:direction',
 'sec:presence',
 'sec:rate',
 'sec:relativity',
 'sec:revisiting',
 'sec:revisiting2',
 'sec:travel',
 'sec:versions'}


APPARENT KEYS

['hare_c:2007',
 'valberg_jj:2007',
 'prior_an:1959',
 'zimmerman_dw:2007b',
 'skow:2012a.',
 'putnam_h:1967c',
 'olson_et:2009a',
 'suhler_c-callender:2012',
 'skow:2009',
 'turri:2013d']


In [231]:
all_bibkeys = ("hare_c:2007", "skow:2009")

bibkeys, non_bibkeys = biblio_keys(apparent_keys, all_bibkeys)

print("\n\nBIBKEYS\n")
pprint(bibkeys)
print("\n\nNON BIBKEYS\n")
pprint(list(non_bibkeys)[:10])



BIBKEYS

{'skow:2009', 'hare_c:2007'}


NON BIBKEYS

['valberg_jj:2007',
 'prior_an:1959',
 'zimmerman_dw:2007b',
 'skow:2012a.',
 'putnam_h:1967c',
 'olson_et:2009a',
 'suhler_c-callender:2012',
 'caruso_em-etal:2008',
 'turri:2013d',
 'fine_k:2005e']


In [None]:
d = {
    x: "a"
    for x in range(10)
}

TypeError: unhashable type: 'dict'

In [235]:
from src.sdk.utils import remove_extra_whitespace
from pathlib import Path

def gen_dltc_filename(
    path: Path
) -> str:
    stem = path.stem
    stripped = remove_extra_whitespace(stem)

    # Replace the last occurrence of "-" with ":"
    last_hyphen = stripped.rfind("-")

    if last_hyphen != -1:
        return f"{stripped[:last_hyphen]}:{stripped[last_hyphen + 1:]}"
    else:
        return stripped


In [236]:
root_path = Path("../../data/dltc-workhouse")
all_mds = list(root_path.rglob("*.md"))



In [238]:
dltc_filenames = {gen_dltc_filename(p) for p in all_mds}

In [242]:
found = ('licon-2023', 'contu-2023', 'nolan_dp-2023', 'ketland-2023', 'guengoer-2023', 'smit_jp-buekens-2023', 'spohn-2023', 'miller_k-2023', 'rooney_jd-2023', 'zhang_l-horsten-2023', 'simsek_m-2023', 'oconaill_d-pearson-2023', 'billon_a-2023', 'kappes_y-2023', 'janssenlauret-2023', 'leuenberger_s-2023', 'patterson_a-2023', 'pagano_e-2023', 'galli_s-2023', 'humbertdroz-2023', 'baehni_a-2023', 'lanzet_r-2023', 'simon_jon-2023', 'deflorio-frigerio-2023', 'nicolas_d-2021', 'weintraub_r-2021', 'schneider_le-2021', 'laasik-2021', 'vanrie-vandyck-2021', 'burnston-2021', 'caputo_s-2021', 'andow_j-2021', 'berghofer_p-2021', 'dees_mk-2021', 'paterson_n-2021', 'mueller_jm-2021', 'francescotti-2021', 'milona-2021', 'krstic-2021', 'cohnitz-nicolai-2021', 'coates_as-2021', 'melchior-2021', 'warren_ja-2021', 'ronnowrasmussen-2021a', 'gessell_b-2021', 'sekatskaya-schurz-2021', 'pismenny-2021', 'schwabe_u-2021', 'muehlebach_d-2021', 'nicolas_d-2021', 'djordjevic-2020', 'reinmuth_f-2020', 'dicher-2020c', 'benyami-2020', 'cook_rt-2020', 'kim_jo-2020', 'michels_r-2020', 'sagi_gi-2020', 'liberman_a-2020', 'donaldson_t-2020', 'stevens_g-2020', 'cox_r-2020', 'finocchiaro_p-2020', 'macbride-2020', 'blum_ph-2020', 'roelofs_l-2020', 'schaerli_m-2020', 'elzein-2020', 'taieb_h-2020', 'goh_hxj-2020', 'oconaill_d-2020', 'conitzer-2020', 'daoust_mk-2020a', 'wilson_al-2020b', 'daly_ch-2020', 'baratella_r-2020', 'zimmerman_dw-2020', 'dixon_j-2020', 'muller_fa-2020', 'campdelacreu-etal-2020', 'stephenson_a-2020', 'morvarid_m-2020', 'vollet_jh-2020', 'felka-2022', 'andreas-guenther-2022', 'stemeroff-2022', 'billon_a-vellozzo-2022', 'solomyak-2022', 'bernath_l-paar-2022', 'schuman_bf-2022', 'irikefe-2022', 'winokur-2022', 'marshall_d-2022', 'sheng_ee-2022', 'fischer_s-2022', 'mizrahi_v-2022', 'busse_r-2022', 'dixon_s-2022', 'zalta-2022', 'orilia-2022', 'plate_j-2022', 'macbride-2022', 'macbride-orilia-2022', 'leo_j-2022')



In [246]:
sorted(found)

['andow_j-2021',
 'andreas-guenther-2022',
 'baehni_a-2023',
 'baratella_r-2020',
 'benyami-2020',
 'berghofer_p-2021',
 'bernath_l-paar-2022',
 'billon_a-2023',
 'billon_a-vellozzo-2022',
 'blum_ph-2020',
 'burnston-2021',
 'busse_r-2022',
 'campdelacreu-etal-2020',
 'caputo_s-2021',
 'coates_as-2021',
 'cohnitz-nicolai-2021',
 'conitzer-2020',
 'contu-2023',
 'cook_rt-2020',
 'cox_r-2020',
 'daly_ch-2020',
 'daoust_mk-2020a',
 'dees_mk-2021',
 'deflorio-frigerio-2023',
 'dicher-2020c',
 'dixon_j-2020',
 'dixon_s-2022',
 'djordjevic-2020',
 'donaldson_t-2020',
 'elzein-2020',
 'felka-2022',
 'finocchiaro_p-2020',
 'fischer_s-2022',
 'francescotti-2021',
 'galli_s-2023',
 'gessell_b-2021',
 'goh_hxj-2020',
 'guengoer-2023',
 'humbertdroz-2023',
 'irikefe-2022',
 'janssenlauret-2023',
 'kappes_y-2023',
 'ketland-2023',
 'kim_jo-2020',
 'krstic-2021',
 'laasik-2021',
 'lanzet_r-2023',
 'leo_j-2022',
 'leuenberger_s-2023',
 'liberman_a-2020',
 'licon-2023',
 'macbride-2020',
 'macbride-20