In [None]:
import mistune

In [112]:
dltc_html_file = '../../data/dltc-workhouse/2020/2020-01-issue/02-conitzer/conitzer_v-2020.md'

In [113]:
def read_markdown_file(file_path: str) -> str:
    with open(file_path, 'r') as f:
        md = f.read()
    return md

In [114]:
md = read_markdown_file(dltc_html_file)
print(md[:4000])

---
doi: 10.48106/dial.v74.i1.02
first-page: 3
last-page: 31
title: "The Personalized A-Theory of Time and Perspective"
author:
- name: Vincent Conitzer
  email: conitzer@cs.duke.edu
  correspondence: true  
  institute: Duke University
  ORCID: 0000-0003-1899-7884
thanks: |
  I am thankful to anonymous referees who provided especially thorough and
  helpful comments, which significantly improved the paper.
output:
  pdf_document: default
  html_document:
    df_print: paged
  word_document: default
editor_options:
  markdown:
    wrap: 72
#bibliography: conitzer_v-2020.bib
bibliography: ../../../dialectica.bib
keywords: 
  - metaphysics
  - philosophy of time
  - philosophy of self
abstract: |
  A-theorists and B-theorists debate whether the "Now" is metaphysically
  distinguished from other time slices. Analogously, one may ask whether
  the "I" is metaphysically distinguished from other perspectives. Few
  philosophers would answer the second question in the affirmative. An
  except

In [115]:
parser = mistune.create_markdown(renderer="ast")

In [101]:
markdown_text = r"""---
doi: 10.48106/dial.v74.i1.02
first-page: 3
last-page: 31
title: "The Personalized A-Theory of Time and Perspective"
author:
- name: Vincent Conitzer
  email: conitzer@cs.duke.edu
  correspondence: true  
  institute: Duke University
  ORCID: 0000-0003-1899-7884
thanks: |
  I am thankful to anonymous referees who provided especially thorough and
  helpful comments, which significantly improved the paper.
output:
  pdf_document: default
  html_document:
    df_print: paged
  word_document: default
editor_options:
  markdown:
    wrap: 72
#bibliography: conitzer_v-2020.bib
bibliography: ../../../dialectica.bib
keywords: 
...
This is a regular citation [@doe99; see @smith2000, pp. 33-35; -@johnson2018, sec. 4].
Another complex citation @{https://example.com/bib?name=foobar&date=2000}, with a URL.
In-text citation without brackets: @smith2004 says blah.
And another with suffix [-@brown2021, pp. 5-10].

And simple multiple [@doe99; @smith2000] or [@doe99; -@smith2000] citations.

# Title

This is a regular citation [@doe99; see @smith2000, pp. 33-35; -@johnson2018, sec. 4].

## Subtitle

Some more text with references [@smith2004; -@doe99].

The structure of the paper is as follows. @Sec:one sketches the
linguistic objectiozn against easy arguments. @Sec:two presents
Barlew's [-@barlew_j:2017] argument to the effect that number sentences
function as identity sentences rather than as specificational sentences
in easy argument contexts, in contrast to what opponents of easy
arguments have claimed. @Sec:three argues that Barlew's argument
fails and, thus, that it is warranted to object to easy arguments on
linguistic grounds.

# A Rebuttal of Easy Arguments {#sec:one}

Paradigmatic easy arguments start from a fairly uncontroversial
assumption that does not say anything about numbers. For instance, it is
commonly assumed that Mars has two moons and, thus, that sentence [@one] is
true:

* [1]{#one} Mars has two moons.

If sentence [@one] is true, then sentence [@two] is true as well:

* [2]{#two} The number of moons of Mars is two.

But, so the argument goes, sentence [@two] is true only if numbers exist.
Hence, numbers exist!

# References

"""


In [122]:
import re


yaml_block_pattern = pattern = r"^---\n.*?\n\.\.\.\n"

md_wo_yaml = re.sub(pattern, "", md, flags=re.DOTALL | re.MULTILINE)

In [123]:
tokens = parser.parse(md_wo_yaml)

In [124]:
from pprint import pprint


pprint(tokens[0])

[{'type': 'thematic_break'},
 {'children': [{'raw': 'doi: 10.48106/dial.v74.i1.02', 'type': 'text'},
               {'type': 'softbreak'},
               {'raw': 'first-page: 3', 'type': 'text'},
               {'type': 'softbreak'},
               {'raw': 'last-page: 31', 'type': 'text'},
               {'type': 'softbreak'},
               {'raw': 'title: "The Personalized A-Theory of Time and '
                       'Perspective"',
                'type': 'text'},
               {'type': 'softbreak'},
               {'raw': 'author:', 'type': 'text'},
               {'type': 'softbreak'},
               {'raw': 'thanks: |', 'type': 'text'}],
  'type': 'paragraph'},
 {'attrs': {'depth': 0, 'ordered': False},
  'bullet': '-',
  'children': [{'children': [{'children': [{'raw': 'name: Vincent Conitzer',
                                            'type': 'text'},
                                           {'type': 'softbreak'},
                                           {'raw': 'email:

In [125]:
children_nested = [token.get('children') for token in tokens[0] if token.get('children') is not None]
children = [item for sublist in children_nested for item in sublist]
text_bits = [c.get('raw') for c in children if c.get('raw') is not None]


In [126]:
for t in text_bits:
    print(t)


doi: 10.48106/dial.v74.i1.02
first-page: 3
last-page: 31
title: "The Personalized A-Theory of Time and Perspective"
author:
thanks: |
I am thankful to anonymous referees who provided especially thorough and
helpful comments, which significantly improved the paper.
output:
pdf_document: default
html_document:
df_print: paged
word_document: default
editor_options:
markdown:
wrap: 72
#bibliography: conitzer_v-2020.bib
bibliography: ../../../dialectica.bib
keywords:
the "I" is metaphysically distinguished from other perspectives. Few
philosophers would answer the second question in the affirmative. An
exception is Caspar Hare, who has devoted two papers and a book to
arguing for such a positive answer. In this paper, I argue that those
who answer the first question in the affirmative---A-theorists---
should also answer the second question in the affirmative. This is
because key arguments in favor of the A-theory are more effective as
arguments in favor of the resulting combined position, a

In [127]:
import re


citation_pattern = re.compile(
    r'(?<!\w)\[?([-]?)@{?([a-zA-Z0-9_.:$/%&+?<>~#-]+)}?(?:,?\s*(pp?\.\s[^\];]+|sec\.\s[^\];]+|chap\.\s[^\];]+)?)?(?:,\s*([^\];]+))?\]?'
)

In [131]:
citations = []

# Iterate through tokens to find citations in text content
for t in text_bits:
    matches = citation_pattern.findall(t)
    for match in matches:
        # Structure: (author_suppression, bibkey, locator, suffix)
        citation_info = {
            "suppress_author": match[0] == "-",
            "bibkey": match[1],
            "locator": match[2] or None,
            "suffix": match[3] or None,
        }
        citations.append(citation_info)

all_keys = {c.get('bibkey') for c in citations}

non_keys = {n for n in all_keys if not ":" in n or n.startswith("sec")}

apparent_keys = all_keys - non_keys

# bibkeys = {bk for bk in apparent_keys if bk in bibliography_bibkeys}

pprint(non_keys)

# And then filter out the 'bibkeys' that are not in the bibliography's bibkeys
# Segregate those who are filtered out to analyze them later

{'s-1',
 's-2',
 'sec:appropriateness',
 'sec:direction',
 'sec:presence',
 'sec:rate',
 'sec:relativity',
 'sec:revisiting',
 'sec:revisiting2',
 'sec:travel',
 'sec:versions'}


In [132]:
pprint(apparent_keys)

{'ashby_n:2002',
 'balashov:2005b',
 'bergmann_m:1999',
 'butterfield_j:1984a',
 'cameron_rp:2015',
 'caruso_em-etal:2008',
 'chalmers_dj:2010',
 'conitzer:2015a',
 'conitzer:2019',
 'deasy:2017',
 'dieks:2006b',
 'dorr_c-goodman:2020.',
 'fine_k:2005e',
 'fine_k:2006b',
 'greene_p-sullivan:2015',
 'hare_c:2007',
 'hare_c:2009',
 'hare_c:2010a',
 'hellie:2013',
 'hurka:1993',
 'johnston_ma:2010',
 'lewis_dk:1976a',
 'liao_sy:2012',
 'lipman_m:2015',
 'markosian:2004',
 'maudlin:2002b',
 'mctaggart:1908',
 'merlo_g:2016',
 'olson_et:2009a',
 'parfit:1984',
 'price_h:2011',
 'prior_an-fine:1977',
 'prior_an:1959',
 'putnam_h:1967c',
 'reichenbach_h:1928',
 'reichenbach_h:1958',
 'sider_t:2005',
 'skow:2009a',
 'skow:2011',
 'skow:2012a.',
 'smith_njj:2011',
 'stein_h:1968',
 'suhler_c-callender:2012',
 'sullivan_m:2018',
 'turri:2013d',
 'valberg_jj:2007',
 'valberg_jj:2013',
 'williams_dc:1951a',
 'zimmerman_dw:2005',
 'zimmerman_dw:2007b',
 'zimmerman_dw:2011'}
