# Publications markdown generator for academicpages

Takes a BibTeX file of publications and converts its entries for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).

The core python code is also in `pubsFromBibs.py`. 
Run either from the `markdown_generator` folder after updating the publist dictionary with:
* bib file names
* specific venue keys based on your bib file preferences
* any specific pre-text for specific files
* Collection Name (future feature)

TODO: Make this work with other databases of citations.
TODO: Merge this with the existing TSV parsing solution

In [1]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex
import pybtex
import time
import html
import os
import re

In [2]:
# Notebook configuration.
# Bibliography parsed from 'me.bib'; the path is relative to the working directory.
pubs = pybtex.database.parse_file('me.bib')
# Name that gets wrapped in <b>...</b> inside each generated author list.
my_name = 'Thomas Louf'
# Citation-style form of the same name; not referenced by the cells shown here.
my_cit_name = 'Louf, T.'

In [3]:
# Characters rewritten by html_escape, each mapped to the entity written in
# its place. '<' and '>' are not in the table, so markup passes through as-is.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def html_escape(text):
    """Return ``text`` with '&' and both quote characters turned into entities."""
    def entity_or_char(char):
        # Characters without a table entry are copied through unchanged.
        return html_escape_table[char] if char in html_escape_table else char

    return "".join(map(entity_or_char, text))

In [4]:
from pybtex.plugin import find_plugin
from pybtex.database import parse_string
from pybtex.backends import markdown
# Shared pybtex objects, created once and reused by the helpers below.
# Instance of the formatting style registered under the name 'apa'.
APA = find_plugin('pybtex.style.formatting', 'apa')()
# Instance of the backend registered under the name 'html'; used by text_to_HTML.
HTML = find_plugin('pybtex.backends', 'html')()
# Markdown backend; only the commented-out bib2md helper refers to it.
MD = markdown.Backend()
def bib2html(bibliography, exclude_fields=None):
    """Format each entry of ``bibliography`` with the APA style as HTML.

    ``exclude_fields`` optionally lists field names to leave out of the
    formatted output. When it is non-empty the bibliography is serialised to
    BibTeX and parsed again, and the fields are removed from that re-parsed
    copy rather than from the object passed in.

    Returns a list of HTML strings, one per formatted entry, in the order the
    style yields them.
    """
    if exclude_fields:
        bibliography = parse_string(bibliography.to_string('bibtex'), 'bibtex')
        for bib_entry in bibliography.entries.values():
            # Values are dropped straight from the mapping's private ``_dict``
            # store; names that are not present are ignored.
            stored_values = vars(bib_entry.fields)['_dict']
            for field_name in exclude_fields:
                stored_values.pop(field_name, None)
    return [
        text_to_HTML(formatted.text)
        for formatted in APA.format_bibliography(bibliography)
    ]

def text_to_HTML(text):
    """Render ``text`` with the HTML backend and strip protected-text wrappers.

    Every '<span class="bibtex-protected">' opening tag and every '</span>'
    closing tag is removed from the rendered string.
    """
    rendered = text.render(HTML)
    for wrapper_tag in ('<span class="bibtex-protected">', '</span>'):
        rendered = rendered.replace(wrapper_tag, '')
    return rendered
# def bib2md(bibliography, exclude_fields=None):
#     exclude_fields = exclude_fields or []
#     if exclude_fields:
#         bibliography = parse_string(bibliography.to_string('bibtex'), 'bibtex')
#         for entry in bibliography.entries.values():
#             for ef in exclude_fields:
#                 if ef in entry.fields.__dict__['_dict']:
#                     del entry.fields.__dict__['_dict'][ef]
#     formattedBib = APA.format_bibliography(bibliography)
#     return [entry.text.render(MD).replace('<span class="bibtex-protected">', '').replace('</span>', '') for entry in formattedBib]

In [22]:
# Write one Jekyll markdown file per bibliography entry into ../_publications/.
# Each file holds a YAML front matter block followed by the entry's abstract.

# Entry types this cell can place, mapped to (field naming the venue, text put
# in front of the venue). Entries of any other type are skipped with a warning.
venue_by_type = {
    'inproceedings': ('booktitle', ''),  # pretext could be 'In the proceedings of '
    'article': ('journal', ''),
}
# Fields left out of the formatted citation.
excluded_citation_fields = ['month', 'url']
# Bracketed spans and characters that are not URL-safe, removed from the slug.
slug_junk = re.compile(r"\[.*\]|[^a-zA-Z0-9_-]")
# Removes the entities html_escape produces (&amp; &quot; &apos;) so the
# citation fits a single-quoted YAML value.
# NOTE(review): this also deletes 3-5 lowercase letters that directly precede
# a ';' in ordinary text (e.g. "methods;" loses "thods;") - confirm intent.
entity_residue = re.compile(r'[a-z&]{3,5};')


def _strip_bibtex_markup(text):
    """Remove the braces and backslashes BibTeX uses to protect formatting."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


def _parse_month(raw_month):
    """Return ``(two_digit_number, full_name)`` for a BibTeX month value.

    A purely numeric value is zero-padded; anything else is matched on its
    first three characters against the abbreviated month names known to
    ``time.strptime``. Raises ValueError when the value is not a month.
    """
    if raw_month.isdigit():
        month_index = int(raw_month)
    else:
        month_index = time.strptime(raw_month[:3], '%b').tm_mon
    month_number = "{:02d}".format(month_index)
    return month_number, time.strftime('%B', time.strptime(month_number, '%m'))


def _author_list_html(persons):
    """Return the authors as comma-separated HTML with ``my_name`` in bold."""
    # Joining only the name parts that exist avoids the double space an empty
    # middle name would leave behind, which would stop ``my_name`` matching.
    names = [
        ' '.join([*person.first_names, *person.middle_names,
                  *person.prelast_names, *person.last_names,
                  *person.lineage_names])
        for person in persons
    ]
    names_html = text_to_HTML(pybtex.richtext.Text.from_latex(', '.join(names)))
    # replace() leaves the string untouched when the name is absent.
    names_html = names_html.replace(my_name, f'<b>{my_name}</b>', 1)
    # The value is written inside single quotes in the YAML front matter.
    return names_html.replace("'", "&apos;")


# Each entry is formatted on its own so that citations[i] is guaranteed to
# belong to the i-th entry of pubs.entries; formatting the whole bibliography
# in one call lets the style reorder the entries.
citations = [
    bib2html(
        pybtex.database.BibliographyData(entries=[(entry_key, entry)]),
        exclude_fields=excluded_citation_fields,
    )[0]
    for entry_key, entry in pubs.entries.items()
]

for i, (bib_id, p) in enumerate(pubs.entries.items()):
    # Read up front so the warnings below never use an unset or stale title.
    pub_title = p.fields.get('title', '')

    if p.type not in venue_by_type:
        print(f'WARNING Unsupported entry type {p.type} for entry {bib_id}: \"', pub_title[:30], "..."*(len(pub_title) > 30), "\"")
        continue

    try:
        pub_year = p.fields['year']

        # Normalise the month to a two-digit number plus its full name.
        pub_month, full_month = None, ''
        raw_month = (p.fields.get('month') or '').strip()
        if raw_month:
            try:
                pub_month, full_month = _parse_month(raw_month)
            except ValueError:
                print(f'WARNING Unrecognised month {raw_month!r} in entry {bib_id}: \"', pub_title[:30], "..."*(len(pub_title) > 30), "\"")
                continue
        pub_day = p.fields.get('day')

        pub_date = '-'.join(part for part in (pub_year, pub_month, pub_day) if part is not None)
        if pub_month is None:
            pretty_date = pub_year
        elif pub_day is None:
            pretty_date = f'{full_month}, {pub_year}'
        else:
            pretty_date = f'{full_month} {pub_day}, {pub_year}'

        # Raises KeyError, handled below, when the entry has no title.
        pub_title = p.fields['title']
        # strip out {} as needed (some bibtex entries that maintain formatting)
        clean_title = _strip_bibtex_markup(pub_title).replace(" ", "-")
        url_slug = slug_junk.sub("", clean_title).replace("--", "-")

        md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--", "-")
        html_filename = (str(pub_date) + "-" + url_slug).replace("--", "-")

        # Build Citation from text
        citation = entity_residue.sub('', html_escape(citations[i]))

        # Venue field and pretext depend on the entry type.
        venue_key, venue_pretext = venue_by_type[p.type]
        venue = venue_pretext + _strip_bibtex_markup(p.fields[venue_key])

        authors = _author_list_html(p.persons["author"])

        # YAML variables
        front_matter = [
            "---",
            "title: '" + html_escape(_strip_bibtex_markup(pub_title)) + "'",
            "collection: 'publications'",
            "permalink: /publication/" + html_filename,
            "authors: '" + authors + "'",
            "printdate: " + pretty_date,
            "venue: '" + html_escape(venue) + "'",
        ]
        url = p.fields.get('url')
        if url:
            front_matter.append("paperurl: '" + url + "'")
        front_matter.append("citation: '" + citation + "'")
        front_matter.append("type: '" + p.type + "'")
        front_matter.append("---")

        # Content
        abstract_html = text_to_HTML(pybtex.richtext.Text.from_latex(p.fields['abstract']))
        md = '\n'.join(front_matter + [abstract_html])
        md_filename = os.path.basename(md_filename)

        # Explicit encoding so the output does not depend on the platform default.
        with open("../_publications/" + md_filename, 'w', encoding='utf-8') as f:
            f.write(md)
        print(f'SUCESSFULLY PARSED {bib_id}: \"', pub_title[:60],"..."*(len(pub_title)>60),"\"")
    # field may not exist for a reference
    except KeyError as e:
        print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', pub_title[:30],"..."*(len(pub_title)>30),"\"")
        continue

SUCESSFULLY PARSED Ganic2020: " {Dynamic noise maps for Ljubljana airport}  "
