In [20]:
import os
import re
import shutil

# Working directory of the running kernel; the notebook assumes this is its own folder
notebook_dir = os.getcwd()

# Parent of the working directory -- every content path below is resolved against it
script_dir = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Folder that holds the source bib files
bib_dir = os.path.join(script_dir, 'content', 'publication', 'bib_files')

# Regex for one bib entry. Capture groups:
#   1 = title, 2 = author field up to its first comma, 3 = remainder of the author field, 4 = year
# It only matches "@article{cite_<digits>," entries whose title, author, journal and year
# fields follow each other in exactly that order, each written as "name = {value},".
pattern = r'@article{cite_\d+,\s*title = {([^}]+)},\s*author = {([^,]+),\s*([^}]+)},\s*journal = {[^}]+},\s*year = {(\d+)},'

def clean_folder_name(name):
    """Return ``name`` with everything except ASCII letters and digits turned into underscores.

    A run of whitespace collapses into a single underscore; every other
    character outside ``[a-zA-Z0-9]`` (single quotes included) becomes one
    underscore of its own. Letter case is left untouched.
    """
    return re.sub(r'\s+|\'|[^a-zA-Z0-9]', '_', name)

# Copy every recognised bib file into its own publication folder as cite.bib.
# Bookkeeping for this run, so that nothing is dropped or overwritten silently:
claimed_folders = {}  # folder name -> bib file that filled it
skipped_bibs = []     # bib files for which no folder name could be built

# Sorted because os.listdir returns names in arbitrary order; a fixed order makes
# the outcome of a folder-name collision reproducible.
for filename in sorted(os.listdir(bib_dir)):
    if not (filename.endswith('.bib') and filename.startswith('cite_')):
        continue
    bib_path = os.path.join(bib_dir, filename)

    # Read the contents of the bib file
    with open(bib_path, 'r') as file:
        bib_content = file.read()

    # Extract the necessary details from the bib file using regex
    match = re.search(pattern, bib_content)
    if not match:
        skipped_bibs.append(filename)
        continue

    title = match.group(1)
    author_last_name = match.group(2)
    year = match.group(4)

    # The title group may consist of whitespace only, in which case there is no first word
    title_words = title.split()
    if not title_words:
        skipped_bibs.append(filename)
        continue

    # Folder name: <year>_<first author's surname>_<first word of the title>
    folder_name = f'{year}_{clean_folder_name(author_last_name)}_{clean_folder_name(title_words[0])}'

    # Two different bib files can map to the same folder name (same year, surname and
    # first title word); keep the first one instead of overwriting its cite.bib.
    if folder_name in claimed_folders:
        print(f"Skipping {filename}: folder '{folder_name}' was already filled from {claimed_folders[folder_name]}")
        continue
    claimed_folders[folder_name] = filename

    # Create the folder in the publications directory
    folder_path = os.path.join(script_dir, 'content', 'publication', folder_name)
    os.makedirs(folder_path, exist_ok=True)

    # Copy and rename the bib file to the newly created folder
    shutil.copy(bib_path, os.path.join(folder_path, 'cite.bib'))

if skipped_bibs:
    print("No folder created (entry does not match the expected layout): " + ", ".join(skipped_bibs))


In [20]:
import os
import re
import shutil

# Working directory of the running kernel; the notebook assumes this is its own folder
notebook_dir = os.getcwd()

# Navigate one folder above
script_dir = os.path.abspath(os.path.join(notebook_dir, '..'))

# Define the path to the directory containing the publication folders
publication_dir = os.path.join(script_dir, 'content', 'publication')

# Regular expressions that pull one field each out of a bib entry; group 1 is the value.
# Every field name is anchored with \b so that a longer field name that merely ends
# with it (booktitle / shorttitle for title, shortjournal for journal, ...) is not
# mistaken for the field itself.
# Limitation: a value is read up to the first closing brace, so a value that contains
# nested braces, e.g. {{COVID}-19 outcomes}, is captured only partially.
title_pattern = r'\btitle\s*=\s*{([^}]+)}'
author_pattern = r'\bauthor\s*=\s*{([^}]+)}'
abstract_pattern = r'\babstract\s*=\s*{([^}]+)}'
year_pattern = r'\byear\s*=\s*{(\d+)}'
journal_pattern = r'\bjournal\s*=\s*{([^}]+)}'
publisher_pattern = r'\bpublisher\s*=\s*{([^}]+)}'
doi_pattern = r'\bdoi\s*=\s*{([^}]+)}'

# Function to format the author names
def format_author_names(authors):
    """Turn bib-style author strings into display names.

    Each element of ``authors`` may hold one name or several names joined by
    ``' and '``. A name written as ``"Surname, First"`` becomes
    ``"First Surname"``; any other name is kept as written, minus surrounding
    whitespace. Finally the last entry of the result is overwritten with
    ``"et al."``.

    Args:
        authors: iterable of author strings.

    Returns:
        list of str with the formatted names; an empty list when ``authors``
        yields no names.
    """
    formatted_authors = []
    for author in authors:
        # One element may still contain several names joined by " and "
        for individual_author in author.split(' and '):
            # "Surname, First" -> ["Surname", "First"]
            name_parts = individual_author.split(', ')
            if len(name_parts) >= 2:
                surname = name_parts[0].strip()
                first_name = name_parts[1].strip()
                formatted_authors.append(f"{first_name} {surname}")
            else:
                # Not in "Surname, First" form: keep the text, trimmed like the branch above
                formatted_authors.append(individual_author.strip())

    # NOTE(review): the last entry is replaced unconditionally, so a real author is
    # dropped unless the list ends with a placeholder such as "others" -- confirm
    # that this is intended.
    # Guarded so that an empty author list returns [] instead of raising IndexError.
    if formatted_authors:
        formatted_authors[-1] = "et al."
    return formatted_authors

# Section labels that are highlighted when an abstract paragraph starts with "<label>:"
_SECTION_HEADINGS = (
    "Introduction", "Background", "Methods", "Materials and Methods", "Results", "Conclusion",
    "Objectives", "Objective", "Aims", "Purpose",
    "Experimental Design", "Study Design", "Data Collection",
    "Data Analysis", "Statistical Analysis", "Findings",
    "Discussion", "Implications", "Limitations", "Future Directions",
    "Conclusions",
)

# Case-insensitive "<label>:" at the very start of a paragraph; group 1 is the label
# together with any whitespace in front of it
_HEADING_RE = re.compile(r'^(\s*(?i:' + '|'.join(_SECTION_HEADINGS) + r')):')


def _bib_field(field_pattern, bib_content):
    """Return group 1 of the first ``field_pattern`` match in ``bib_content``, or None."""
    found = re.search(field_pattern, bib_content)
    return found.group(1) if found else None


def _yaml_single_quoted(value):
    """Return ``value`` as a one-line YAML single-quoted scalar."""
    # Inside '...' the only YAML escape is a doubled single quote. Whitespace runs
    # (including line breaks of a value wrapped over several bib lines) are collapsed
    # so the scalar stays on one line.
    return "'" + ' '.join(value.split()).replace("'", "''") + "'"


def _abstract_block_lines(abstract):
    """Return the lines that form the body of the ``abstract: |`` block scalar.

    Paragraphs are separated by blank lines in ``abstract``. A paragraph that
    starts with a known section label followed by a colon gets that label wrapped
    in ``**<ins>...</ins>**``. Double quotes are written as ``\\"``. Every line is
    indented by two spaces so the whole paragraph stays inside the block scalar,
    and an empty line separates consecutive paragraphs.
    """
    block = []
    paragraphs = abstract.split('\n\n')
    for position, paragraph in enumerate(paragraphs):
        heading_match = _HEADING_RE.match(paragraph)
        if heading_match:
            heading = heading_match.group(1).strip()
            # The match starts at index 0, so everything after it is the paragraph text
            content = paragraph[heading_match.end():].strip().replace('"', r'\"')
            text = f"**<ins>{heading}:</ins>** {content}"
        else:
            text = paragraph.strip().replace('"', r'\"')

        # Indent each physical line: an unindented continuation line would end the block scalar
        block.extend(("  " + line.strip()).rstrip() for line in text.split('\n'))

        if position < len(paragraphs) - 1:
            block.append("")
    return block


def create_index_md(bib_path, folder_path):
    """Write ``index.md`` into ``folder_path`` from the bib entry stored at ``bib_path``.

    Field values are extracted with the module-level ``*_pattern`` regular
    expressions and the author list is formatted with ``format_author_names``.
    The generated file consists of a ``---`` delimited metadata block only and
    replaces any existing ``index.md`` in the folder.

    An entry whose journal or publisher mentions arXiv or medRxiv is written as
    a preprint (category ``Preprints``, publication type 3, DOI taken from the
    bib entry when present). Everything else is written as a journal article
    (category ``journal-articles``, publication type 2, DOI left empty).
    Missing abstract, journal or DOI fields produce empty values.

    Args:
        bib_path: path of the bib file to read (read as UTF-8).
        folder_path: existing directory that receives ``index.md`` (written as UTF-8).

    Raises:
        ValueError: if the bib entry has no title, author or year field.
        OSError: if the bib file cannot be read or ``index.md`` cannot be written.
    """
    with open(bib_path, 'r', encoding='utf-8') as file:
        bib_content = file.read()

    title = _bib_field(title_pattern, bib_content)
    author_field = _bib_field(author_pattern, bib_content)
    abstract = _bib_field(abstract_pattern, bib_content)
    year = _bib_field(year_pattern, bib_content)
    journal = _bib_field(journal_pattern, bib_content)
    publisher = _bib_field(publisher_pattern, bib_content)
    doi = _bib_field(doi_pattern, bib_content)

    # Without these three no meaningful page can be produced; fail with a message
    # that names the file instead of an AttributeError on None
    required = (('title', title), ('author', author_field), ('year', year))
    missing = [name for name, value in required if value is None]
    if missing:
        raise ValueError(f"{bib_path}: missing required bib field(s): {', '.join(missing)}")

    # Preprint detection looks at both the journal and the publisher field
    venue_texts = [text.lower() for text in (journal, publisher) if text]
    if any("arxiv" in text for text in venue_texts):
        preprint_server = 'arXiv'
    elif any("medrxiv" in text for text in venue_texts):
        preprint_server = 'medRxiv'
    else:
        preprint_server = None

    # Collapse whitespace first so an author list wrapped over several lines
    # ("... and\n  Doe, Jane") is still split on " and "
    authors = [author.strip() for author in ' '.join(author_field.split()).split(' and ')]

    lines = [
        "---",
        f"title: {_yaml_single_quoted(title)}",
        "subtitle: ''",
        "summary: ''",
        "authors:",
    ]
    lines.extend(f"- {author}" for author in format_author_names(authors))
    lines += ["tags: []", "categories:"]

    if preprint_server:
        lines += [
            "- Preprints",
            f"publication: '{preprint_server}'",
            f"doi: {_yaml_single_quoted(doi or '')}",
        ]
    else:
        lines += [
            "- journal-articles",
            f"publication: {_yaml_single_quoted(journal or '')}",
            "doi: ''",  # Leave DOI empty for journal articles
        ]

    lines += [
        f"date: '{year}-01-01'",
        f"lastmod: {year}-01-01T00:00:00Z",
        "featured: false",
        "draft: false",
        "",
        "# Featured image",
        "# To use, add an image named `featured.jpg/png` to your page's folder.",
        "# Focal points: Smart, Center, TopLeft, Top, TopRight, Left, Right, BottomLeft, Bottom, BottomRight.",
        "image:",
        "  caption: ''",
        "  focal_point: 'Smart'",
        "  preview_only: false",
        "",
        "# Projects (optional).",
        "# Associate this post with one or more of your projects.",
        "# Simply enter your project's folder or file name without extension.",
        "# E.g. `projects = ['internal-project']` references `content/project/deep-learning/index.md`.",
        "# Otherwise, set `projects = []`.",
        "projects: []",
        "publication_types:",
        "- 3" if preprint_server else "- 2",
        f"publishDate: '{year}-01-01T00:00:00Z'",
    ]

    if abstract is None:
        lines.append("abstract: ''")
    else:
        lines.append("abstract: |")
        lines.extend(_abstract_block_lines(abstract))

    lines += [
        "links:",
        "  - name: Open Access PDF",
        "    url: ''",  # Leave the URL empty for Open Access PDF
        "---",
    ]

    # Write the index.md file (no newline after the closing "---")
    with open(os.path.join(folder_path, 'index.md'), 'w', encoding='utf-8') as file:
        file.write("\n".join(lines))

# Build index.md for every publication folder, i.e. every directory whose name starts
# with four digits and an underscore. An existing index.md in such a folder is rewritten.
# Sorted because os.listdir returns names in arbitrary order.
for folder_name in sorted(os.listdir(publication_dir)):
    folder_path = os.path.join(publication_dir, folder_name)
    if not (os.path.isdir(folder_path) and re.match(r'\d{4}_', folder_name)):
        continue

    # Get the path to the cite.bib file within the folder
    bib_path = os.path.join(folder_path, 'cite.bib')
    if not os.path.isfile(bib_path):
        # A matching folder without a bib file would otherwise abort the whole run
        print(f"Skipping {folder_name}: no cite.bib found")
        continue

    # Create the index.md file
    create_index_md(bib_path, folder_path)
