## Fellows

In [8]:
import re
import yaml
import json

import glob
import os
    
from tqdm import tqdm  

from pprint import pprint   

from pathlib import Path
data_dir ="../interim-bdc-website/src/data/"

fellow_dir = Path(data_dir + "fellows/").resolve()

def parse_fellow_file(file_path):
    """Parse a single fellow markdown file and return the data as a dictionary."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            # Remove YAML delimiters
            yaml_content = content.replace('---\n', '', 1)  # Remove first delimiter
            yaml_content = yaml_content.rsplit('---', 1)[0]  # Remove last delimiter
            
            fellow_data = yaml.safe_load(yaml_content)
            fellow_data["file_path"] = file_path
            return fellow_data
    except yaml.YAMLError as e:
        print(f"Error parsing {file_path}: {e}")
        return None

def get_fellows(fellow_dir):
    fellows = []
    # Get all .md files in the directory
    md_files = glob.glob(os.path.join(fellow_dir, "*.md"))
    
    for file_path in tqdm(md_files, desc="Reading fellow files"):
        fellow_data = parse_fellow_file(file_path)
        if fellow_data:
            fellows.append(fellow_data)
                
    return fellows

fellows = get_fellows(fellow_dir)


Reading fellow files: 100%|██████████| 47/47 [00:00<00:00, 821.02it/s]


In [2]:
for i, fellow in enumerate(fellows):
    print(f"{i+1}: {json.dumps(fellow, indent=2)}\n")



## Latest Updates


In [3]:
updates_dir = data_dir + "latest-updates/"

In [4]:
def parse_mdx_file(file_path):
    """Parse a MDX file with YAML frontmatter and markdown content.
    
    Args:
        file_path (str): Path to the MDX file
        
    Returns:
        tuple: (metadata_dict, markdown_content)
            - metadata_dict: Dictionary containing the YAML frontmatter data
            - markdown_content: String containing the markdown content
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            
        # Split content into YAML and markdown parts
        parts = content.split('---\n', 2)
        if len(parts) < 3:
            raise ValueError("File does not contain valid YAML frontmatter")
            
        # Parse YAML header (middle part between --- delimiters)
        metadata = yaml.safe_load(parts[1])
        
        # Get markdown content (everything after second ---)
        markdown_content = parts[2].strip()
        
        # Add file path to metadata
        metadata['file_path'] = file_path
        
        return metadata, markdown_content
        
    except (yaml.YAMLError, ValueError) as e:
        print(f"Error parsing {file_path}: {e}")
        return None, None

metadata, content = parse_mdx_file('C:/!Work/interim-bdc-website/src/data/latest-updates/2021-08-15_welcome/index.mdx')
if metadata:
    print("Metadata:", metadata)
    print("\nContent:", content)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/!Work/interim-bdc-website/src/data/latest-updates/2021-08-15_welcome/index.mdx'

In [19]:
def get_mdx_files(updates_dir):
    """Get all MDX files directly under the directory and index.mdx from subdirectories.
    
    Args:
        updates_dir (str): Path to the directory containing MDX files
        
    Returns:
        tuple: (paths, metadata_list, content_list)
            - paths: List of paths to MDX files
            - metadata_list: List of metadata dictionaries
            - content_list: List of markdown content strings
    """
    paths = []
    metadata_list = []
    content_list = []
    
    # Get MDX files directly under the directory
    direct_mdx = glob.glob(os.path.join(updates_dir, "*.mdx"))
    
    # Get index.mdx files from subdirectories
    subdir_mdx = glob.glob(os.path.join(updates_dir, "**/index.mdx"))
    
    # Combine all file paths
    all_mdx_paths = direct_mdx + subdir_mdx
    
    # Parse each file
    for file_path in tqdm(all_mdx_paths, desc="Parsing MDX files"):
        metadata, content = parse_mdx_file(file_path)
        if metadata and content:  # Only add if parsing was successful
            paths.append(file_path)
            metadata_list.append(metadata)
            content_list.append(content)
    
    return paths, metadata_list, content_list

In [20]:

paths, metadata, contents = get_mdx_files(updates_dir)


print(f"Found {len(paths)} MDX files")


for i in range(len(paths)):
    print(f"\nFile {i+1}: {paths[i]}")
    print("Metadata:", metadata[i])
    print("Content preview:", contents[i][:100], "...")

Parsing MDX files: 100%|██████████| 7/7 [00:00<00:00, 294.20it/s]

Found 7 MDX files

File 1: C:/!Work/interim-bdc-website/src/data/latest-updates\2021-08-15_welcome\index.mdx
Metadata: {'path': '/latest-updates/NHLBI-BioData-Catalyst-welcomes-all-researchers', 'date': datetime.date(2021, 7, 20), 'title': 'NHLBI BioData Catalyst welcomes all researchers', 'subtitle': 'Cloud-based ecosystem provides tools, applications, and workflows in secure workspaces for the heart, lung, blood, and sleep research community.', 'tags': ['news'], 'seo': {'title': 'NHLBI BioData Catalyst welcomes all researchers', 'description': 'Cloud-based ecosystem provides tools, applications, and workflows in secure workspaces for the heart, lung, blood, and sleep research community. ', 'keywords': ['']}, 'file_path': 'C:/!Work/interim-bdc-website/src/data/latest-updates\\2021-08-15_welcome\\index.mdx'}
Content preview: The NHLBI BioData Catalyst announced today that its ecosystem, which has undergone rigorous pilot te ...

File 2: C:/!Work/interim-bdc-website/src/data/latest-upda




## events




In [28]:
events_dir = data_dir + "events/"

paths, metadata, contents = get_mdx_files(events_dir)

# Print summary
print(f"Found {len(paths)} MDX files")

# Print details of first few files
for i in range(len(paths)):
    print(f"\nFile {i+1}: {paths[i]}")
    print("Metadata:")
    pprint(metadata[i])
    print("\nContent preview:", contents[i][:100], "...\n")

Parsing MDX files: 100%|██████████| 46/46 [00:00<00:00, 467.24it/s]

Found 46 MDX files

File 1: C:/!Work/interim-bdc-website/src/data/events\2021-08-11_Community-Hours-Showcase.mdx
Metadata:
{'date': datetime.date(2021, 8, 11),
 'display_date': 'August 11, 2021',
 'file_path': 'C:/!Work/interim-bdc-website/src/data/events\\2021-08-11_Community-Hours-Showcase.mdx',
 'forum_post': 'https://bdcatalyst.freshdesk.com/support/discussions/topics/60000406468',
 'location': '',
 'path': '/events/2022-08-11/community_hours',
 'seo': {'description': None,
         'keywords': ['community hours', 'data access', 'data exploration'],
         'title': 'NHLBI BioData Catalyst Community Hours: Exploring and '
                  'Accessing Data'},
 'tags': ['community hours', 'data access', 'data exploration'],
 'time': '1:00 - 2:00 pm EDT',
 'title': 'NHLBI BioData Catalyst Community Hours: Exploring and Accessing '
          'Data',
 'url': 'https://bit.ly/3ijOkr9'}

Content preview: Join us on Wednesday, August 11th at 1 pm EDT for an informal, [virtual event](https:


