In [21]:
import re
import yaml
import json
from tqdm import tqdm  

def _jsx_to_markdown(text, tags_to_remove):
    """Reduce an MDX/JSX fragment to plain markdown text.

    Args:
        text: Raw MDX fragment.
        tags_to_remove: Names of JSX elements that are dropped together with
            everything between their opening and closing tags.

    Returns:
        The fragment with the listed elements removed, ``<Link to="...">``
        elements rewritten as markdown links, and every other ``<...>`` tag
        stripped (the text between those tags is kept).
    """
    for tag in tags_to_remove:
        text = re.sub(f'<{tag}.*?>.*?</{tag}>', '', text, flags=re.DOTALL)

    # <Link> tags to markdown links (must run before the generic tag strip,
    # otherwise the link target would be lost).
    text = re.sub(r'<Link to="([^"]*)"[^>]*>(.*?)</Link>', r'[\2](\1)', text)

    # Remove all remaining JSX/JS tags.
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)


def clean_mdx(file_path):
    """Extract the front matter and markdown body of an MDX page.

    The file is expected to hold an optional ``---`` delimited YAML front
    matter followed by a ``<PageContent>`` element. An optional
    ``<FloatingContentWrapper>`` inside the page content is lifted out of its
    original position and re-inserted at the end of the section at split
    index 2 (or the last section when the page has fewer sections).

    Args:
        file_path: Path of the ``.mdx`` file to read.

    Returns:
        A ``(header_dict, markdown)`` tuple. ``header_dict`` is the parsed
        front matter (``{}`` when absent or empty); ``markdown`` is the
        stripped, JSX-free page content (``""`` when the file has no
        ``<PageContent>`` element).
    """
    tags_to_remove = ["ButtonContainer", "NextStepsCard"]

    # "utf-8-sig" also reads plain UTF-8, but drops a leading BOM that would
    # otherwise stop the '^---' front-matter anchor from matching.
    with open(file_path, 'r', encoding="utf-8-sig") as file:
        content = file.read()

    # Extract the YAML header/front matter
    header_dict = {}
    yaml_header = re.search(r'^---\n(.*?)\n---', content, re.DOTALL)
    if yaml_header:
        # safe_load returns None for an empty document; keep the dict contract.
        header_dict = yaml.safe_load(yaml_header.group(1)) or {}
        content = content[yaml_header.end():]

    # Extract PageContent
    page_content_match = re.search(r'<PageContent.*?>(.*?)</PageContent>', content, re.DOTALL)
    page_content = page_content_match.group(1) if page_content_match else ""

    # Extract FloatingContentWrapper, then remove it from its original position
    floating_content_match = re.search(
        r'<FloatingContentWrapper.*?>(.*?)</FloatingContentWrapper>', page_content, flags=re.DOTALL)
    floating_raw = floating_content_match.group(1) if floating_content_match else ""
    page_content = re.sub(
        r'<FloatingContentWrapper.*?</FloatingContentWrapper>', '', page_content, flags=re.DOTALL)

    # Clean both parts with the same rules so the re-inserted floating content
    # does not carry raw JSX into the output.
    cleaned_content = _jsx_to_markdown(page_content, tags_to_remove)
    floating_content = _jsx_to_markdown(floating_raw, tags_to_remove).strip()

    # Split in front of each h2 heading. The lookahead keeps the "## " prefix
    # on every section, so '\n'.join(sections) restores the text exactly and
    # headings stay at the start of a line.
    sections = re.split(r'\n(?=##\s)', cleaned_content)
    if floating_content and len(sections) > 1:
        # Preferred slot is index 2; clamp so shorter pages do not raise
        # IndexError.
        insert_index = min(2, len(sections) - 1)
        sections[insert_index] = sections[insert_index].rstrip() + "\n\n" + floating_content + "\n"
    cleaned_content = '\n'.join(sections)

    return header_dict, cleaned_content.strip()

# Example usage: run the cleaner on a single page as a smoke test.
# NOTE(review): absolute, machine-specific path — this cell only runs on a
# machine where this checkout exists; consider deriving it from a
# configurable base directory.
file_path = 'C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/index.mdx'
header, markdown_content = clean_mdx(file_path)

# print("Header as dict:")
# print(json.dumps(header, indent=2))
# print("\nExtracted Markdown Content:")
# print(markdown_content)

In [22]:
# Root directory that the relative paths below are joined onto.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
pages_dir = 'C:/!Work/interim-bdc-website/src/pages/'
# Directories (relative to pages_dir) whose immediate .mdx files are all included.
page_dir_paths = ["use-bdc/analyze-data/", ]
# Individual .mdx files (relative to pages_dir) to include.
page_file_paths = ["join-bdc/index.mdx", "use-bdc/share-data.mdx", 
                   "user-resources/terms-of-use.mdx", "user-resources/usage-costs.mdx", "user-resources/usage-terms.mdx",
                   "about/key-collaborations.mdx", "about/overview.mdx", "about/research-communities.mdx",
                   "use-bdc/explore-data/index.mdx"]

def get_all_mdx_paths(pages_dir, page_dir_paths, page_file_paths):
    """Collect the full paths of the MDX pages to process.

    Args:
        pages_dir: Base directory every relative path is joined onto.
        page_dir_paths: Directories (relative to ``pages_dir``) whose
            immediate ``.mdx`` files are included; subdirectories are not
            searched.
        page_file_paths: Individual files (relative to ``pages_dir``).

    Returns:
        A list of paths: first the ``.mdx`` files of each listed directory
        (sorted by file name within a directory), then the individual files
        in the order given. Entries that do not exist, or are not of the
        expected kind (directory vs. file), are skipped.
    """
    import os

    all_paths = []

    # 1. Process directory paths
    for dir_path in page_dir_paths:
        full_dir_path = os.path.join(pages_dir, dir_path)
        # isdir (not exists): a regular file here would make listdir raise.
        if not os.path.isdir(full_dir_path):
            continue
        # os.listdir returns names in arbitrary order; sort so the result is
        # reproducible across runs and machines.
        for file_name in sorted(os.listdir(full_dir_path)):
            candidate = os.path.join(full_dir_path, file_name)
            if file_name.endswith('.mdx') and os.path.isfile(candidate):
                all_paths.append(candidate)

    # 2. Process individual file paths
    for file_path in page_file_paths:
        full_file_path = os.path.join(pages_dir, file_path)
        if os.path.isfile(full_file_path):
            all_paths.append(full_file_path)

    return all_paths

# Usage example: resolve the configured directories/files into full paths and
# list them with 1-based numbering.
mdx_paths = get_all_mdx_paths(pages_dir, page_dir_paths, page_file_paths)
for i, path in enumerate(mdx_paths):
    print(f"{i+1}: {path}")
    


1: C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/bdc-workspaces.mdx
2: C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/index.mdx
3: C:/!Work/interim-bdc-website/src/pages/join-bdc/index.mdx
4: C:/!Work/interim-bdc-website/src/pages/use-bdc/share-data.mdx
5: C:/!Work/interim-bdc-website/src/pages/user-resources/terms-of-use.mdx
6: C:/!Work/interim-bdc-website/src/pages/user-resources/usage-costs.mdx
7: C:/!Work/interim-bdc-website/src/pages/user-resources/usage-terms.mdx
8: C:/!Work/interim-bdc-website/src/pages/about/key-collaborations.mdx
9: C:/!Work/interim-bdc-website/src/pages/about/overview.mdx
10: C:/!Work/interim-bdc-website/src/pages/about/research-communities.mdx
11: C:/!Work/interim-bdc-website/src/pages/use-bdc/explore-data/index.mdx


In [23]:
# Parallel lists: entry k of each list belongs to mdx_paths[k].
metadata_list = []
page_content_list = []

# Clean every page, keeping the front matter and the markdown body separately.
for path in tqdm(mdx_paths):
    header, page_content = clean_mdx(path)
    metadata_list.append(header)
    page_content_list.append(page_content)


100%|██████████| 11/11 [00:00<00:00, 358.61it/s]


In [30]:
# Print each cleaned page body under its 1-based index and source path
# (page_content_list is index-aligned with mdx_paths).
for i, content in enumerate(page_content_list):
    print(f"{i+1}: \t{mdx_paths[i]}\n")
    print(content)
    print("\n\n")



1: 	C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/bdc-workspaces.mdx

The term "BDC’s workspaces" refers *to BDC Powered by Seven Bridges*.
While it is not the primary analysis platform in BDC, users may opt
to use Terra. Use the following chart to help determine which analysis
platform works best for your needs.


  
    
    *BDC Powered by Seven Bridges*
    *BDC Powered by Terra*
  
  
    
      **Cloud Compute Environments**
    
    
      - Google 
      - AWS
      - Azure
    
    
      - Google 
      - Azure
    
  
  
    
      **Workflow Languages**

      Analyze data in batches using large-scale, non-interactive analysis workflows leveraging bioinformatics tools and AI/ML* libraries. 
    
    
      - CWL (1.0 and 1.2)
      - [Nextflow](https://docs.sevenbridges.com/docs/bring-nextflow-apps-to-the-platform) 
      - [WDL](https://docs.sevenbridges.com/docs/bring-wdl-apps-to-the-platform)    
    
    
      - WDL 
    
  
  
    
      **Interactive An

In [32]:
# Print each page's front matter as indented JSON under its 1-based index and
# source path (metadata_list is index-aligned with mdx_paths).
for i, metadata in enumerate(metadata_list):
    print(f"{i+1}: {mdx_paths[i]}")
    print(json.dumps(metadata, indent=2))
    print("\n\n")



1: C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/bdc-workspaces.mdx
{}



2: C:/!Work/interim-bdc-website/src/pages/use-bdc/analyze-data/index.mdx
{
  "menu": [
    {
      "heading": "About Analyzing Data in BDC",
      "href": "about-analyzing-data-in-bdc"
    },
    {
      "heading": "Prepare to Analyze Data",
      "href": "prepare-to-analyze-data"
    },
    {
      "heading": "Build a Cohort for Analysis (Optional)",
      "href": "build-a-cohort-for-analysis-optional"
    },
    {
      "heading": "Import Data to a BDC Workspace",
      "href": "import-data-to-a-bdc-workspace"
    },
    {
      "heading": "Utilize Analysis Tools",
      "href": "utilize-analysis-tools"
    },
    {
      "heading": "Bring Your Own Data (BYOD)",
      "href": "bring-your-own-data-byod"
    },
    {
      "heading": "Terra Users",
      "href": "terra-users"
    }
  ]
}



3: C:/!Work/interim-bdc-website/src/pages/join-bdc/index.mdx
{}



4: C:/!Work/interim-bdc-website/src/pages/u