This notebook parses another notebook to find all the images linked within its markdown cells, so that we know which files must be shipped alongside that notebook for it to display properly even after it is moved to another location in the filesystem.

In [6]:
import json
import re
import os
from shutil import copy2

# Replace 'your_notebook.ipynb' with the path to your notebook
notebook_path = './Séance4_CM.ipynb'

# Load the notebook JSON structure
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook = json.load(f)

# Markdown image syntax: ![alt text](target) or ![alt text](target "title").
# Both groups are lazy, so a target that itself contains ')' is cut at the first ')'.
image_pattern = r'!\[.*?\]\((.*?)\)'
# Optional quoted title that may follow the target inside the parentheses;
# it is not part of the path and must be removed before the path is used.
image_title_pattern = r'''\s+(?:"[^"]*"|'[^']*')$'''
# Images can also be embedded in markdown cells as raw HTML <img src="..."> tags.
html_image_pattern = r'''<img\b[^>]*?\bsrc\s*=\s*(?:"([^"]*)"|'([^']*)')'''

# List to store the found image paths (in order of appearance, duplicates kept)
image_paths = []

# Iterate through the cells, looking only at markdown cells
for cell in notebook['cells']:
    if cell['cell_type'] != 'markdown':
        continue

    # 'source' is joined into a single string before scanning
    markdown_content = ''.join(cell['source'])

    # Markdown-style images: strip the optional title and <...> wrapper from the target
    for target in re.findall(image_pattern, markdown_content):
        target = re.sub(image_title_pattern, '', target.strip())
        if target.startswith('<') and target.endswith('>'):
            target = target[1:-1]
        if target:  # ignore empty targets such as ![]()
            image_paths.append(target)

    # HTML-style images: exactly one of the two quote groups is non-empty per match
    for double_quoted, single_quoted in re.findall(
            html_image_pattern, markdown_content, flags=re.IGNORECASE):
        src = (double_quoted or single_quoted).strip()
        if src:
            image_paths.append(src)

# Display the list of found image paths
print(f"Found {len(image_paths)} images referenced in markdown cells:")
for img in image_paths:
    print(img)

Found 1 images referenced in markdown cells:
../../../images/regline.png


In [3]:
# Optional: copy these images to a new location, for example, a folder called 'images'
# Thomas: this doesn't work since the links were broken in the first place

# Default destination folder used by copy_images when no dest_dir is given
new_image_dir = './images'


def copy_images(image_paths, dest_dir=None):
    """Copy every existing image file into a single flat destination folder.

    Each file is copied under its base name only, so the directory structure
    of the source paths is NOT preserved and two images sharing a base name
    overwrite each other in the destination.

    Parameters
    ----------
    image_paths : iterable of str
        Paths of the images to copy.
    dest_dir : str, optional
        Folder receiving the copies. Defaults to the module-level
        ``new_image_dir``, looked up at call time.

    Returns
    -------
    None
        Progress is reported with ``print``.
    """
    if dest_dir is None:
        dest_dir = new_image_dir

    for img_path in image_paths:
        if os.path.isfile(img_path):
            # Create the destination lazily: copy2 fails when the target folder
            # is missing, and nothing is created when there is nothing to copy.
            os.makedirs(dest_dir, exist_ok=True)
            img_filename = os.path.basename(img_path)
            new_img_path = os.path.join(dest_dir, img_filename)
            copy2(img_path, new_img_path)
            print(f"Copied {img_path} to {new_img_path}")
        elif os.path.isdir(img_path):
            # copy2 only handles regular files; passing a directory would raise
            print(f"Skipping directory (only files are copied): {img_path}")
        else:
            print(f"Image path not found: {img_path}")

# Copy the images listed in image_paths into new_image_dir (comment this line out to skip copying)
copy_images(image_paths)

Image path not found: .../../images/function_definition.jpg
Image path not found: ../../images/function_execution.jpg
Image path not found: .../../images/C-3PO_droid.png
Image path not found: ../../../images/ipad_battery.png
Image path not found: ../../../images/normal_shaded.png


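Going beyond images now: the next cell also collects regular markdown links and any quoted string in a code cell that matches an existing path on disk.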

In [10]:
import json
import re
import os
from shutil import copy2

# Replace 'your_notebook.ipynb' with the path to your notebook
notebook_path = './Séance4_CM.ipynb'

# Load the notebook JSON structure
with open(notebook_path, 'r', encoding='utf-8') as f:
    notebook = json.load(f)

# Regular expressions to find file links
# Markdown pattern for images or any file link (e.g., ![alt text](path) or [text](path)).
# Each match yields a 2-tuple in which exactly one group can be non-empty.
markdown_pattern = r'!\[.*?\]\((.*?)\)|\[.*?\]\((.*?)\)'
# Optional quoted title after the target, e.g. [text](path "title"); not part of the path
link_title_pattern = r'''\s+(?:"[^"]*"|'[^']*')$'''
# Code pattern for string literals that may hold a file path (e.g., 'data/file.csv', "assets/style.css")
code_pattern = r'["\'](.*?)["\']'

# Lists to store found assets, in order of appearance
markdown_assets = []
code_assets = []

# Iterate through the cells to find markdown and code cells
for cell in notebook['cells']:
    if cell['cell_type'] == 'markdown':
        markdown_content = ''.join(cell['source'])
        # Find all asset links in the markdown content (images, links, etc.)
        for image_target, link_target in re.findall(markdown_pattern, markdown_content):
            target = (image_target or link_target).strip()
            # Drop the optional title and the optional <...> wrapper around the target
            target = re.sub(link_title_pattern, '', target)
            if target.startswith('<') and target.endswith('>'):
                target = target[1:-1]
            if target:  # ignore empty targets
                markdown_assets.append(target)

    elif cell['cell_type'] == 'code':
        code_content = ''.join(cell['source'])
        # Naive check: keep a quoted string only if it names something that exists on disk
        for candidate in re.findall(code_pattern, code_content):
            if os.path.exists(candidate):
                code_assets.append(candidate)

# Combine all assets
all_assets = markdown_assets + code_assets

# Remove duplicates while keeping first-seen order, so the listing is the
# same on every run (a set would reorder the entries between kernel sessions)
all_assets = list(dict.fromkeys(all_assets))

# Display the found assets
print(f"Found {len(all_assets)} linked assets:")
for asset in all_assets:
    print(asset)

# Optional: copy these assets to a new location (e.g., an 'assets' directory)
# Default destination folder used by copy_assets when no dest_dir is given
new_asset_dir = 'assets'


def copy_assets(asset_paths, dest_dir=None):
    """Copy every existing asset file into a single flat destination folder.

    Each file is copied under its base name only, so the directory structure
    of the source paths is NOT preserved and two assets sharing a base name
    overwrite each other in the destination. Entries that are not local files
    (for example URLs) fall through to the "not found" report.

    Parameters
    ----------
    asset_paths : iterable of str
        Paths of the assets to copy.
    dest_dir : str, optional
        Folder receiving the copies. Defaults to the module-level
        ``new_asset_dir``, looked up at call time.

    Returns
    -------
    None
        Progress is reported with ``print``.
    """
    if dest_dir is None:
        dest_dir = new_asset_dir

    for asset_path in asset_paths:
        if os.path.isfile(asset_path):
            # Create the destination lazily: copy2 fails when the target folder
            # is missing, and nothing is created when there is nothing to copy.
            os.makedirs(dest_dir, exist_ok=True)
            asset_filename = os.path.basename(asset_path)
            new_asset_path = os.path.join(dest_dir, asset_filename)
            copy2(asset_path, new_asset_path)
            print(f"Copied {asset_path} to {new_asset_path}")
        elif os.path.isdir(asset_path):
            # copy2 only handles regular files; passing a directory would raise
            print(f"Skipping directory (only files are copied): {asset_path}")
        else:
            print(f"Asset path not found: {asset_path}")

# Uncomment the following line to copy the assets to the 'assets' folder
# copy_assets(all_assets)

Found 11 linked assets:
http://www.stat.ufl.edu/~winner/datasets.html
../../../images/regline.png
https://commons.wikimedia.org/wiki/File:Dugong_dugon.jpg
http://digitalcommons.wku.edu/ijes/vol6/iss2/10/
https://carpentries-incubator.github.io/high-dimensional-analysis-in-python
http://www.statsci.org/data/oz/dugongs.html
http://blogs.scientificamerican.com/the-curious-wavefunction/chocolate-consumption-and-nobel-prizes-a-bizarre-juxtaposition-if-there-ever-was-one/
http://www.reuters.com/article/2012/10/10/us-eat-chocolate-win-the-nobel-prize-idUSBRE8991MS20121010#vFdfFkbPVlilSjsB.97
https://www.data8.org
http://www.stat.ufl.edu/%7Ewinner/
http://www.biostat.jhsph.edu/courses/bio621/misc/Chocolate%20consumption%20cognitive%20function%20and%20nobel%20laurates%20%28NEJM%29.pdf
