In [1]:
import ast
import json
import os
import re
from string import ascii_uppercase, ascii_lowercase
from typing import List, Dict, Tuple

# Root folder holding the PMC-15M JSONL shards. The default is the original
# hard-coded location; set the PMC15M_DATA_ROOT environment variable to run the
# notebook against a copy of the data stored elsewhere.
DATA_ROOT = os.environ.get('PMC15M_DATA_ROOT', '/datasets/PMC-15M')
# First shard of the dataset, used as the working sample throughout the notebook.
SAMPLE_DATA = f'{DATA_ROOT}/0.jsonl'

In [31]:
from exsclaim.tool import CaptionDistributor
from exsclaim import caption

def split_caption(caption_text):
    """Split one caption with the exsclaim caption helpers.

    A ``CaptionDistributor`` is built from a minimal search query, its model is
    loaded through ``_load_model()``, and that model is handed to three
    functions of ``exsclaim.caption`` together with the caption text.

    Args:
        caption_text: The full figure caption to analyse.

    Returns:
        A 3-tuple ``(delimiter, subfigure_tokens, caption_dict)`` holding the
        results of ``find_subfigure_delimiter``, ``get_subfigure_tokens`` and
        ``associate_caption_text`` respectively.

    Note:
        The distributor and its model are created again on every call.
    """
    # Smallest query accepted here: a name, no keywords to search for, no logging.
    query_config = {
        "name": "caption_split",
        "query": {},
        "logging": [],
    }

    nlp_model = CaptionDistributor(query_config)._load_model()

    # The three helpers are called in the same order as the returned tuple.
    return (
        caption.find_subfigure_delimiter(nlp_model, caption_text),
        caption.get_subfigure_tokens(nlp_model, caption_text),
        caption.associate_caption_text(nlp_model, caption_text, query_config['query']),
    )

# Usage: run the exsclaim-based splitter on a hand-written caption that mixes a
# merged label "(A-B)" with the single labels "(C)" and "(D)".
caption_text = "Your caption text here. (A-B) First part. (C) Second part. (D) Third part."
delimiter, subfigure_tokens, caption_dict = split_caption(caption_text)

# Show the per-label dictionary as the cell output. In the recorded output the
# merged label is expanded into separate 'A' and 'B' entries and the text
# "First part." is attached to 'B' only.
caption_dict



{'A': {'description': [], 'keywords': [], 'general': []},
 'B': {'description': ['First part.'], 'keywords': [], 'general': []},
 'C': {'description': ['Second part.'], 'keywords': [], 'general': []},
 'D': {'description': ['Third part.'], 'keywords': [], 'general': []}}

In [3]:
# Load sample jsonl file: one record (a dict) per non-empty line.
#
# SECURITY NOTE: this cell used to call eval() on every line, which executes
# arbitrary Python found in the data file and also fails on the JSON literals
# null/true/false. Records are now parsed as JSON; lines that are not valid
# JSON but are plain Python dict literals (single-quoted strings) are still
# accepted through ast.literal_eval, which only evaluates literals.
dataset = []
with open(SAMPLE_DATA, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        try:
            dataset.append(json.loads(line))
        except json.JSONDecodeError:
            dataset.append(ast.literal_eval(line))

print(len(dataset))
dataset[23]

9568


{'PMC_ID': 'PMC193607',
 'media_id': 'pbio-0000031-g001',
 'caption': 'Electron micrograph of Proteobacteria in eukaryotic cell',
 'media_url': 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC193607/bin/pbio.0000031.g001.jpg',
 'media_name': 'PMC193607_pbio-0000031-g001.jpg'}

In [12]:
# Echo every caption that contains a literal "(A)" marker together with its
# position in `dataset`, then report which share of the captions that is.
count = 0

for idx, record in enumerate(dataset):
    text = record['caption']
    if '(A)' not in text:
        continue
    print(idx, '\n', text)
    count += 1

print(f'\n{count/len(dataset)*100:.2f}%')

0 
 Parasite Culturing and Data Characteristics of the P. falciparum IDC Transcriptome Analysis(A) Giemsa stains of the major morphological stages throughout the IDC are shown with the percent representation of ring-, trophozoite-, or schizont-stage parasites at every timepoint. The 2-h invasion window during the initiation of the bioreactor culture is indicated (gray area).(B–D) Example expression profiles for three genes, encoding EBA175, DHFR-TS, and ASL, are shown with a loess fit of the data (red line).(E) MAL6P1.147, the largest predicted ORF in the Plasmodium genome, is represented by 14 unique DNA oligonucleotide elements. The location of each of the oligonucleotide elements within the predicted ORF and the corresponding individual expression profiles are indicated (oligo 1–14). A red/green colorimetric representation of the gene expression ratios for each oligonucleotide is shown below the graph. The pairwise Pearson correlation for these expression profiles is 0.98 ± 0.02.(F)

In [24]:
# Order in which sub-figure labels are expected to appear: A..Z, then a..z.
_LABEL_SEQUENCE = ascii_uppercase + ascii_lowercase

# Character class used to locate the very first label of a caption.
_FIRST_LABEL = '[aA]'

# Guards that stop a label letter from being matched in the middle of a word,
# e.g. the trailing "A." of "DNA." or the leading "(a" of "(arrow)".
_NOT_AFTER_LETTER = r'(?<![a-zA-Z])'
_NOT_BEFORE_LETTER = r'(?![a-zA-Z])'

_PANEL_PREFIX = 'Panel '


def _closing_paren_end(text, start):
    """Return the index just past the first ')' at or after ``start``, or None."""
    close = text.find(')', start)
    return None if close == -1 else close + 1


# Supported label styles, in detection priority order. Every style provides:
#   regex         - pattern locating a label for a letter; group 1 is the letter
#   merged_regex  - pattern for a range label such as "A-C"; group 1 is the
#                   last letter of the range
#   extract_label - turns the matched label text into the reported label
#   find_end      - index just past a single (non-range) label, or None when
#                   the label is malformed
_LABEL_STYLES = (
    {   # "(A) ...", "(A-C) ..."
        'name': 'parenthesis',
        'regex': lambda letter: rf'\(({letter}){_NOT_BEFORE_LETTER}',
        'merged_regex': lambda letter: rf'\({letter}[^a-zA-Z]([a-zA-Z])\)',
        'extract_label': lambda text: text.strip('()'),
        'find_end': _closing_paren_end,
    },
    {   # "a. ...", "a-c. ..."
        'name': 'letter_dot',
        'regex': lambda letter: rf'{_NOT_AFTER_LETTER}({letter})\.',
        'merged_regex': lambda letter: rf'{letter}[^a-zA-Z]([a-zA-Z])\.',
        'extract_label': lambda text: text.rstrip('.'),
        'find_end': lambda text, start: start + 2,
    },
    {   # "a) ...", "a-c) ..."
        'name': 'letter_paren',
        'regex': lambda letter: rf'{_NOT_AFTER_LETTER}({letter})\)',
        'merged_regex': lambda letter: rf'{letter}[^a-zA-Z]([a-zA-Z])\)',
        'extract_label': lambda text: text.rstrip(')'),
        'find_end': lambda text, start: start + 2,
    },
    {   # "Panel A ...", "Panel A-C ..."
        'name': 'panel',
        'regex': lambda letter: rf'{_PANEL_PREFIX}({letter}){_NOT_BEFORE_LETTER}',
        # The separator must not be whitespace and the range end must be a
        # standalone letter; otherwise "Panel A shows" would be read as the
        # range A..s and the "s" of "shows" would be swallowed.
        'merged_regex': lambda letter: (
            rf'{_PANEL_PREFIX}{letter}[^a-zA-Z\s]([a-zA-Z]){_NOT_BEFORE_LETTER}'
        ),
        'extract_label': lambda text: text[len(_PANEL_PREFIX):],
        'find_end': lambda text, start: start + len(_PANEL_PREFIX) + 1,
    },
)


def _next_letter(letter):
    """Return the letter following ``letter`` in the label sequence, or None at its end."""
    position = _LABEL_SEQUENCE.index(letter) + 1
    return _LABEL_SEQUENCE[position] if position < len(_LABEL_SEQUENCE) else None


def _find_label(style, letter, text):
    """Locate the first well-formed label for ``letter`` in ``text``.

    ``letter`` is either a single letter or a regex character class.

    Returns:
        ``(start, end, label, last_letter)`` where ``text[start:end]`` is the
        raw label, ``label`` is the cleaned-up label and ``last_letter`` is the
        letter the label ends on (the range end for merged labels such as
        "A-C"). ``None`` when no label is found or the label is malformed
        (e.g. an opening parenthesis that is never closed).
    """
    found = re.search(style['regex'](letter), text)
    if found is None:
        return None

    start = found.start()
    merged = re.match(style['merged_regex'](letter), text[start:])
    if merged:
        end = start + merged.end()
        last_letter = merged.group(1)
    else:
        end = style['find_end'](text, start)
        if end is None:
            return None
        last_letter = found.group(1)

    return start, end, style['extract_label'](text[start:end]), last_letter


def _split_single_caption(full_caption):
    """Split one caption into an optional "main" entry plus one entry per label."""
    # Pick the first style (in priority order) with a well-formed "a"/"A" label.
    for style in _LABEL_STYLES:
        current = _find_label(style, _FIRST_LABEL, full_caption)
        if current is not None:
            break
    else:
        return [{"label": "main", "caption": full_caption.strip()}]

    pieces = []
    main_caption = full_caption[:current[0]].strip()
    if main_caption:
        pieces.append({"label": "main", "caption": main_caption})

    remaining = full_caption
    while current is not None:
        _, label_end, label, last_letter = current
        body = remaining[label_end:]

        # The sub-caption runs up to the label of the next expected letter, or
        # to the end of the caption when that label is absent or the label
        # sequence is exhausted.
        following = _next_letter(last_letter)
        upcoming = _find_label(style, following, body) if following else None
        text = body if upcoming is None else body[:upcoming[0]]

        pieces.append({"label": label, "caption": text.strip()})
        remaining, current = body, upcoming

    return pieces


def split_captions(dataset: List[Dict[str, str]],
                   key_field: str = 'PMC_ID') -> Dict[str, List[Dict[str, str]]]:
    """
    Split figure captions into subcaptions.

    Four label styles are recognised, tried in this order: "(A)", "a.", "a)"
    and "Panel A". The style is chosen from the first "a"/"A" label found;
    subsequent labels must follow in sequence (A, B, C, ... or a, b, c, ...).
    Range labels such as "(B-D)" are kept as one entry labelled "B-D" and the
    next expected label is the one after the range end.

    Args:
    dataset (List[Dict[str, str]]): List of dictionaries containing figure
        captions. Each item must provide 'caption' and the ``key_field`` key.
    key_field (str): Item field used as key of the result. Defaults to
        'PMC_ID'. Items sharing the same key overwrite each other (last one
        wins), so pass a per-figure field when several figures of one article
        are present and all of them must be kept.

    Returns:
    Dict[str, List[Dict[str, str]]]: Dictionary mapping the key (PMC_ID by
        default) to a list of {"label", "caption"} dictionaries. Text before
        the first label is reported under the label "main"; a caption without
        any recognised label yields a single "main" entry.

    Known limitations:
    - Parenthesised text starting with a standalone "a"/"A", such as
      "(a representative image)", is still taken for a label.
    - After "Z" the next expected label is "a"; after "z" splitting stops.
    """
    return {
        item[key_field]: _split_single_caption(item['caption'])
        for item in dataset
    }

In [25]:
### TEST CODE ###
# Other sample indices tried previously:
# 116 + 1002, 2575, 2100+3821, 2100+1015, 6125, 9476, 123, 4721, 2512, 5555
sample = 5255
result = split_captions(dataset[sample:sample+1])

# Print the result
# NOTE: the inner loop variable must not be called `caption`. At notebook level
# that name is the module imported with `from exsclaim import caption`, which
# split_caption() relies on; rebinding it here would break later calls.
for i, (pmc_id, captions) in enumerate(result.items()):
    # Results keep the order of the input slice, so entry i belongs to
    # dataset[i+sample] as long as the keys in the slice are unique.
    print(f"Original:\n{dataset[i+sample]['caption'].strip()}\n\nProcessed:")
    for entry in captions:
        print(f"label: {entry['label']} | caption: {entry['caption']}")
    print('\n', "="*100, '\n')

Original:
Electromyographic activity of the crural diaphragm a) at rest and b) on slow sustained straining. ↑ = straining

Processed:
label: main | caption: Electromyographic activity of the crural diaphragm
label: a | caption: at rest and
label: b | caption: on slow sustained straining. ↑ = straining


