In [9]:
# Adds extraction of figure descriptions
import json
import os
import re
import shutil
import tarfile
import tempfile
from glob import glob
from pathlib import Path
from typing import List, Dict, Optional, Tuple

def extract_tarfile(tar_path: str) -> str:
    """Extract a gzip-compressed tar archive into a fresh temporary directory.

    Archive members are filtered before extraction: entries with an absolute
    name, a ``..`` path component, a resolved destination outside the
    extraction directory, a link target that is absolute or contains ``..``,
    or a device/FIFO type are skipped.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.

    Returns:
        Path of the temporary directory containing the extracted members.
        The caller is responsible for removing it.

    Raises:
        Exception: If the archive cannot be read as a gzip'd tar file. Any
            other error raised during extraction is propagated unchanged.
            In both cases the temporary directory is removed first.
    """
    temp_dir = tempfile.mkdtemp()
    root = os.path.realpath(temp_dir)

    def has_unsafe_components(name: str) -> bool:
        # Compare whole path components so names such as "plot..v2.png"
        # are not mistaken for a parent-directory reference.
        normalized = name.replace('\\', '/')
        return (
            normalized.startswith('/')
            or os.path.isabs(normalized)
            or '..' in normalized.split('/')
        )

    def is_safe_member(member: tarfile.TarInfo) -> bool:
        if member.isdev():
            return False
        if has_unsafe_components(member.name):
            return False
        if (member.issym() or member.islnk()) and has_unsafe_components(member.linkname):
            return False
        target = os.path.realpath(os.path.join(root, member.name))
        return target == root or target.startswith(root + os.sep)

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            members = [m for m in tar.getmembers() if is_safe_member(m)]
            tar.extractall(path=temp_dir, members=members)
    except tarfile.TarError as e:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise Exception(f"Failed to extract tar file: {e}") from e
    except Exception:
        # Do not leave a half-populated temp directory behind on I/O errors.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir

def _declares_document_class(path: str) -> bool:
    r"""Return True if the file has an uncommented \documentclass/\documentstyle."""
    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
    except OSError:
        return False
    return re.search(r'^[^%\n]*\\document(?:class|style)\b', content, re.M) is not None


def find_main_tex_file(folder: str) -> str:
    """Find the main LaTeX file under ``folder``.

    Selection order:
      1. A file named (case-insensitively) ``main.tex``, ``paper.tex``,
         ``article.tex`` or ``manuscript.tex``, in that priority.
      2. The first ``.tex`` file that declares a document class, so that
         ``\\input`` fragments are not chosen over the root document.
      3. The first ``.tex`` file found.

    Within each step, candidates are ordered shallowest-first and then
    alphabetically, so the result does not depend on directory listing order.

    Args:
        folder: Directory to search recursively.

    Returns:
        Path to the selected ``.tex`` file.

    Raises:
        FileNotFoundError: If no ``.tex`` file exists under ``folder``.
    """
    # Common main file names (prioritized)
    priority_names = ['main.tex', 'paper.tex', 'article.tex', 'manuscript.tex']

    tex_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(folder)
        for name in files
        if name.lower().endswith('.tex')
    ]
    if not tex_files:
        raise FileNotFoundError(f"No .tex file found in: {folder}")

    # os.walk order is filesystem dependent; make the choice reproducible.
    tex_files.sort(key=lambda p: (p.count(os.sep), p))

    for priority in priority_names:
        for path in tex_files:
            if os.path.basename(path).lower() == priority:
                return path

    for path in tex_files:
        if _declares_document_class(path):
            return path

    return tex_files[0]
    
def extract_figure_descriptions(tex_code: str) -> Dict[str, str]:
    r"""Collect descriptive text found near each figure's \includegraphics.

    Every ``figure`` / ``figure*`` environment is examined on its own, so an
    environment without an image cannot swallow the following one. For each
    environment a window of 5000 characters on either side is split into
    blank-line separated paragraphs; paragraphs longer than 40 characters
    that mention "figure" or "fig." (case-insensitive) and do not contain
    ``\includegraphics`` are joined. If none mention a figure, every
    sufficiently long nearby paragraph is used instead.

    Args:
        tex_code: Full LaTeX source.

    Returns:
        Mapping from each image path used inside a figure environment to the
        joined description, truncated to 1500 characters. All images of one
        environment share the same description; a later environment using
        the same path overwrites an earlier entry.
    """
    figure_descriptions: Dict[str, str] = {}
    env_pattern = re.compile(r"\\begin\{(figure\*?)\}(.*?)\\end\{\1\}", re.S)
    image_pattern = re.compile(r"\\includegraphics\*?\s*(?:\[[^\]]*\])?\s*\{([^{}]+)\}")

    for env in env_pattern.finditer(tex_code):
        image_paths = [m.group(1).strip() for m in image_pattern.finditer(env.group(2))]
        if not image_paths:
            continue

        # Expand context window
        start = max(0, env.start() - 5000)
        end = env.end() + 5000
        context = tex_code[start:end]

        # Paragraphs that are long enough and are not the figure block itself
        usable = [
            p.strip()
            for p in re.split(r"\n\s*\n", context)
            if len(p.strip()) > 40 and "\\includegraphics" not in p
        ]

        # Prefer paragraphs mentioning "Figure"/"Fig."; otherwise fall back
        # to any nearby paragraph.
        mentioning = [p for p in usable if "figure" in p.lower() or "fig." in p.lower()]
        candidate_paragraphs = mentioning or usable

        # Concatenate and truncate
        description = " ".join(candidate_paragraphs)[:1500]
        for fig_path in image_paths:
            figure_descriptions[fig_path] = description

    return figure_descriptions


def extract_sections(tex_code: str) -> Dict[str, str]:
    """Extract key sections of a paper from LaTeX code.

    Section and subsection titles are matched against keyword groups and the
    body is stored under one of the keys ``'methodology'``, ``'conclusion'``,
    ``'introduction'`` or ``'results'``. A title is assigned to the first
    group (in that order) with a matching keyword. When several headings map
    to the same key, the longest body is kept, so a short subsection cannot
    replace the enclosing section's text.

    Args:
        tex_code: Full LaTeX source.

    Returns:
        Mapping from section key to the stripped section body. Keys with no
        matching heading are absent.
    """
    sections: Dict[str, str] = {}

    # More comprehensive regex patterns
    section_patterns = [
        r'\\section\*?\{(.+?)\}(.*?)(?=\\section|\Z)',
        r'\\subsection\*?\{(.+?)\}(.*?)(?=\\(?:sub)?section|\Z)',
    ]

    # Order matters: the first group with a keyword hit claims the heading.
    keyword_groups = [
        ('methodology', ('method', 'model', 'approach', 'algorithm', 'technique')),
        ('conclusion', ('conclusion', 'concluding', 'summary', 'discussion')),
        ('introduction', ('introduction', 'intro')),
        ('results', ('result', 'experiment', 'evaluation')),
    ]

    for pattern in section_patterns:
        for title, content in re.findall(pattern, tex_code, re.S | re.I):
            title_lower = title.lower().strip()
            body = content.strip()

            for key, keywords in keyword_groups:
                if any(keyword in title_lower for keyword in keywords):
                    if key not in sections or len(body) > len(sections[key]):
                        sections[key] = body
                    break

    return sections

def _read_brace_group(text: str, open_idx: int) -> Optional[str]:
    """Return the content of the balanced ``{...}`` group opening at ``open_idx``.

    Backslash-escaped characters (e.g. ``\\{``) do not count as braces.
    Returns None if the group is never closed.
    """
    depth = 0
    i = open_idx
    while i < len(text):
        ch = text[i]
        if ch == '\\':
            i += 2  # skip the escaped character
            continue
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return text[open_idx + 1:i]
        i += 1
    return None


def _find_captions(segment: str) -> List[Tuple[int, str]]:
    r"""Return ``(position, text)`` for each \caption / \caption[short]{...}."""
    captions = []
    for m in re.finditer(r'\\caption\*?\s*(?:\[[^\]]*\])?\s*\{', segment):
        body = _read_brace_group(segment, m.end() - 1)
        if body is not None:
            captions.append((m.start(), body.strip()))
    return captions


def _pair_images_with_captions(segment: str, allow_preceding: bool) -> List[Dict[str, str]]:
    """Pair each image in ``segment`` with the next caption, or optionally the previous one."""
    captions = _find_captions(segment)
    image_re = r'\\includegraphics\*?\s*(?:\[[^\]]*\])?\s*\{([^{}]+)\}'
    pairs = []
    for m in re.finditer(image_re, segment):
        following = [text for pos, text in captions if pos > m.start()]
        preceding = [text for pos, text in captions if pos < m.start()]
        if following:
            caption = following[0]
        elif allow_preceding and preceding:
            caption = preceding[-1]
        else:
            continue
        pairs.append({'image_file': m.group(1).strip(), 'caption': caption})
    return pairs


def extract_figures(tex_code: str) -> List[Dict[str, str]]:
    r"""Extract image/caption pairs from LaTeX code.

    Comments are removed first so commented-out figures are ignored. Each
    ``figure`` / ``figure*`` environment is handled on its own: every
    ``\includegraphics`` inside it is paired with the next ``\caption`` in
    that environment, or with the closest preceding one when the caption
    comes first. Images outside figure environments are paired only with a
    caption that follows them before the next figure environment starts.
    Captions are read with balanced-brace matching, so nested commands such
    as ``\textbf{...}`` are kept intact.

    Args:
        tex_code: Full LaTeX source.

    Returns:
        List of ``{'image_file': ..., 'caption': ...}`` dicts in document
        order, with at most one entry per image file (first occurrence wins).
    """
    # Strip comments, leaving escaped percent signs (\%) alone.
    text = re.sub(r'(?<!\\)%.*', '', tex_code)

    env_re = re.compile(r'\\begin\{(figure\*?)\}(.*?)\\end\{\1\}', re.S)
    figures: List[Dict[str, str]] = []
    cursor = 0
    for env in env_re.finditer(text):
        figures.extend(_pair_images_with_captions(text[cursor:env.start()], False))
        figures.extend(_pair_images_with_captions(env.group(2), True))
        cursor = env.end()
    figures.extend(_pair_images_with_captions(text[cursor:], False))

    # Remove duplicates based on image file
    seen = set()
    unique_figures = []
    for fig in figures:
        if fig['image_file'] not in seen:
            seen.add(fig['image_file'])
            unique_figures.append(fig)

    return unique_figures

def clean_latex(text: str) -> str:
    r"""Clean LaTeX commands and formatting from text.

    Steps, in order:
      * ``\\`` line breaks (with an optional ``[len]``) become a space;
      * escaped specials (``\%``, ``\$``, ``\&``, ``\#``, ``\_``, ``\{``,
        ``\}``) are protected so they are not mistaken for comment, math or
        group delimiters, and are emitted as the plain character;
      * ``%`` comments are removed;
      * ``\textbf``/``\textit``/``\emph``/``\texttt`` are unwrapped
        (innermost first, so nested formatting keeps its text);
      * ``\cite``/``\ref``/``\label``, ``$...$`` math, ``equation``/``align``
        environments and any remaining commands are removed;
      * whitespace is collapsed to single spaces.

    Args:
        text: LaTeX fragment; may be empty or None.

    Returns:
        The cleaned text, or ``""`` for empty input.
    """
    if not text:
        return ""

    # Line breaks first, so that "\\%" is not read as an escaped percent.
    text = re.sub(r'\\\\(?:\[[^\]]*\])?', ' ', text)

    # Swap escaped specials for private-use placeholders until the end.
    specials = '%$&#_{}'
    for offset, char in enumerate(specials):
        text = text.replace('\\' + char, chr(0xE000 + offset))

    # Remove comments
    text = re.sub(r'%.*$', '', text, flags=re.MULTILINE)

    # Remove specific LaTeX commands but keep their content. Unwrap
    # brace-free groups repeatedly so nesting resolves from the inside out.
    innermost = re.compile(r'\\(?:textbf|textit|emph|texttt)\{([^{}]*)\}')
    while True:
        text, replaced = innermost.subn(r'\1', text)
        if not replaced:
            break
    # Fallback for groups that still contain bare braces.
    text = re.sub(r'\\(?:textbf|textit|emph|texttt)\{([^}]*)\}', r'\1', text)
    text = re.sub(r'\\(?:cite|ref|label)\{[^}]*\}', '', text)

    # Remove math environments
    text = re.sub(r'\$+[^$]*\$+', '', text)
    text = re.sub(r'\\begin\{equation\}.*?\\end\{equation\}', '', text, flags=re.S)
    text = re.sub(r'\\begin\{align\}.*?\\end\{align\}', '', text, flags=re.S)

    # Remove other LaTeX commands
    text = re.sub(r'\\[a-zA-Z]+\*?\{[^}]*\}', '', text)
    text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?', '', text)

    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)

    # Restore the protected characters as plain text.
    for offset, char in enumerate(specials):
        text = text.replace(chr(0xE000 + offset), char)

    return text.strip()

def find_image_file(base_path: str, image_name: str) -> Optional[str]:
    """Find the image file referenced by a LaTeX document.

    The lookup first tries the directory of ``base_path`` and its
    ``figures``, ``images``, ``graphics`` and ``fig`` sub-folders, using the
    name as given, the name plus each known extension, and the name with its
    extension swapped. If that fails, the whole tree below the base
    directory is scanned for a file whose name (or stem plus a known image
    extension) matches the basename of ``image_name`` case-insensitively.
    Only regular files are returned, never directories.

    Args:
        base_path: Path of the ``.tex`` file that references the image.
        image_name: Path as written in ``\\includegraphics{...}``.

    Returns:
        Path to the image file, or None if nothing matches.
    """
    # dirname is '' for a bare file name; treat that as the current directory.
    base_dir = os.path.dirname(base_path) or os.curdir

    # Remove extension if present
    img_name_no_ext = os.path.splitext(image_name)[0]

    # Common image extensions
    extensions = ['.png', '.jpg', '.jpeg', '.pdf', '.eps', '.svg', '.tif', '.tiff']

    # Search locations: same dir, figures/, images/, graphics/, fig/
    search_dirs = [
        base_dir,
        os.path.join(base_dir, 'figures'),
        os.path.join(base_dir, 'images'),
        os.path.join(base_dir, 'graphics'),
        os.path.join(base_dir, 'fig'),
    ]

    for search_dir in search_dirs:
        if not os.path.isdir(search_dir):
            continue

        # Try with original name
        for ext in [''] + extensions:
            candidate = os.path.join(search_dir, image_name + ext)
            if os.path.isfile(candidate):
                return candidate

        # Try without extension + new extension
        for ext in extensions:
            candidate = os.path.join(search_dir, img_name_no_ext + ext)
            if os.path.isfile(candidate):
                return candidate

    # Last resort: scan every sub-directory, ignoring case.
    wanted_name = os.path.basename(image_name).lower()
    if not wanted_name:
        return None
    wanted_stem = os.path.splitext(wanted_name)[0]

    matches = []
    for root, _, files in os.walk(base_dir):
        for filename in files:
            lowered = filename.lower()
            stem, ext = os.path.splitext(lowered)
            if lowered == wanted_name:
                rank = 0
            elif ext in extensions and stem in (wanted_name, wanted_stem):
                rank = 1 + extensions.index(ext)
            else:
                continue
            matches.append((rank, os.path.join(root, filename)))

    # Lowest rank wins; ties are broken by path so the result is stable.
    return min(matches)[1] if matches else None

def process_tar_gz(tar_path: str, image_output_dir: str = 'dataset/images') -> List[Dict]:
    """Process a tar.gz file containing a LaTeX paper.

    The archive is extracted to a temporary directory, the main ``.tex``
    file is parsed for figures, figure descriptions and sections, and every
    figure whose image file can be located is copied to ``image_output_dir``
    as ``<paper_id>_fig<N><ext>``. The temporary directory is always removed.

    Args:
        tar_path: Path to the archive.
        image_output_dir: Directory that receives the copied images; created
            if missing.

    Returns:
        One record per figure whose image was found. ``figure_path`` is the
        copied file name prefixed with the last component of
        ``image_output_dir``. On any error the error is printed and an empty
        list is returned.
    """
    output_records = []

    # Strip a known archive suffix from the end only, so that dots inside
    # the name (e.g. "my.target.tar.gz") are left untouched.
    file_name = Path(tar_path).name
    for suffix in ('.tar.gz', '.tgz', '.tar', '.gz'):
        if file_name.lower().endswith(suffix):
            paper_id = file_name[:-len(suffix)]
            break
    else:
        paper_id = Path(tar_path).stem

    # Folder name used in the stored relative figure path.
    image_dir_name = os.path.basename(os.path.normpath(image_output_dir))

    print(f"🔍 Processing: {paper_id}")

    try:
        # Create the output directory before extracting so that a failure
        # here cannot leave an extracted temp directory behind.
        os.makedirs(image_output_dir, exist_ok=True)
        tex_folder = extract_tarfile(tar_path)

        try:
            # Find and read main tex file
            tex_file = find_main_tex_file(tex_folder)
            print(f"📄 Found TeX file: {os.path.basename(tex_file)}")

            with open(tex_file, 'r', encoding='utf-8', errors='ignore') as f:
                tex_code = f.read()
            # Extract figure_descriptions
            figure_descriptions = extract_figure_descriptions(tex_code)
            # Extract sections and figures
            sections = extract_sections(tex_code)
            figures = extract_figures(tex_code)

            print(f"📊 Found {len(figures)} figures, {len(sections)} sections")

            # Section text is identical for every figure: clean it once.
            section_fields = {
                key: clean_latex(sections.get(key, ''))
                for key in ('methodology', 'conclusion', 'introduction', 'results')
            }

            # Process each figure; numbering counts figures without an image
            # too, so figure ids follow the order in the document.
            for i, fig in enumerate(figures):
                image_name = fig['image_file']

                # Find the actual image file
                image_path = find_image_file(tex_file, image_name)

                if not (image_path and os.path.exists(image_path)):
                    print(f"⚠️  Image not found: {image_name}")
                    continue

                # Copy image to output directory
                new_img_name = f"{paper_id}_fig{i+1}{Path(image_path).suffix}"
                dest_path = os.path.join(image_output_dir, new_img_name)
                shutil.copyfile(image_path, dest_path)
                print(f"✅ Copied: {new_img_name}")

                # Create record
                output_records.append({
                    'paper_id': paper_id,
                    'figure_id': f"fig{i+1}",
                    'figure_path': os.path.join(image_dir_name, new_img_name),
                    'caption': clean_latex(fig['caption']),
                    'figure_description': clean_latex(figure_descriptions.get(image_name, "")),
                    **section_fields,
                })

        finally:
            # Clean up temporary directory
            shutil.rmtree(tex_folder, ignore_errors=True)

    except Exception as e:
        # Best effort: one broken paper must not abort a batch run.
        print(f"❌ Error processing {paper_id}: {e}")
        return []

    print(f"✅ Processed {paper_id}: {len(output_records)} records")
    return output_records

def save_jsonl(records: List[Dict], output_file: str) -> None:
    """Write ``records`` to ``output_file`` as JSON Lines (one object per line).

    The parent directory of ``output_file`` is created when missing. Text is
    written as UTF-8 with non-ASCII characters left unescaped. An existing
    file is overwritten. A summary line is printed afterwards.
    """
    parent_dir = os.path.dirname(output_file)
    os.makedirs(parent_dir if parent_dir else '.', exist_ok=True)

    serialized = (json.dumps(entry, ensure_ascii=False) + '\n' for entry in records)
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.writelines(serialized)

    print(f"💾 Saved {len(records)} records to {output_file}")

def process_multiple_papers(tar_files: List[str], output_file: str, image_dir: str = 'dataset/images') -> None:
    """Process multiple tar.gz files and save all records to one JSONL file.

    Archives that do not exist are reported and skipped. The final summary
    counts only the archives that were actually handed to the processor.

    Args:
        tar_files: Paths of the archives to process.
        output_file: Destination JSONL file.
        image_dir: Directory that receives the copied figure images.
    """
    all_records = []
    processed_papers = 0

    for tar_file in tar_files:
        if not os.path.exists(tar_file):
            print(f"⚠️  File not found: {tar_file}")
            continue

        all_records.extend(process_tar_gz(tar_file, image_dir))
        processed_papers += 1

    save_jsonl(all_records, output_file)
    print(f"🎉 Total processed: {len(all_records)} records from {processed_papers} papers")

In [10]:
# === Run Batch Process === (with figure descriptions)
if __name__ == '__main__':
    # glob returns matches in arbitrary order; sort so that the order of
    # records in the output file is the same on every run.
    tar_files = sorted(glob('latexfiles/*.tar.gz'))
    print(f"📦 Found {len(tar_files)} .tar.gz files")

    # Process each archive in turn and pool the per-figure records.
    all_records = []
    for fpath in tar_files:
        print(f"🚀 Processing {fpath}")
        records = process_tar_gz(fpath, image_output_dir='dataset/images1')
        print(f"✅ Extracted {len(records)} figures")
        all_records.extend(records)

    print(f"📝 Saving {len(all_records)} total records...")
    save_jsonl(all_records, 'dataset/finance_dataset5.jsonl')
    print("🎉 All done.")

📦 Found 2 .tar.gz files
🚀 Processing latexfiles/arXiv-2501.00034v1.tar.gz
🔍 Processing: arXiv-2501.00034v1
📄 Found TeX file: main.tex
📊 Found 3 figures, 4 sections
⚠️  Image not found: figures/inference
✅ Copied: arXiv-2501.00034v1_fig2.png
✅ Copied: arXiv-2501.00034v1_fig3.png
✅ Processed arXiv-2501.00034v1: 2 records
✅ Extracted 2 figures
🚀 Processing latexfiles/arXiv-2506.02796v1.tar.gz
🔍 Processing: arXiv-2506.02796v1
📄 Found TeX file: LSTM_BEKK_v2.tex
📊 Found 4 figures, 4 sections
✅ Copied: arXiv-2506.02796v1_fig1.png
✅ Copied: arXiv-2506.02796v1_fig2.png
✅ Copied: arXiv-2506.02796v1_fig3.png
✅ Copied: arXiv-2506.02796v1_fig4.png
✅ Processed arXiv-2506.02796v1: 4 records
✅ Extracted 4 figures
📝 Saving 6 total records...
💾 Saved 6 records to dataset/finance_dataset5.jsonl
🎉 All done.
