In [None]:
import re
import json
import os
from typing import List, Dict
from pathlib import Path


class LatexTripletExtractor:
    """Extract triplet data from LaTeX file for PhysBERT fine-tuning

    The parser looks for entries of the form::

        \\item \\textbf{Query}: <query text>
        \\begin{itemize}
          \\item \\textbf{Positive Hit}: <positive passage>
          \\item \\textbf{Negative Hit}: <negative passage>
        \\end{itemize}

    A negative label may carry a parenthesised qualifier, e.g.
    ``\\textbf{Negative Hit (hard)}``, and an entry may contain several
    negatives; they are joined into one string.
    """

    # Label that opens a negative-hit entry (optional "(...)" qualifier).
    _NEGATIVE_LABEL = r'\\item\s+\\textbf\{Negative Hit(?:\s*\([^)]*\))?\}'

    # Lazily captured entry body. A nested itemize block is consumed as one
    # unit so that its \end{itemize} is not mistaken for the end of the
    # outer list (one level of nesting is supported).
    _ENTRY_BODY = r'((?:\\begin\{itemize\}.*?\\end\{itemize\}|.)*?)'

    # One complete "Query" item, up to the next query / end of list / page.
    _ITEM_PATTERN = re.compile(
        r'\\item\s+\\textbf\{Query\}:\s*(.*?)'
        r'(?=\\item\s+\\textbf\{Query\}:|\\end\{enumerate\}|\\newpage)',
        re.DOTALL)

    # Query text is everything before the first itemize environment.
    _QUERY_PATTERN = re.compile(r'^(.*?)(?=\\begin\{itemize\})', re.DOTALL)

    _POSITIVE_PATTERN = re.compile(
        r'\\item\s+\\textbf\{Positive Hit\}:\s*' + _ENTRY_BODY
        + r'(?=' + _NEGATIVE_LABEL + r':|\\end\{itemize\})',
        re.DOTALL)

    _NEGATIVE_PATTERN = re.compile(
        _NEGATIVE_LABEL + r':\s*' + _ENTRY_BODY
        + r'(?=' + _NEGATIVE_LABEL + r'|\\end\{itemize\})',
        re.DOTALL)

    _NESTED_ITEMIZE = re.compile(
        r'\\begin\{itemize\}.*?\\end\{itemize\}', re.DOTALL)

    def __init__(self, config_path: str = "config/default.yaml"):
        # config_path is only stored; nothing in this class reads the file.
        self.config_path = config_path
        self.triplets = []

    def extract_triplets_from_latex(self, latex_file_path: str) -> List[Dict]:
        """
        Extract query/positive/negative triplets from LaTeX file

        Valid triplets are appended to ``self.triplets``, so repeated calls
        on the same instance accumulate results.

        Args:
            latex_file_path: Path to the LaTeX file

        Returns:
            ``self.triplets`` on success; an empty list if the file is
            missing or any other error occurs (the error is printed).
        """
        try:
            with open(latex_file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            for item_content in self._ITEM_PATTERN.findall(content):
                triplet = self._parse_single_triplet(item_content)
                # Incomplete or too-short entries are skipped silently.
                if triplet and self._validate_triplet(triplet):
                    self.triplets.append(triplet)

            print(f"✅ Successfully extracted {len(self.triplets)} triplets")
            return self.triplets

        except FileNotFoundError:
            print(f"❌ LaTeX file not found: {latex_file_path}")
            return []
        except Exception as e:
            # Best-effort API: report the problem and hand back nothing.
            print(f"❌ Error extracting triplets: {str(e)}")
            return []

    def _parse_single_triplet(self, item_content: str) -> Dict:
        """Parse a single triplet from item content

        Args:
            item_content: Text of one query item (everything after the
                ``\\textbf{Query}:`` label).

        Returns:
            Dict holding whichever of ``query``, ``positive`` and
            ``negative`` could be found; missing parts are simply absent.
        """
        triplet = {}

        query_match = self._QUERY_PATTERN.search(item_content)
        if query_match:
            triplet['query'] = self._clean_text(query_match.group(1))

        positive_match = self._POSITIVE_PATTERN.search(item_content)
        if positive_match:
            # Drop nested itemize blocks (variable definitions) but keep the
            # passage text on either side of them.
            positive_text = self._NESTED_ITEMIZE.sub(
                '', positive_match.group(1))
            triplet['positive'] = self._clean_text(positive_text)

        negative_matches = self._NEGATIVE_PATTERN.findall(item_content)
        if negative_matches:
            # One negative is used as is; several are joined with a space.
            triplet['negative'] = " ".join(
                self._clean_text(neg) for neg in negative_matches)

        return triplet

    def _clean_text(self, text: str) -> str:
        """Clean LaTeX formatting from text

        Args:
            text: Raw LaTeX fragment.

        Returns:
            The fragment with commands and ``$`` delimiters removed and
            whitespace collapsed; ``""`` for empty input.
        """
        if not text:
            return ""

        # Order matters: unwrap \textbf / \text first so their content is
        # kept, because the generic rule below deletes a command together
        # with its braced argument.
        text = re.sub(r'\\textbf\{([^}]*)\}', r'\1', text)  # Bold text
        text = re.sub(r'\\text\{([^}]*)\}', r'\1', text)   # Text commands
        # Other commands (optional [..] argument, one {..} argument)
        text = re.sub(r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?\{[^}]*\}', '', text)
        text = re.sub(r'\\[a-zA-Z]+\*?', '', text)  # Simple commands
        text = re.sub(r'\$([^$]*)\$', r'\1', text)  # Inline math
        text = re.sub(r'\s+', ' ', text)  # Multiple spaces
        return text.strip()

    def _validate_triplet(self, triplet: Dict) -> bool:
        """Validate that triplet has all required fields

        Args:
            triplet: Dict produced by ``_parse_single_triplet``.

        Returns:
            True when ``query``, ``positive`` and ``negative`` are all
            present, non-blank, and at least 10 / 20 / 20 characters long.
        """
        # Minimum lengths for meaningful training data.
        min_lengths = {'query': 10, 'positive': 20, 'negative': 20}

        for field in min_lengths:
            if field not in triplet or not triplet[field].strip():
                return False

        return all(
            len(triplet[field]) >= minimum
            for field, minimum in min_lengths.items())

    @staticmethod
    def _ensure_parent_dir(output_path: str) -> None:
        """Create the directory that will contain ``output_path`` if needed."""
        parent = os.path.dirname(output_path)
        # A bare filename has no directory part; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def save_to_json(self, output_path: str = "data/physics_triplets.json") -> bool:
        """Save extracted triplets to JSON file

        Args:
            output_path: Destination file; missing parent directories are
                created. A bare filename writes to the working directory.

        Returns:
            True on success, False if anything went wrong (the error is
            printed).
        """
        try:
            self._ensure_parent_dir(output_path)

            # Structure data for training
            training_data = {
                "metadata": {
                    "total_triplets": len(self.triplets),
                    "source": "Physics textbook chapter - Particle interactions",
                    "format": "triplet",
                    "fields": ["query", "positive", "negative"]
                },
                "triplets": self.triplets
            }

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(training_data, f, indent=2, ensure_ascii=False)

            print(f"✅ Saved {len(self.triplets)} triplets to {output_path}")
            return True

        except Exception as e:
            print(f"❌ Error saving to JSON: {str(e)}")
            return False

    def save_for_sentence_transformers(self, output_path: str = "data/sentence_transformers_format.json") -> bool:
        """Save in format optimized for SentenceTransformers

        Each record has an ``id``, a ``texts`` list ordered
        ``[query, positive, negative]`` and a constant ``label`` of 1.

        Args:
            output_path: Destination file; missing parent directories are
                created. A bare filename writes to the working directory.

        Returns:
            True on success, False if anything went wrong (the error is
            printed).
        """
        try:
            # Format for SentenceTransformers InputExample
            st_format = [
                {
                    "id": f"physics_triplet_{i}",
                    "texts": [
                        triplet["query"],
                        triplet["positive"],
                        triplet["negative"]
                    ],
                    "label": 1  # Positive pair label for triplet loss
                }
                for i, triplet in enumerate(self.triplets)
            ]

            self._ensure_parent_dir(output_path)

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(st_format, f, indent=2, ensure_ascii=False)

            print(f"✅ Saved SentenceTransformers format to {output_path}")
            return True

        except Exception as e:
            print(f"❌ Error saving SentenceTransformers format: {str(e)}")
            return False


def main():
    """Run the extraction end to end: parse, preview one triplet, save."""
    print("🔬 PhysBERT Triplet Data Extractor")
    print("=" * 40)

    extractor = LatexTripletExtractor()

    # LaTeX source to mine for triplets
    latex_file = "/data/main.tex"
    if not os.path.exists(latex_file):
        print(f"❌ LaTeX file not found: {latex_file}")
        return

    triplets = extractor.extract_triplets_from_latex(latex_file)
    if not triplets:
        print("❌ No triplets extracted")
        return

    # Preview the first triplet, truncating each part to 100 characters
    print("\n📋 Sample triplet:")
    print("-" * 20)
    first = triplets[0]
    preview_fields = (
        ("Query", "query"),
        ("Positive", "positive"),
        ("Negative", "negative"),
    )
    for label, key in preview_fields:
        print(f"{label}: {first[key][:100]}...")

    # Write both output formats
    print("\n💾 Saving data...")
    extractor.save_to_json("data/physics_triplets.json")
    extractor.save_for_sentence_transformers("data/physics_triplets_st.json")

In [2]:
# Execute the extraction workflow
main()

🔬 PhysBERT Triplet Data Extractor
✅ Successfully extracted 53 triplets

📋 Sample triplet:
--------------------
Query: How is the mean rate of energy loss for a heavy charged particle described at intermediate energies?...
Positive: The mean rate of energy loss by moderately relativistic charged heavy particles is well described by...
Negative: Stopping power differs somewhat for electrons and positrons, and both differ from stopping power for...

💾 Saving data...
✅ Saved 53 triplets to data/physics_triplets.json
✅ Saved SentenceTransformers format to data/physics_triplets_st.json
