<a href="https://colab.research.google.com/github/Saoudyahya/BioDataExtract/blob/main/BioDataExtract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install gradio pdfplumber spacy pandas xlsxwriter

Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting xlsxwriter
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB

In [4]:
import gradio as gr
import pdfplumber
import spacy
import re
import pandas as pd
import tempfile
import os
from io import BytesIO

# Load the SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model not found. Please download it using: python -m spacy download en_core_web_sm")
    nlp = spacy.blank("en")

# Add scientific entity patterns
ruler = nlp.add_pipe("entity_ruler", name="entity_ruler")
patterns = [
    {"label": "SPECIES", "pattern": [{"LOWER": {"REGEX": "[a-z]+"}}], "id": "scientific_species"},
    {"label": "MEASUREMENT", "pattern": [{"SHAPE": "d+.d+"}, {"LOWER": {"IN": ["mm", "cm", "m", "kg", "g"]}}]},
    {"label": "LENGTH", "pattern": [{"LOWER": "fork"}, {"LOWER": "length"}]},
]
ruler.add_patterns(patterns)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_bytes):
    try:
        # Create a BytesIO object from the bytes
        pdf_io = BytesIO(pdf_bytes)

        text = ""
        with pdfplumber.open(pdf_io) as pdf:
            for page in pdf.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"

        if not text.strip():
            return "No text could be extracted from the PDF."

        return text
    except Exception as e:
        return f"Error processing PDF: {str(e)}"

# Function to extract entities
def extract_entities(text, entity_types):
    if isinstance(text, str) and not text.startswith("Error"):
        try:
            # Limit text size to avoid memory issues
            text = text[:100000]  # Limit to first 100k characters
            doc = nlp(text)

            # Create a dictionary to store extracted entities
            extracted = {entity_type: [] for entity_type in entity_types}

            # Extract required entity types
            for ent in doc.ents:
                if ent.label_ in entity_types:
                    extracted[ent.label_].append(ent.text)

            # Special case for scientific species names
            if "SPECIES" in entity_types:
                scientific_names = re.findall(r'[A-Z][a-z]+ [a-z]+', text)
                extracted["SPECIES"].extend(scientific_names)

            # Special case for measurements
            if "MEASUREMENT" in entity_types:
                measurements = re.findall(r'\d+\.?\d*\s*(?:mm|cm|m|kg|g)', text)
                extracted["MEASUREMENT"].extend(measurements)

            # Special case for fork length
            if "LENGTH" in entity_types:
                fork_lengths = re.findall(r'(?:fork|total)?\s*length\s*(?:of)?\s*\d+\.?\d*\s*(?:mm|cm|m)', text, re.IGNORECASE)
                extracted["LENGTH"].extend(fork_lengths)

            # Remove duplicates
            for entity_type in entity_types:
                extracted[entity_type] = list(set(extracted[entity_type]))

            return extracted
        except Exception as e:
            return f"Error extracting entities: {str(e)}"
    else:
        return f"Cannot extract entities: {text}"

# Function for the Gradio interface
def process_pdf(pdf_file, species, measurements, length, person, org, date, gpe, loc):
    if pdf_file is None:
        return "Please upload a PDF file.", None

    # Get selected entity types
    entity_types = []
    if species: entity_types.append("SPECIES")
    if measurements: entity_types.append("MEASUREMENT")
    if length: entity_types.append("LENGTH")
    if person: entity_types.append("PERSON")
    if org: entity_types.append("ORG")
    if date: entity_types.append("DATE")
    if gpe: entity_types.append("GPE")
    if loc: entity_types.append("LOC")

    if not entity_types:
        return "Please select at least one entity type to extract.", None

    try:
        # Process the PDF file (which is now in bytes format)
        text = extract_text_from_pdf(pdf_file)

        if isinstance(text, str) and text.startswith("Error"):
            return text, None

        # Extract entities
        extracted = extract_entities(text, entity_types)

        if isinstance(extracted, str) and extracted.startswith("Error"):
            return extracted, None

        # Prepare results
        result_text = "## Extracted Entities\n\n"

        try:
            # Create Excel file with multiple sheets
            output = BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                for entity_type in entity_types:
                    if extracted[entity_type]:
                        result_text += f"### {entity_type}\n"
                        for item in extracted[entity_type]:
                            result_text += f"- {item}\n"
                        result_text += "\n"

                        # Add sheet to Excel file
                        df = pd.DataFrame({entity_type: extracted[entity_type]})
                        df.to_excel(writer, sheet_name=entity_type[:31], index=False)
                    else:
                        result_text += f"### {entity_type}\n"
                        result_text += "No entities of this type found.\n\n"

            output.seek(0)
            return result_text, output.getvalue()
        except Exception as e:
            return f"Error creating Excel file: {str(e)}", None
    except Exception as e:
        return f"Unexpected error: {str(e)}", None

# Create Gradio interface
with gr.Blocks(title="Scientific Entity Extraction Tool") as demo:
    gr.Markdown("# Scientific Entity Extraction Tool")
    gr.Markdown("Upload a scientific PDF and select entities to extract")

    with gr.Row():
        with gr.Column(scale=1):
            # Input components - explicitly use binary format
            pdf_input = gr.File(label="Upload PDF Document", type="binary")

            gr.Markdown("### Select Entities to Extract")
            species_cb = gr.Checkbox(label="Species Names", value=True)
            measurements_cb = gr.Checkbox(label="Measurements", value=True)
            length_cb = gr.Checkbox(label="Length Measurements", value=True)
            person_cb = gr.Checkbox(label="Person Names", value=False)
            org_cb = gr.Checkbox(label="Organizations", value=False)
            date_cb = gr.Checkbox(label="Dates", value=False)
            gpe_cb = gr.Checkbox(label="Geopolitical Entities", value=False)
            loc_cb = gr.Checkbox(label="Locations", value=False)

            extract_button = gr.Button("Extract Entities")

        with gr.Column(scale=2):
            # Output components
            output_text = gr.Markdown()
            output_file = gr.File(label="Download Results")

    # Set up event handler
    extract_button.click(
        process_pdf,
        inputs=[
            pdf_input,
            species_cb, measurements_cb, length_cb,
            person_cb, org_cb, date_cb, gpe_cb, loc_cb
        ],
        outputs=[output_text, output_file]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2834885ef0729a827a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [7]:
import pdfplumber
import spacy
import re
import pandas as pd
import tempfile
import os
from io import BytesIO

# Load the SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model not found. Please download it using: python -m spacy download en_core_web_sm")
    nlp = spacy.blank("en")

# Add scientific entity patterns
try:
    ruler = nlp.add_pipe("entity_ruler")
    patterns = [
        {"label": "SPECIES", "pattern": [{"LOWER": {"REGEX": "[a-z]+"}}], "id": "scientific_species"},
        {"label": "MEASUREMENT", "pattern": [{"SHAPE": "d+.d+"}, {"LOWER": {"IN": ["mm", "cm", "m", "kg", "g"]}}]},
        {"label": "LENGTH", "pattern": [{"LOWER": "fork"}, {"LOWER": "length"}]},
    ]
    ruler.add_patterns(patterns)
except Exception as e:
    print(f"Error setting up entity ruler: {e}")
    if "entity_ruler" not in nlp.pipe_names:
        ruler = nlp.add_pipe("entity_ruler", name="entity_ruler")

# Function to extract text from PDF with better error handling
def extract_text_from_pdf(pdf_path):
    if not pdf_path:
        return "No file provided."

    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                extracted_text = page.extract_text()
                if extracted_text:
                    text += extracted_text + "\n"

        if not text.strip():
            return "No text could be extracted from the PDF."

        return text
    except Exception as e:
        return f"Error processing PDF: {str(e)}"

# Function to extract entities with better error handling
def extract_entities(text, entity_types):
    if isinstance(text, str) and not text.startswith("Error"):
        try:
            # Limit text size to avoid memory issues
            text = text[:100000]  # Limit to first 100k characters
            doc = nlp(text)

            # Create a dictionary to store extracted entities
            extracted = {entity_type: [] for entity_type in entity_types}

            # Extract required entity types
            for ent in doc.ents:
                if ent.label_ in entity_types:
                    extracted[ent.label_].append(ent.text)

            # Special case for scientific species names
            if "SPECIES" in entity_types:
                scientific_names = re.findall(r'[A-Z][a-z]+ [a-z]+', text)
                extracted["SPECIES"].extend(scientific_names)

            # Special case for measurements
            if "MEASUREMENT" in entity_types:
                measurements = re.findall(r'\d+\.?\d*\s*(?:mm|cm|m|kg|g)', text)
                extracted["MEASUREMENT"].extend(measurements)

            # Special case for fork length
            if "LENGTH" in entity_types:
                fork_lengths = re.findall(r'(?:fork|total)?\s*length\s*(?:of)?\s*\d+\.?\d*\s*(?:mm|cm|m)', text, re.IGNORECASE)
                extracted["LENGTH"].extend(fork_lengths)

            # Remove duplicates
            for entity_type in entity_types:
                extracted[entity_type] = list(set(extracted[entity_type]))

            return extracted
        except Exception as e:
            return f"Error extracting entities: {str(e)}"
    else:
        return f"Cannot extract entities: {text}"

# Main function for processing PDFs
def process_pdf(pdf_path, entity_types):
    if not pdf_path:
        return "Please provide a PDF file path."

    if not entity_types:
        return "Please specify at least one entity type to extract."

    try:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_path)

        print(f"Extracted text length: {len(text) if isinstance(text, str) else 'Not a string'}")

        if isinstance(text, str) and text.startswith("Error"):
            return text, None

        # Extract entities
        extracted = extract_entities(text, entity_types)

        if isinstance(extracted, str) and extracted.startswith("Error"):
            return extracted, None

        # Prepare results
        result_text = "## Extracted Entities\n\n"

        try:
            # Create Excel file with multiple sheets
            output_file = "extracted_entities.xlsx"
            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                for entity_type in entity_types:
                    if extracted[entity_type]:
                        result_text += f"### {entity_type}\n"
                        for item in extracted[entity_type]:
                            result_text += f"- {item}\n"
                        result_text += "\n"

                        # Add sheet to Excel file
                        df = pd.DataFrame({entity_type: extracted[entity_type]})
                        df.to_excel(writer, sheet_name=entity_type[:31], index=False)
                    else:
                        result_text += f"### {entity_type}\n"
                        result_text += "No entities of this type found.\n\n"

            print(result_text)
            print(f"Results saved to {output_file}")
            return result_text, output_file
        except Exception as e:
            return f"Error creating Excel file: {str(e)}", None
    except Exception as e:
        return f"Unexpected error: {str(e)}", None

# Example usage
if __name__ == "__main__":
    # Define the path to your PDF file
    pdf_path = input("Enter the path to your PDF file: ")

    # Define entity types to extract
    print("\nSelect entity types to extract (enter y/n):")
    entity_types = []

    if input("Species Names (y/n): ").lower() == 'y':
        entity_types.append("SPECIES")
    if input("Measurements (y/n): ").lower() == 'y':
        entity_types.append("MEASUREMENT")
    if input("Length Measurements (y/n): ").lower() == 'y':
        entity_types.append("LENGTH")
    if input("Person Names (y/n): ").lower() == 'y':
        entity_types.append("PERSON")
    if input("Organizations (y/n): ").lower() == 'y':
        entity_types.append("ORG")
    if input("Dates (y/n): ").lower() == 'y':
        entity_types.append("DATE")
    if input("Geopolitical Entities (y/n): ").lower() == 'y':
        entity_types.append("GPE")
    if input("Locations (y/n): ").lower() == 'y':
        entity_types.append("LOC")

    # Process the PDF
    result, output_file = process_pdf(pdf_path, entity_types)

    # Display the result
    print("\nResult:")
    print(result)
    if output_file:
        print(f"Results saved to: {output_file}")YYYYY

Enter the path to your PDF file: /content/1706.03762v7 (1).pdf

Select entity types to extract (enter y/n):
Species Names (y/n): Y
Measurements (y/n): Y
Length Measurements (y/n): Y
Person Names (y/n): 
Organizations (y/n): Y
Dates (y/n): 
Geopolitical Entities (y/n): 
Locations (y/n): 




Extracted text length: 35526
## Extracted Entities

### SPECIES
- For translation
- amount
- scholarlyworks
- Forthebigmodels
- Head
- performing
- ls
- si
- attentionmechanism
- normalization
- theoutputofthepreviouslayerinthe
- range
- Differentcolorsrepresentdifferentheads
- Wesetthemaximumoutputlengthduring
- evaluate
- andWO
- separable
- accurate
- a√ndvaluesofdimensiond
- guA
- WSJonly
- inner
- attentionovertheoutputoftheencoderstack
- mationandsoftmaxfunctiontoconvertthedecoderoutputtopredictednext
- transduction
- Proceedings
- between
- dispensingwithrecurrenceandconvolutions
- right
- ofthesoftmaxwhichcorrespondtoillegalconnections
- arXivpreprintarXiv:1511.06114,2015
- implemented
- new
- aswellastheembedding
- Adam
- languagemodelingtasks[34
- end
- params
- replacingtherecurrentlayersmostcommonlyusedinencoder
- JamieRyanKiros
- QKT
- WMT
- MultiHead(Q
- fordifferentlayertypes
- continuous
- Whilethetwoaresimilarintheoreticalcomplexity
- asmallfractionofthetrainingcostsof

In [9]:
import gradio as gr
import pdfplumber
import spacy
import re
import pandas as pd
import tempfile
import os
from io import BytesIO

# Load the SpaCy model
try:
    nlp = spacy.load("en_core_web_sm")
    print("SpaCy model loaded successfully")
except OSError:
    print("SpaCy model not found. Please download it using: python -m spacy download en_core_web_sm")
    nlp = spacy.blank("en")

# Add scientific entity patterns
try:
    ruler = nlp.add_pipe("entity_ruler")
    patterns = [
        {"label": "SPECIES", "pattern": [{"LOWER": {"REGEX": "[a-z]+"}}], "id": "scientific_species"},
        {"label": "MEASUREMENT", "pattern": [{"SHAPE": "d+.d+"}, {"LOWER": {"IN": ["mm", "cm", "m", "kg", "g"]}}]},
        {"label": "LENGTH", "pattern": [{"LOWER": "fork"}, {"LOWER": "length"}]},
    ]
    ruler.add_patterns(patterns)
    print("Entity patterns added successfully")
except Exception as e:
    print(f"Error setting up entity ruler: {e}")
    if "entity_ruler" not in nlp.pipe_names:
        ruler = nlp.add_pipe("entity_ruler", name="entity_ruler")

# Function to extract text from PDF with better error handling
def extract_text_from_pdf(pdf_file):
    if pdf_file is None:
        return "No file provided."

    try:
        # Save uploaded file to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            if isinstance(pdf_file, bytes):
                tmp.write(pdf_file)
            else:
                tmp.write(pdf_file.read())
            tmp_path = tmp.name

        print(f"Temporary PDF saved to: {tmp_path}")

        text = ""
        try:
            with pdfplumber.open(tmp_path) as pdf:
                for page in pdf.pages:
                    extracted_text = page.extract_text()
                    if extracted_text:
                        text += extracted_text + "\n"
                    print(f"Extracted page with {len(extracted_text) if extracted_text else 0} characters")
        except Exception as e:
            print(f"Error in pdfplumber: {str(e)}")
            return f"Error extracting text from PDF: {str(e)}"
        finally:
            # Clean up the temporary file
            try:
                os.unlink(tmp_path)
                print("Temporary file cleaned up")
            except Exception as e:
                print(f"Failed to clean up temp file: {e}")

        if not text.strip():
            return "No text could be extracted from the PDF."

        print(f"Successfully extracted {len(text)} characters from PDF")
        return text
    except Exception as e:
        print(f"General error in extract_text_from_pdf: {str(e)}")
        return f"Error processing PDF: {str(e)}"

# Function to extract entities with better error handling
def extract_entities(text, entity_types):
    if isinstance(text, str) and not text.startswith("Error"):
        try:
            # Limit text size to avoid memory issues
            text = text[:100000]  # Limit to first 100k characters
            print(f"Processing {len(text)} characters for entity extraction")

            doc = nlp(text)
            print(f"SpaCy document created with {len(doc)} tokens")

            # Create a dictionary to store extracted entities
            extracted = {entity_type: [] for entity_type in entity_types}

            # Extract required entity types
            for ent in doc.ents:
                if ent.label_ in entity_types:
                    extracted[ent.label_].append(ent.text)

            # Special case for scientific species names
            if "SPECIES" in entity_types:
                scientific_names = re.findall(r'[A-Z][a-z]+ [a-z]+', text)
                extracted["SPECIES"].extend(scientific_names)
                print(f"Found {len(scientific_names)} potential scientific species names")

            # Special case for measurements
            if "MEASUREMENT" in entity_types:
                measurements = re.findall(r'\d+\.?\d*\s*(?:mm|cm|m|kg|g)', text)
                extracted["MEASUREMENT"].extend(measurements)
                print(f"Found {len(measurements)} potential measurements")

            # Special case for fork length
            if "LENGTH" in entity_types:
                fork_lengths = re.findall(r'(?:fork|total)?\s*length\s*(?:of)?\s*\d+\.?\d*\s*(?:mm|cm|m)', text, re.IGNORECASE)
                extracted["LENGTH"].extend(fork_lengths)
                print(f"Found {len(fork_lengths)} potential length measurements")

            # Remove duplicates
            for entity_type in entity_types:
                extracted[entity_type] = list(set(extracted[entity_type]))
                print(f"Found {len(extracted[entity_type])} unique {entity_type} entities")

            return extracted
        except Exception as e:
            print(f"Error in extract_entities: {str(e)}")
            return f"Error extracting entities: {str(e)}"
    else:
        print(f"Cannot extract entities from invalid text")
        return f"Cannot extract entities: {text}"

# Function for the Gradio interface with extensive error handling and debugging
# Function for the Gradio interface with extensive error handling and debugging
def process_pdf(pdf_file, species, measurements, length, person, org, date, gpe, loc):
    # ... (previous code: PDF check, entity selection, text extraction, entity extraction) ...

    try:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # ... (text error check) ...

        # Extract entities
        extracted = extract_entities(text, entity_types)

        # ... (extracted error check) ...

        # Prepare results
        result_text = "## Extracted Entities\n\n"
        excel_file_path = None # Initialize file path variable

        try:
            # Create Excel file in memory first
            output = BytesIO()
            with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
                has_data = False # Flag to check if any data was written
                for entity_type in entity_types:
                    if extracted[entity_type]:
                        has_data = True
                        result_text += f"### {entity_type}\n"
                        for item in extracted[entity_type]:
                            result_text += f"- {item}\n"
                        result_text += "\n"

                        # Add sheet to Excel file
                        df = pd.DataFrame({entity_type: extracted[entity_type]})
                        # Ensure sheet name is valid (Excel limits to 31 chars, avoid certain chars)
                        safe_sheet_name = re.sub(r'[\\/*?:\[\]]', '_', entity_type)[:31]
                        df.to_excel(writer, sheet_name=safe_sheet_name, index=False)
                        print(f"Added {len(extracted[entity_type])} {entity_type} entities to Excel buffer")
                    else:
                        result_text += f"### {entity_type}\n"
                        result_text += "No entities of this type found.\n\n"

            if has_data:
                output.seek(0)
                print("Excel file created in memory buffer")

                # --- CHANGE START ---
                # Save the buffer to a temporary file instead of returning bytes
                try:
                    # Create a temporary file that Gradio can access
                    # delete=False is important so the file isn't removed before Gradio reads it
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', mode='wb') as tmp_excel:
                        tmp_excel.write(output.getvalue())
                        excel_file_path = tmp_excel.name # Get the path to the temp file
                    print(f"Excel file saved temporarily to: {excel_file_path}")
                except Exception as e:
                    print(f"Error saving temporary Excel file: {str(e)}")
                    result_text += f"\n\n**Error:** Could not save Excel file for download: {str(e)}"
                    excel_file_path = None # Ensure path is None on error
                # --- CHANGE END ---
            else:
                 print("No data found for any selected entity type. Excel file not generated.")
                 result_text += "\n\n**Note:** No data found for selected types, so no Excel file was generated."


            # Return the result text and the PATH to the file (or None)
            return result_text, excel_file_path

        except Exception as e:
            print(f"Error creating Excel file content: {str(e)}")
            return f"Error creating Excel file content: {str(e)}", None # Return None for the file path on error

    except Exception as e:
        print(f"Unexpected error in process_pdf: {str(e)}")
        return f"Unexpected error: {str(e)}", None # Return None for the file path on error
# Create Gradio interface
def create_gradio_interface():
    with gr.Blocks(title="Scientific Entity Extraction Tool") as demo:
        gr.Markdown("# Scientific Entity Extraction Tool")
        gr.Markdown("Upload a scientific PDF and select entities to extract")

        with gr.Row():
            with gr.Column(scale=1):
                # Input components
                pdf_input = gr.File(label="Upload PDF Document", type="binary")

                gr.Markdown("### Select Entities to Extract")
                species_cb = gr.Checkbox(label="Species Names", value=True)
                measurements_cb = gr.Checkbox(label="Measurements", value=True)
                length_cb = gr.Checkbox(label="Length Measurements", value=True)
                person_cb = gr.Checkbox(label="Person Names", value=False)
                org_cb = gr.Checkbox(label="Organizations", value=False)
                date_cb = gr.Checkbox(label="Dates", value=False)
                gpe_cb = gr.Checkbox(label="Geopolitical Entities", value=False)
                loc_cb = gr.Checkbox(label="Locations", value=False)

                extract_button = gr.Button("Extract Entities")

            with gr.Column(scale=2):
                # Output components
                output_text = gr.Markdown()
                output_file = gr.File(label="Download Results")

        # Set up event handler
        extract_button.click(
            process_pdf,
            inputs=[
                pdf_input,
                species_cb, measurements_cb, length_cb,
                person_cb, org_cb, date_cb, gpe_cb, loc_cb
            ],
            outputs=[output_text, output_file]
        )

        # Add debugging information
        gr.Markdown("### Debug Information")
        debug_output = gr.Textbox(label="Debug Log", lines=5)

        # Override print to capture debug output
        original_print = print
        def debug_print(*args, **kwargs):
            original_print(*args, **kwargs)
            message = " ".join(str(arg) for arg in args)
            debug_output.update(message + "\n" + debug_output.value)

        # Not actually overriding print, but demonstrating the concept
        extract_button.click(
            lambda: "Starting extraction...",
            inputs=None,
            outputs=debug_output
        )

    return demo

# Launch the app
if __name__ == "__main__":
    print("Starting Scientific Entity Extraction Tool")
    demo = create_gradio_interface()
    demo.launch(debug=True)  # Enable debug mode
    print("Gradio interface launched")

SpaCy model loaded successfully
Entity patterns added successfully
Starting Scientific Entity Extraction Tool
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://45019d4570965a942b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Temporary PDF saved to: /tmp/tmpsvvtzv6m.pdf
Extracted page with 2580 characters
Extracted page with 3778 characters
Extracted page with 1616 characters
Extracted page with 2216 characters
Extracted page with 2843 characters
Extracted page with 3066 characters
Extracted page with 2921 characters
Extracted page with 2766 characters
Extracted page with 2694 characters
Extracted page with 2803 characters
Extracted page with 2963 characters
Extracted page with 2934 characters
Extracted page with 775 characters
Extracted page with 777 characters




Extracted page with 779 characters
Temporary file cleaned up
Successfully extracted 35526 characters from PDF
Processing 35526 characters for entity extraction
SpaCy document created with 4578 tokens
Found 53 potential scientific species names
Found 5 potential measurements
Found 0 potential length measurements
Found 1188 unique SPECIES entities
Found 5 unique MEASUREMENT entities
Found 0 unique LENGTH entities
Found 87 unique ORG entities
Added 1188 SPECIES entities to Excel buffer
Added 5 MEASUREMENT entities to Excel buffer
Added 87 ORG entities to Excel buffer
Excel file created in memory buffer
Excel file saved temporarily to: /tmp/tmpz48az_7l.xlsx
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7863 <> https://45019d4570965a942b.gradio.live
Gradio interface launched
