In [4]:
import re
import cleantext
import PyPDF2
#import camelot
import os
class ResearchPaperCleaner:
    """Extract text from a research-paper PDF and strip non-body content.

    All cleaning steps are regex heuristics applied to the raw extracted
    text; ``clean`` runs them in a fixed order.
    """

    def __init__(self):
        pass

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from a PDF file using PyPDF2.

        Returns the text of all pages concatenated in page order. Any failure
        (missing file, unreadable PDF, ...) is printed and "" is returned
        instead of raising.
        """
        try:
            print(f"[DEBUG] Attempting to extract text from PDF: {pdf_path}")
            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                # Join once instead of growing a string page by page.
                text = "".join(page.extract_text() for page in reader.pages)
            print("[DEBUG] Text extraction successful.")
            return text
        except Exception as e:
            print(f"[ERROR] Failed to extract text from PDF: {e}")
            return ""

    def extract_tables(self, pdf_path):
        """Return the tables camelot finds in the PDF at ``pdf_path``."""
        # The module-level camelot import is commented out, so import it here;
        # otherwise this method fails with NameError.
        import camelot

        tables = camelot.read_pdf(pdf_path)
        return tables

    def clean(self, text):
        """Main method to clean the research paper text.

        Applies, in order: reference-list removal, "[n]" tag removal, author
        block removal, header/footer removal, table/figure removal and final
        whitespace/case normalisation.
        """
        text = self._remove_references(text)
        text = self._remove_reference_tags(text)
        text = self._remove_authors_section(text)
        text = self._remove_headers_footers(text)
        text = self._remove_table_and_graph_data(text)
        text = self._clean_text(text)
        return text

    def _remove_references(self, text):
        """Remove everything from a line reading exactly "References" onward."""
        pattern = r'\nReferences\n.*'
        cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
        return cleaned_text

    def _remove_table_and_graph_data(self, text):
        """Blank out lines that look like table rows, captions or plot debris."""
        # Remove lines with numeric data (potential table rows)
        text = re.sub(r'^\s*[\d\.\-]+(?:\s+[\d\.\-]+)*\s*$', '', text, flags=re.MULTILINE)

        # Remove figure captions
        text = re.sub(r'fig\.\s*\d+\..*$', '', text, flags=re.MULTILINE | re.IGNORECASE)

        # Remove isolated numbers (potential axis labels)
        text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

        # Remove lines made only of special characters (at least three of
        # them), optionally separated by whitespace. Each repetition consumes
        # exactly one symbol, so a long rule such as "=====... end" fails in
        # linear time; the previous nested-quantifier form backtracked
        # exponentially on such lines.
        text = re.sub(r'^\s*(?:[^\w\s]\s*){3,}$', '', text, flags=re.MULTILINE)

        return text

    def _remove_reference_tags(self, text):
        """Remove reference tags like '[1]'"""
        pattern = r'\[\d+\]'
        cleaned_text = re.sub(pattern, '', text)
        return cleaned_text

    def _remove_authors_section(self, text):
        """Remove the leading block of text up to the first blank line.

        The blank line itself is kept. Text without a blank line is returned
        unchanged.
        """
        pattern = r'^.*?(?=\n\n)'  # Matches from start until first empty line
        cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
        return cleaned_text

    def _remove_headers_footers(self, text):
        """Drop lines that are exactly 'Page <n>', 'Journal Title' or 'Author Name'."""
        lines = text.split('\n')
        cleaned_lines = [line for line in lines if not re.match(r'^(Page \d+|Journal Title|Author Name)$', line)]
        return '\n'.join(cleaned_lines)

    def _clean_text(self, text):
        """Clean extra whitespace and normalize text"""
        cleaned = cleantext.clean(text,
            extra_spaces=True,
            lowercase=True,
            numbers=False,
            punct=False
        )
        return cleaned

In [5]:
# Create an instance of the cleaner
cleaner = ResearchPaperCleaner()

path = 'nutrition_research_papers'
files = os.listdir(path)

# Open the combined output once, in append mode, with an explicit encoding
# (the extracted text contains non-ASCII characters such as author names).
with open('combined_nutrition_papers.txt', 'a', encoding='utf-8') as f:
    for file in files:
        print(f'{path}/{file}')
        if not file.endswith('.pdf'):
            continue
        paper_text = cleaner.extract_text_from_pdf(f'{path}/{file}')
        cleaned_text = cleaner.clean(paper_text)
        print(cleaned_text)
        # A stray `break` used to sit here, which made the write unreachable
        # and stopped after the first paper. The newline keeps consecutive
        # papers from running together.
        f.write(cleaned_text + '\n')

print("[INFO] Cleaning and Combining complete.")

nutrition_research_papers/s41467-023-41969-1.pdf
[DEBUG] Attempting to extract text from PDF: nutrition_research_papers/s41467-023-41969-1.pdf
[DEBUG] Text extraction successful.
article https://doi.org/10.1038/s41467-023-41969-1
the personalized nutrition study (points):
evaluation of a genetically informed weightloss approach, a randomized clinical trial
christoph höchsmann1,2, shengping yang2,j o s ém .o r d o v á s3,
james l. dorling4, catherine m. champagne2, john w. apolzan2,
frank l. greenway2,m i c h e l l ei .c a r d e l5,6,g a r yd .f o s t e r5,7&
corby k. martin2
weight loss (wl) differences between iso caloric high-carbohydrate and high-fat
diets are generally small; however, indi vidual wl varies within diet groups.
genotype patterns may modify diet effect s, with carbohydrate-responsive gen-
otypes losing more weight on high-carbohydrate diets (and vice versa for fat-responsive genotypes). we investigate d whether 12-week wl (kg, primary out-
come) differs between genoty

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/npatel237/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import camelot
import fitz
import io
import os
from PIL import Image
import csv

class PDFGraphicalExtractor:
    """Pull tables (camelot) and embedded images (PyMuPDF) out of one PDF."""

    def __init__(self, pdf_path):
        # Path of the PDF every extract_* method operates on.
        self.pdf_path = pdf_path

    def extract_tables(self):
        """Return whatever ``camelot.read_pdf`` finds in the PDF.

        NOTE(review): called with camelot's default arguments; confirm that
        the default page selection covers the pages you need.
        """
        return camelot.read_pdf(self.pdf_path)

    def extract_images(self):
        """Return ``(page_num, img_index, image_bytes)`` for every embedded image.

        ``page_num`` is the zero-based page index and ``img_index`` the
        position of the image in that page's image list.
        """
        images = []
        # The context manager closes the document even when an image cannot
        # be extracted; the previous explicit close() was skipped on error.
        with fitz.open(self.pdf_path) as doc:
            for page_num, page in enumerate(doc):
                for img_index, img in enumerate(page.get_images()):
                    xref = img[0]  # first field of an image entry is its xref
                    image_bytes = doc.extract_image(xref)["image"]
                    images.append((page_num, img_index, image_bytes))
        return images

    def extract_all(self):
        """Return ``(tables, images)`` from the two extractors above."""
        tables = self.extract_tables()
        images = self.extract_images()
        return tables, images

# Usage


path = 'nutrition_research_papers'
files = os.listdir(path)

# The loops below write into these folders; create them up front so the first
# save does not fail on a fresh checkout.
os.makedirs('tables', exist_ok=True)
os.makedirs('images', exist_ok=True)

for file in files:
    if not file.endswith('.pdf'):
        continue

    extractor = PDFGraphicalExtractor(f'{path}/{file}')
    tables, images = extractor.extract_all()

    # Save tables as CSV
    for i, table in enumerate(tables):
        table.to_csv(f'tables/{file}_table_{i}.csv', index=False)

    # Save images as PNG; a single undecodable image must not stop the run
    for i, (page_num, img_index, image_bytes) in enumerate(images):
        try:
            image = Image.open(io.BytesIO(image_bytes))
            image.save(f'images/{file}_image_{i}.png')
        except Exception as e:
            print(f"[ERROR] Failed to save image: {e}")

print("[DEBUG] Extraction complete.")


In [4]:
import os
import re
import fitz  # PyMuPDF for PDFs
import spacy

# Load the spaCy pipeline once at module level; chunk_text() calls it to split
# text into sentences.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Uses PyMuPDF (``fitz``). Errors from opening or reading the file
    propagate to the caller.
    """
    # Open as a context manager so the document handle is released; it was
    # previously left open for every processed file.
    with fitz.open(pdf_path) as document:
        text = "".join(page.get_text() for page in document)
    print(f"Text extracted from {pdf_path}")
    return text

def clean_text(text):
    """Normalise extracted paper text for chunking.

    Drops the reference list, strips characters outside a small ASCII
    whitelist, collapses all whitespace to single spaces and lower-cases the
    result.
    """
    # Remove the References section. Only a line that consists of the heading
    # itself (optionally numbered, e.g. "7. References") starts the cut; the
    # previous pattern cut at the first occurrence of the word anywhere and
    # could discard most of the paper.
    text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]+)?References[ \t]*$.*',
        '',
        text,
        flags=re.DOTALL | re.MULTILINE,
    )

    # Remove headers/footers (example pattern, adjust as needed)
    text = re.sub(r'Header text pattern.*', '', text, flags=re.MULTILINE)
    text = re.sub(r'Footer text pattern.*', '', text, flags=re.MULTILINE)

    # Remove non-alphanumeric characters (if necessary) and extra spaces.
    # NOTE: this also drops non-ASCII letters (accents, Greek symbols).
    text = re.sub(r'[^a-zA-Z0-9\s.,?!:;\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    text = text.strip()

    # Optional: Convert to lowercase to standardize
    text = text.lower()

    return text

def chunk_text(text, chunk_size=500):
    """Split ``text`` into chunks of whole sentences.

    Sentences come from the module-level spaCy pipeline ``nlp``. They are
    packed greedily and joined with single spaces. A chunk never exceeds
    ``chunk_size`` characters unless a single sentence is itself longer, in
    which case that sentence forms its own chunk.

    Returns a list of non-empty chunk strings.
    """
    # Process the text with SpaCy
    doc = nlp(text)

    chunks = []
    current = []      # sentences of the chunk being built
    current_len = 0   # length of " ".join(current)

    for sent in doc.sents:
        sentence = sent.text
        # Length of the chunk if this sentence were added (joining space included)
        projected = current_len + len(sentence) + (1 if current else 0)
        if current and projected > chunk_size:
            chunks.append(" ".join(current))
            current, current_len = [sentence], len(sentence)
        else:
            # Also taken for an oversized first sentence, so no empty chunk
            # is emitted ahead of it.
            current.append(sentence)
            current_len = projected

    if current:
        chunks.append(" ".join(current))
    print("Chunks extracted")
    return chunks

def process_files_in_folder(folder_path):
    """Extract, clean and chunk every ``.pdf`` directly inside ``folder_path``.

    Each file name is announced before it is inspected; entries that do not
    end in ``.pdf`` are skipped. The chunks of one file are joined with
    newlines and terminated by a newline, and the per-file blocks are
    concatenated in directory-listing order.
    """
    per_file_blocks = []

    for entry in os.listdir(folder_path):
        print(f"{entry} is under processing...")
        if not entry.endswith(".pdf"):
            continue  # Skip non-supported file types

        full_path = os.path.join(folder_path, entry)
        chunks = chunk_text(clean_text(extract_text_from_pdf(full_path)))
        per_file_blocks.append("\n".join(chunks) + "\n")

    return "".join(per_file_blocks)

# Where the research PDFs live and where the combined text is written
folder_path = "resistant_research_papers"
output_file = "resistant.txt"

# Extract, clean and chunk every PDF in the folder
combined_cleaned_text = process_files_in_folder(folder_path)

# Overwrite the output file with the combined, chunked text (UTF-8)
with open(output_file, mode='w', encoding='utf-8') as file:
    file.write(combined_cleaned_text)

print(f"Combined text has been saved to {output_file}")

ijerph-16-04897.pdf is under processing...
Text extracted from resistant_research_papers/ijerph-16-04897.pdf
Chunks extracted
msse-53-1206.pdf is under processing...
Text extracted from resistant_research_papers/msse-53-1206.pdf
Chunks extracted
2102.00836v2.pdf is under processing...
Text extracted from resistant_research_papers/2102.00836v2.pdf
Chunks extracted
fphys-12-791999.pdf is under processing...
Text extracted from resistant_research_papers/fphys-12-791999.pdf
Chunks extracted
fspor-04-949021.pdf is under processing...
Text extracted from resistant_research_papers/fspor-04-949021.pdf
Chunks extracted
jfmk-09-00009.pdf is under processing...
Text extracted from resistant_research_papers/jfmk-09-00009.pdf
Chunks extracted
Combined text has been saved to resistant.txt


Using Gemini for advanced agentic chunking:

1. Attempting section-based chunking/separating

    Problems:
    
        a. Loss of data

In [None]:
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel, Field
from agentic_chunker import AgenticChunker
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
import PyPDF2
import os
from typing import List

# Load environment variables before GOOGLE_API_KEY is read via os.getenv below
load_dotenv()

print("Libraries loaded successfully")

# Initialize Gemini client. `llm` is a module-level object used by
# process_research_paper(); the API key comes from the GOOGLE_API_KEY
# environment variable.
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read every page of the PDF at ``pdf_path`` with PyPDF2 and return the
    page texts joined in order.

    Any failure is printed and an empty string is returned instead of raising.
    """
    try:
        print(f"[DEBUG] Attempting to extract text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as handle:
            pdf_pages = PyPDF2.PdfReader(handle).pages
            text = "".join(page.extract_text() for page in pdf_pages)
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text from PDF: {e}")
        return ""

def create_prompt(text: str) -> str:
    """
    Build the section-extraction prompt for one research paper.

    ``text`` is inserted verbatim at the end of a fixed instruction block that
    asks the model to organise the paper into seven named sections.
    """
    print("[DEBUG] Creating the agentic chunking prompt...")
    # Plain template filled with str.format; the only placeholder is {paper}.
    template = """
    You are an AI model trained to analyze and extract only the relevant data from scientific research papers.
    Your task is to extract and organize the content of the provided paper into the following sections:

    1. Objectives: Clearly describe the purpose or aim of the study.
    2. Methods: Summarize the methodology, including design, participants, and analysis techniques.
    3. Results: Provide key findings or outcomes of the study.
    4. Discussion: Highlight the relevance and implications of the findings.
    5. Conclusion: Summarize the main takeaways.
    6. Practical Applications: Explain how the findings can be applied in real-world contexts.
    7. References: Automatically detect and include the references or citations for this research paper.

    Include all text from the paper in the response, ensuring no details are omitted. 

    Here is the content of the research paper:

    {paper}
    """
    filled = template.format(paper=text)
    print("[DEBUG] Prompt successfully created.")
    return filled

def parse_response(response: str) -> dict:
    """
    Split an LLM reply into ``{section name: section text}``.

    A line of the form ``**Name:**`` opens a section; every following line is
    stripped and appended to it, separated by single spaces. Lines before the
    first header are ignored, and a repeated header restarts its section.
    """
    print("[DEBUG] Parsing response from the LLM...")
    collected = {}
    active = None

    for raw_line in response.splitlines():
        stripped = raw_line.strip()
        is_header = stripped.startswith("**") and stripped.endswith(":**")
        if is_header:
            # "**Objectives:**" -> "Objectives"
            active = stripped.strip("*:").strip()
            collected[active] = []
            continue
        if active:
            collected[active].append(stripped + " ")

    print("[DEBUG] Parsing complete.")
    return {name: "".join(parts).strip() for name, parts in collected.items()}

def process_research_paper(pdf_path: str):
    """
    Run one PDF through extract -> prompt -> LLM -> parse and print the sections.

    Returns None in every case. An empty extraction stops early, and any
    exception is printed rather than raised.
    """
    try:
        # Step 1: pull the raw text out of the PDF
        paper_text = extract_text_from_pdf(pdf_path)
        if not paper_text:
            print("[ERROR] No text extracted. Exiting...")
            return

        # Step 2: wrap it in the sectioning prompt
        llm_prompt = create_prompt(paper_text)

        # Step 3: query the model
        print("[DEBUG] Sending prompt to the LLM...")
        reply = llm.invoke([HumanMessage(content=llm_prompt)])
        print(f"[DEBUG] Response received:\n{reply}")

        # Step 4: turn the reply into {section: text}
        parsed = parse_response(reply)
        print("[DEBUG] Structured sections extracted.")

        # Step 5: show what was found
        print("\nExtracted Information:")
        for title, body in parsed.items():
            print(f"\n**{title}:**")
            print(body or "No information extracted.")

    except Exception as e:
        print(f"[ERROR] An error occurred: {e}")

# Example Usage: run the extract -> prompt -> LLM -> parse pipeline on a single
# paper; results are printed, nothing is returned or saved.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Path to your PDF
process_research_paper(pdf_path)


Libraries loaded successfully
[DEBUG] Attempting to extract text from PDF: resistant_research_papers/2102.00836v2.pdf
[DEBUG] Text extraction successful.
[DEBUG] Creating the agentic chunking prompt...
[DEBUG] Prompt successfully created.
[DEBUG] Sending prompt to the LLM...
[DEBUG] Response received:
**Objectives:**

* To elucidate the mechanism by which skeletal muscle senses and responds to mechanical load, leading to hypertrophy or atrophy.
* To develop a quantitative mathematical model that describes this process.

**Methods:**

* Literature review and theoretical analysis of muscle mechanics and mechanosensing.
* Development of a kinetic model incorporating titin kinase (TK) as the primary mechanosensor, signaling pathways, and ribosomal kinetics.
* Numerical simulations to examine model behavior under different exercise and detraining scenarios.

**Results:**

* TK is identified as a plausible mechanosensor due to its force-dependent conformational changes and its role in signal

2. Overall text chunking, without sectioning

    Problems:

        a. Irregular text generation
        b. Inconsistencies

In [None]:
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel, Field
from agentic_chunker import AgenticChunker
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
import PyPDF2
import os

# Load environment variables before GOOGLE_API_KEY is read via os.getenv below
load_dotenv()

print("Libraries loaded successfully")

# Initialize Gemini client. `llm` is a module-level object used by
# process_research_paper_with_chunking(); the API key comes from the
# GOOGLE_API_KEY environment variable.
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Return the text of all pages of ``pdf_path`` (read with PyPDF2) as one string.

    On any error the problem is printed and "" is returned.
    """
    try:
        print(f"[DEBUG] Attempting to extract text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as pdf_file:
            page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
        text = "".join(page_texts)
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text from PDF: {e}")
        return ""

def create_chunking_prompt(text: str) -> str:
    """
    Build the prompt that asks the model to group the paper's own sentences
    into chunks without summarising; ``text`` is inserted verbatim at the end.
    """
    print("[DEBUG] Creating the chunking prompt...")
    # Plain template filled with str.format; the only placeholder is {paper}.
    template = """
    You are an AI assistant skilled in analyzing and chunking research paper text. Your task is to break the text into meaningful chunks. 
    Ensure the chunks has the original text from the paper. NO summarization; focus on grouping related sentences together.
    Remove any reference number added in the sentences.
    Here is the content of the research paper to chunk, do not create your own text, rather grough the text below together which have sementic similarities:

    {paper}
    """
    filled = template.format(paper=text)
    print("[DEBUG] Prompt successfully created.")
    return filled

def parse_chunking_response(response: str) -> list:
    """
    Split the model reply on blank lines and return the non-empty pieces,
    each stripped of surrounding whitespace.
    """
    print("[DEBUG] Parsing the chunked response...")
    parsed_chunks = []
    for block in response.split("\n\n"):
        trimmed = block.strip()
        if trimmed:
            parsed_chunks.append(trimmed)
    print("[DEBUG] Parsing complete.")
    return parsed_chunks

def process_research_paper_with_chunking(pdf_path: str):
    """
    Run one PDF through extract -> chunking prompt -> LLM -> split and print
    the resulting chunks.

    Returns None in every case. An empty extraction stops early, and any
    exception is printed rather than raised.
    """
    try:
        # Step 1: pull the raw text out of the PDF
        paper_text = extract_text_from_pdf(pdf_path)
        if not paper_text:
            print("[ERROR] No text extracted. Exiting...")
            return

        # Step 2: wrap it in the chunking prompt
        llm_prompt = create_chunking_prompt(paper_text)

        # Step 3: query the model
        print("[DEBUG] Sending prompt to the LLM...")
        reply = llm.invoke([HumanMessage(content=llm_prompt)])
        print(f"[DEBUG] Response received:\n{reply}")

        # Step 4: split the reply into chunks
        pieces = parse_chunking_response(reply)
        print("[DEBUG] Chunking complete. Total chunks extracted:", len(pieces))

        # Step 5: show the chunks, numbered from 1
        print("\nExtracted Chunks:")
        for number, piece in enumerate(pieces, start=1):
            print(f"\n[Chunk {number}]:")
            print(piece)

    except Exception as e:
        print(f"[ERROR] An error occurred: {e}")

# Example Usage: chunk a single paper with the LLM; chunks are printed,
# nothing is returned or saved.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Path to your PDF
process_research_paper_with_chunking(pdf_path)


3. Only extracting text into 3 sections:

        a. Text data
        b. Graphs
        c. Tabular data

Manual Attempt

    Failed

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from dotenv import load_dotenv
from pdf2image import convert_from_path
import PyPDF2
import re
import time
import os

# Load environment variables (python-dotenv); nothing in this cell reads them directly
load_dotenv()

print("Libraries loaded successfully")


def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Collect the text of every page of ``pdf_path`` with PyPDF2.

    Returns the page texts joined in order, or "" (after printing the error)
    if anything goes wrong.
    """
    try:
        print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as source:
            text = "".join(
                page.extract_text() for page in PyPDF2.PdfReader(source).pages
            )
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Text extraction failed: {e}")
        return ""


def clean_text_data(text: str) -> str:
    """
    Clean text data by removing bracketed reference numbers and the trailing
    reference list.

    Returns the remaining text with surrounding whitespace stripped.
    """
    print("[DEBUG] Cleaning text data...")
    # Remove reference numbers like [1], [2], etc.
    cleaned_text = re.sub(r'\[\d+\]', '', text)
    # Remove the references section. Only a line that consists of the heading
    # itself (optionally numbered, e.g. "7. References") starts the cut; the
    # previous pattern cut at the first occurrence of the word anywhere and
    # could discard most of the paper.
    cleaned_text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]+)?References[ \t]*$.*',
        '',
        cleaned_text,
        flags=re.DOTALL | re.MULTILINE,
    )
    print("[DEBUG] Text data cleaned successfully.")
    return cleaned_text.strip()


def extract_graphical_data(pdf_path: str, output_dir: str) -> list:
    """
    Save each image returned by ``convert_from_path(pdf_path)`` as
    ``graph_page_<n>.png`` (n starting at 1) inside ``output_dir``.

    NOTE(review): the file naming suggests these are whole-page renders
    rather than individual figures; confirm against pdf2image.

    Returns the list of written paths, or [] (after printing the error) if
    anything fails.
    """
    try:
        print("[DEBUG] Extracting graphical data...")
        image_paths = []
        for page_no, rendered in enumerate(convert_from_path(pdf_path), start=1):
            target = os.path.join(output_dir, f"graph_page_{page_no}.png")
            rendered.save(target, "PNG")
            image_paths.append(target)
        print(f"[DEBUG] Extracted {len(image_paths)} images.")
        return image_paths
    except Exception as e:
        print(f"[ERROR] Graphical data extraction failed: {e}")
        return []


def extract_tabular_data(text: str) -> list:
    """
    Find row-like fragments: one or more words followed by at least two
    whitespace-separated numbers (integers or decimals).

    Returns the matched fragments as strings, in order of appearance.
    """
    print("[DEBUG] Extracting tabular data...")
    row_pattern = r'(\w+(?:\s+\w+)*\s+\d+(?:\.\d+)?(?:\s+\d+(?:\.\d+)?)+)'
    tables = [hit.group(1) for hit in re.finditer(row_pattern, text)]
    print(f"[DEBUG] Extracted {len(tables)} tables.")
    return tables


def divide_research_paper(pdf_path: str, output_dir: str):
    """
    Split one PDF into text, image files and table-like rows, then print a
    summary of each.

    Images are written to ``output_dir``. Table rows are searched in the raw
    (uncleaned) text. Returns None; an empty extraction stops early and any
    exception is printed rather than raised.
    """
    try:
        print(f"[DEBUG] Processing PDF: {pdf_path}")

        # Raw text first; nothing else is attempted without it
        raw_text = extract_text_from_pdf(pdf_path)
        if not raw_text:
            print("[ERROR] No text data found in the PDF.")
            return

        # The three views of the paper
        body_text = clean_text_data(raw_text)
        saved_images = extract_graphical_data(pdf_path, output_dir)
        table_rows = extract_tabular_data(raw_text)

        # Report
        print("\n[Text Data]:")
        preview = body_text[:500] + "..."  # Show only the first 500 characters for preview
        print(preview)
        print("\n[Graphical Data]:")
        print("\n".join(saved_images))
        print("\n[Tabular Data]:")
        print("\n".join(table_rows))

    except Exception as e:
        print(f"[ERROR] An error occurred during processing: {e}")


# Example Usage: split one paper into text / images / table rows and print a
# summary. Images are written into output_dir.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Replace with your PDF path
output_dir = "output_graphs"  # Directory for saving images
os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
divide_research_paper(pdf_path, output_dir)


AI Approach

In [63]:
from dotenv import load_dotenv
import PyPDF2
import os
import time
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAI

# Load environment variables before GOOGLE_API_KEY is read via os.getenv below
load_dotenv()

print("Libraries loaded successfully")

# Initialize the LLM. `llm` is a module-level object used by process_with_ai();
# the API key comes from the GOOGLE_API_KEY environment variable.
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Return the raw text of every page of ``pdf_path`` (read with PyPDF2),
    joined in page order.

    Failures are printed and reported to the caller as "".
    """
    try:
        print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as stream:
            pdf_reader = PyPDF2.PdfReader(stream)
            text = "".join(page.extract_text() for page in pdf_reader.pages)
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text: {e}")
        return ""

def process_with_ai(raw_text: str) -> str:
    """
    Ask the LLM to divide the text into Text Data, Graphical Data, and Tabular Data.

    Returns the model's reply as a single string; the reply is not parsed into
    separate sections. If the call fails, the error is printed and "" is
    returned, so the return type is the same on both paths (it was previously
    a str on success and a dict on failure, under a ``-> dict`` annotation).
    """
    try:
        print("[DEBUG] Sending text to AI for processing...")

        # AI prompt to divide the data
        prompt = (
            "You are an assistant for processing research papers. Divide the content into three sections: "
            "1. **Text Data**: Remove references, reference numbers, and irrelevant content, keeping only the main body text. "
            "2. **Graphical Data**: List the names or captions of any figures, images, or graphs found in the content. "
            "3. **Tabular Data**: Extract and organize any tabular data into a structured format. "
            f"Here is the text from the research paper:\n\n{raw_text}"
        )

        # Send the prompt to the LLM
        response = llm.predict_messages([HumanMessage(content=prompt)])
        print("[DEBUG] AI response received.")

        return response.content
    except Exception as e:
        print(f"[ERROR] Failed to process text with AI: {e}")
        return ""

def divide_research_paper_with_ai(pdf_path: str):
    """
    Extract the text of one PDF, hand it to ``process_with_ai`` and print
    whatever comes back.

    Returns None; an empty extraction stops early and any exception is
    printed rather than raised.
    """
    try:
        print(f"[DEBUG] Processing research paper: {pdf_path}")

        # Step 1: raw text from the PDF; nothing to do without it
        paper_text = extract_text_from_pdf(pdf_path)
        if not paper_text:
            print("[ERROR] No text data found in the PDF.")
            return

        # Step 2: let the model divide the content
        ai_output = process_with_ai(paper_text)

        # Step 3: show the result
        print("\n[Processed Data]:")
        print(ai_output)

    except Exception as e:
        print(f"[ERROR] An error occurred during processing: {e}")

# Example usage: send one paper's text to the LLM and print the reply.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Replace with your PDF path
divide_research_paper_with_ai(pdf_path)


Libraries loaded successfully
[DEBUG] Processing research paper: resistant_research_papers/2102.00836v2.pdf
[DEBUG] Extracting text from PDF: resistant_research_papers/2102.00836v2.pdf
[DEBUG] Text extraction successful.
Why exercise builds muscles: Titin mechanosensing controls skeletal muscle growth under load
Neil Ibata and Eugene M. Terentjev
(Dated: May 6, 2021)
Muscles sense internally generated and externally applied forces, responding to these in a coordinated hier-
archical manner at different time scales. The center of the basic unit of the muscle, the sarcomeric M-band, is
perfectly placed to sense the different types of load to which the muscle is subjected. In particular, the kinase
domain (TK) of titin located at the M-band is a known candidate for mechanical signaling. Here, we develop
the quantitative mathematical model that describes the kinetics of TK-based mechanosensitive signaling, and
predicts trophic changes in response to exercise and rehabilitation regimes. Fi

In [None]:
from google.cloud import documentai_v1beta3 as documentai
import os


def process_document_ai(file_path: str, project_id: str, location: str, processor_id: str):
    """Process a document using Google Document AI's OCR Processor.

    Args:
        file_path: Path of the PDF to send.
        project_id: Google Cloud project that owns the processor.
        location: Processor location, e.g. "us".
        processor_id: ID of the Document AI processor.

    Returns:
        The extracted text, or None if anything fails (the error is printed).
    """
    try:
        # Initialize the Document AI client against the processor's regional
        # endpoint. The region is selected through the client options; it is
        # not part of the request name.
        client = documentai.DocumentProcessorServiceClient(
            client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
        )

        # The API identifies a processor by resource name
        # (projects/.../locations/.../processors/...), not by a REST URL.
        name = client.processor_path(project_id, location, processor_id)

        # Read the file content
        with open(file_path, "rb") as f:
            content = f.read()

        # Create the request payload
        document = {"content": content, "mime_type": "application/pdf"}
        request = {
            "name": name,
            "raw_document": document
        }

        # Call the API. process_document is the client's processing call; the
        # previously used `raw_document` is a request field, not a method.
        response = client.process_document(request=request)

        # Extract text from the response
        extracted_text = response.document.text
        print(f"Extracted Text:\n{extracted_text}")
        return extracted_text

    except Exception as e:
        print(f"[ERROR] Failed to process document: {e}")
        return None


# Example usage. The call needs Google Application Default Credentials; the
# recorded run of this cell failed because none were configured.
file_path = "resistant_research_papers/2102.00836v2.pdf"  # Update with your file path
project_id = "543798683069"  # Replace with your project ID
location = "us"  # Your processor location
processor_id = "a370be5d003f980f"  # Replace with your processor ID

process_document_ai(file_path, project_id, location, processor_id)


[ERROR] Failed to process document: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.


In [None]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import os
import uuid
from typing import List, Dict, Union, Optional
from dotenv import load_dotenv
from rich import print
import PyPDF2

# Load environment variables before GOOGLE_API_KEY is read via os.getenv below
load_dotenv()


# Initialize the LLM (module-level client). AgenticChunker builds its own
# client with the same settings in __init__.
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.1
)

class AgenticChunker:
    """Group text propositions into chunks titled and summarised by an LLM.

    Chunks live in ``self.chunks``, keyed by a short id taken from a random
    UUID. Each chunk is a dict with the keys ``chunk_id``, ``propositions``,
    ``title``, ``summary``, ``chunk_index`` and ``metadata``.
    """

    def __init__(self):
        """Initialize the AgenticChunker with configuration."""
        load_dotenv()
        self.chunks: Dict[str, Dict] = {}
        # Number of leading characters of the UUID kept as the chunk id.
        self.id_truncate_limit = 5
        self.generate_new_metadata_ind = True
        # When True, progress messages are printed while chunking.
        self.print_logging = True

        # Initialize the model
        self.llm = GoogleGenerativeAI(
            model="gemini-1.0-pro",
            google_api_key=os.getenv("GOOGLE_API_KEY"),
            temperature=0.1
        )

    @staticmethod
    def _response_text(response) -> str:
        """Return the text carried by an LLM response, stripped of whitespace.

        A completion-style client is expected to hand back a plain string,
        while chat models hand back a message object with a ``content``
        attribute; both shapes are accepted. Anything else yields ``""``.
        """
        if isinstance(response, str):
            return response.strip()
        content = getattr(response, "content", None)
        return content.strip() if isinstance(content, str) else ""

    def _ask_llm(self, system_message: str, user_template: str, **variables: str) -> str:
        """Send a system/user prompt pair to the model and return its text.

        ``user_template`` refers to its inputs as ``{name}`` placeholders that
        are filled from ``variables``. Passing the text as a variable rather
        than pasting it into the template keeps curly braces inside the text
        (common in paper content) from being parsed as placeholders.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", user_template),
        ])
        messages = prompt.format_prompt(**variables).to_messages()
        return self._response_text(self.llm.invoke(messages))

    def add_propositions(self, propositions: List[str]) -> None:
        """Add multiple propositions to chunks.

        Entries that are empty or not strings are skipped; the rest are
        stripped of surrounding whitespace before being added.
        """
        for proposition in propositions:
            if proposition and isinstance(proposition, str):
                self.add_proposition(proposition.strip())

    def add_proposition(self, proposition: str) -> None:
        """Add a single proposition to an existing or new chunk."""
        if not proposition:
            return

        if self.print_logging:
            print(f"\nAdding: '{proposition}'")

        # The very first proposition always opens a chunk; no LLM lookup needed.
        if not self.chunks:
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)
        if chunk_id:
            self.add_proposition_to_chunk(chunk_id, proposition)
        else:
            self._create_new_chunk(proposition)

    def add_proposition_to_chunk(self, chunk_id: str, proposition: str) -> None:
        """Append a proposition to the chunk stored under ``chunk_id``.

        The chunk's title and summary are left unchanged.

        Raises:
            KeyError: If ``chunk_id`` does not name an existing chunk.
        """
        if chunk_id not in self.chunks:
            raise KeyError(f"Unknown chunk id: {chunk_id}")

        chunk = self.chunks[chunk_id]
        chunk['propositions'].append(proposition)

        if self.print_logging:
            print(f"Added to chunk ({chunk_id}): {chunk['title']}")

    def _create_new_chunk(self, proposition: str) -> None:
        """Create a new chunk with the given proposition.

        Any exception is reported and then re-raised to the caller.
        """
        try:
            new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]
            new_chunk_summary = self._get_new_chunk_summary(proposition)
            new_chunk_title = self._get_new_chunk_title(new_chunk_summary)

            self.chunks[new_chunk_id] = {
                'chunk_id': new_chunk_id,
                'propositions': [proposition],
                'title': new_chunk_title,
                'summary': new_chunk_summary,
                # Count of chunks that existed before this one was stored.
                'chunk_index': len(self.chunks),
                'metadata': self._generate_metadata(proposition)
            }

            if self.print_logging:
                print(f"Created new chunk ({new_chunk_id}): {new_chunk_title}")

        except Exception as e:
            print(f"Error creating new chunk: {e}")
            raise

    def _get_new_chunk_summary(self, proposition: str) -> str:
        """Generate a summary for a new chunk.

        Returns "No summary generated" when the model answers with nothing and
        "Summary generation failed" when the request raises.
        """
        try:
            summary = self._ask_llm(
                "Generate a concise summary of the following scientific proposition, "
                "focusing on key findings and implications.",
                "Proposition: {proposition}",
                proposition=proposition,
            )
            return summary or "No summary generated"

        except Exception as e:
            print(f"Error generating summary: {e}")
            return "Summary generation failed"

    def _get_new_chunk_title(self, summary: str) -> str:
        """Generate a title for a chunk based on its summary.

        Returns "Untitled Chunk" when the model answers with nothing or the
        request raises.
        """
        try:
            title = self._ask_llm(
                "Generate a brief, descriptive title (5-7 words) for this research summary.",
                "Summary: {summary}",
                summary=summary,
            )
            return title or "Untitled Chunk"

        except Exception as e:
            print(f"Error generating title: {e}")
            return "Untitled Chunk"

    def _generate_metadata(self, proposition: str) -> Dict:
        """Generate metadata for a chunk based on its content.

        The values are fixed apart from ``creation_time``; ``proposition`` is
        currently not inspected.
        """
        return {
            'source': 'research_paper',
            'content_type': 'scientific_finding',
            # NOTE: this is a uuid1 string, not a formatted timestamp.
            'creation_time': str(uuid.uuid1()),
        }

    def _find_relevant_chunk(self, proposition: str) -> Optional[str]:
        """Find the first existing chunk the model judges related to a proposition.

        Chunks are tried in insertion order, one model call per chunk, and the
        id of the first chunk whose answer contains "yes" is returned.
        Returns None when no chunk matches or when a request raises.
        """
        try:
            if not self.chunks:
                return None

            # Compare proposition with existing chunks
            for chunk_id, chunk in self.chunks.items():
                existing_props = ' '.join(chunk['propositions'])

                answer = self._ask_llm(
                    "Determine if these statements are closely related (yes/no).",
                    "Statement 1: {existing_props}\nStatement 2: {proposition}",
                    existing_props=existing_props,
                    proposition=proposition,
                )

                if answer and 'yes' in answer.lower():
                    return chunk_id

            return None

        except Exception as e:
            print(f"Error finding relevant chunk: {e}")
            return None

    def get_chunks(self, get_type: str = 'dict') -> Union[Dict, List[str]]:
        """Get chunks in the specified format.

        Args:
            get_type: ``'dict'`` returns the internal chunk mapping itself (not
                a copy); ``'list_of_strings'`` returns one string per chunk
                with its propositions joined by single spaces.

        Raises:
            ValueError: If ``get_type`` is neither of the two values above.
        """
        if get_type == 'dict':
            return self.chunks
        elif get_type == 'list_of_strings':
            return [" ".join(chunk['propositions']) for chunk in self.chunks.values()]
        else:
            raise ValueError("Invalid get_type. Use 'dict' or 'list_of_strings'")

    def save_chunks_to_file(self, file_path: str) -> None:
        """Save chunks to a file with error handling.

        The file is written as UTF-8 text, one block per chunk separated by a
        ``---`` line. Errors are reported and then re-raised.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                for chunk_id, chunk in self.chunks.items():
                    file.write(f"Chunk ID: {chunk_id}\n")
                    file.write(f"Title: {chunk['title']}\n")
                    file.write(f"Summary: {chunk['summary']}\n")
                    file.write("Propositions:\n")
                    for prop in chunk['propositions']:
                        file.write(f"- {prop}\n")
                    file.write(f"Metadata: {chunk['metadata']}\n")
                    file.write("\n---\n\n")

        except Exception as e:
            print(f"Error saving chunks to file: {e}")
            raise


# Function to extract the raw text from a PDF file
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract raw text from the PDF.

    The text of every page is concatenated in page order with no separator.
    A page for which the extractor yields nothing contributes an empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text, or "" if the file cannot be opened or parsed
        (the error is printed rather than raised).
    """
    try:
        print(f"[DEBUG] Extracting text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` keeps a page without extractable text from breaking the join.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text: {e}")
        return ""


# Function to process the extracted text using AgenticChunker
def process_with_agentic_chunking(text: str) -> Dict:
    """
    Process the extracted text using AgenticChunker and categorize the data.

    The whole text is handed to the chunker as a single proposition. Each
    resulting chunk is then assigned to one section:

    * "Graphical Data" if its title contains "figure" or its summary
      contains "graph" (case-insensitive),
    * "Tabular Data" if its title contains "table",
    * "Text Data" otherwise.

    Args:
        text: Raw text of the document.

    Returns:
        A dict with the keys "Text Data", "Graphical Data" and "Tabular Data".
        Each value holds every proposition of the matching chunks, one per line.
    """
    print("[DEBUG] Processing extracted text into sections...")

    # Initialize AgenticChunker and add propositions
    chunker = AgenticChunker()
    chunker.add_propositions([text])  # Assuming the text is passed as a single proposition

    # Get the chunks from AgenticChunker
    chunks = chunker.get_chunks(get_type='dict')

    # Divide into text, graphical, and tabular data
    grouped = {
        "Text Data": [],
        "Graphical Data": [],
        "Tabular Data": []
    }

    for chunk in chunks.values():
        title = chunk['title'].lower()
        # Assuming the chunk summary helps identify which section it belongs to
        if "figure" in title or "graph" in chunk['summary'].lower():
            label = "Graphical Data"
        elif "table" in title:
            label = "Tabular Data"
        else:
            label = "Text Data"
        # Keep every proposition of the chunk, not only the first one.
        grouped[label].extend(chunk['propositions'])

    return {
        label: "".join(f"{prop}\n" for prop in props)
        for label, props in grouped.items()
    }


# Main function to divide the research paper into sections
def divide_research_paper(pdf_path: str):
    """
    Extract a research paper's text, split it into sections and print them.

    Prints an error and stops early when no text comes back from the PDF.
    Any exception raised along the way is caught and printed, never raised.
    """
    try:
        print(f"[DEBUG] Processing research paper: {pdf_path}")

        # Step 1: pull the raw text out of the PDF
        raw_text = extract_text_from_pdf(pdf_path)
        if raw_text:
            # Step 2: sort the text into Text / Graphical / Tabular sections
            sections = process_with_agentic_chunking(raw_text)

            # Step 3: show each section under its heading
            print("\n[Sections]:")
            for heading in ("Text Data", "Graphical Data", "Tabular Data"):
                print(f"{heading}:", sections[heading])
        else:
            print("[ERROR] No text data found in the PDF.")

    except Exception as e:
        print(f"[ERROR] An error occurred during processing: {e}")


# Example usage
# Runs the full pipeline on one paper; the results are printed, not returned.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Replace with your PDF path
divide_research_paper(pdf_path)


In [71]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import os
import uuid
from typing import List, Dict, Union, Optional
from dotenv import load_dotenv
from rich import print
import re
from PyPDF2 import PdfReader

# Runs before AgenticChunker reads GOOGLE_API_KEY via os.getenv; presumably
# this lets the key come from a local .env file - confirm against project setup.
load_dotenv()

class AgenticChunker:
    """Group text propositions into LLM-described chunks and sort them into sections.

    Chunks live in ``self.chunks``, keyed by a short id taken from a random
    UUID. Each chunk is a dict with the keys ``chunk_id``, ``propositions``,
    ``title``, ``summary``, ``chunk_index`` and ``metadata``.
    """

    def __init__(self):
        """Initialize the AgenticChunker with configuration."""
        self.chunks: Dict[str, Dict] = {}
        # Number of leading characters of the UUID kept as the chunk id.
        self.id_truncate_limit = 5
        self.generate_new_metadata_ind = True
        # When True, progress messages are printed while chunking.
        self.print_logging = True

        # Initialize the model
        self.llm = GoogleGenerativeAI(
            model="gemini-1.0-pro",
            google_api_key=os.getenv("GOOGLE_API_KEY"),
            temperature=0.1
        )

    @staticmethod
    def _response_text(response) -> str:
        """Return the text carried by an LLM response, stripped of whitespace.

        A completion-style client is expected to hand back a plain string,
        while chat models hand back a message object with a ``content``
        attribute; both shapes are accepted. Anything else yields ``""``.
        """
        if isinstance(response, str):
            return response.strip()
        content = getattr(response, "content", None)
        return content.strip() if isinstance(content, str) else ""

    @staticmethod
    def _is_affirmative(answer: str) -> bool:
        """Return True when a yes/no answer starts with "yes".

        Case and surrounding whitespace are ignored, so "Yes", "yes." and
        "Yes, they are related" all count as affirmative.
        """
        return answer.strip().lower().startswith("yes")

    def _ask_llm(self, system_message: str, user_template: str, **variables: str) -> str:
        """Send a system/user prompt pair to the model and return its text.

        ``user_template`` refers to its inputs as ``{name}`` placeholders that
        are filled from ``variables``. Passing the text as a variable rather
        than pasting it into the template keeps curly braces inside the text
        (common in paper content) from being parsed as placeholders.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", user_template),
        ])
        messages = prompt.format_prompt(**variables).to_messages()
        return self._response_text(self.llm.invoke(messages))

    def add_propositions(self, propositions: List[str]) -> None:
        """Add multiple propositions to chunks.

        Entries that are empty or not strings are skipped; the rest are
        stripped of surrounding whitespace before being added.
        """
        for proposition in propositions:
            if proposition and isinstance(proposition, str):
                self.add_proposition(proposition.strip())

    def add_proposition(self, proposition: str) -> None:
        """Add a single proposition to an existing or new chunk."""
        if not proposition:
            return

        if self.print_logging:
            print(f"\nAdding: '{proposition}'")

        # The very first proposition always opens a chunk; no LLM lookup needed.
        if not self.chunks:
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)
        if chunk_id:
            self.add_proposition_to_chunk(chunk_id, proposition)
        else:
            self._create_new_chunk(proposition)

    def add_proposition_to_chunk(self, chunk_id: str, proposition: str) -> None:
        """Append a proposition to the chunk stored under ``chunk_id``.

        The chunk's title and summary are left unchanged.

        Raises:
            KeyError: If ``chunk_id`` does not name an existing chunk.
        """
        if chunk_id not in self.chunks:
            raise KeyError(f"Unknown chunk id: {chunk_id}")

        chunk = self.chunks[chunk_id]
        chunk['propositions'].append(proposition)

        if self.print_logging:
            print(f"Added to chunk ({chunk_id}): {chunk['title']}")

    def _create_new_chunk(self, proposition: str) -> None:
        """Create a new chunk with the given proposition.

        Any exception is reported and then re-raised to the caller.
        """
        try:
            new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]
            new_chunk_summary = self._get_new_chunk_summary(proposition)
            new_chunk_title = self._get_new_chunk_title(new_chunk_summary)

            self.chunks[new_chunk_id] = {
                'chunk_id': new_chunk_id,
                'propositions': [proposition],
                'title': new_chunk_title,
                'summary': new_chunk_summary,
                # Count of chunks that existed before this one was stored.
                'chunk_index': len(self.chunks),
                'metadata': self._generate_metadata(proposition)
            }

            if self.print_logging:
                print(f"Created new chunk ({new_chunk_id}): {new_chunk_title}")

        except Exception as e:
            print(f"Error creating new chunk: {e}")
            raise

    def _get_new_chunk_summary(self, proposition: str) -> str:
        """Generate a summary for a new chunk.

        Returns "No summary generated" when the model answers with nothing and
        "Summary generation failed" when the request raises.
        """
        try:
            summary = self._ask_llm(
                "Process the following text data, clean it by removing references (e.g., [1]), tables, and irrelevant sections, and categorize the data into three sections: Text Data, Graphical Data, and Tabular Data.",
                "Text: {proposition}",
                proposition=proposition,
            )
            return summary or 'No summary generated'

        except Exception as e:
            print(f"Error generating summary: {e}")
            return "Summary generation failed"

    def _get_new_chunk_title(self, summary: str) -> str:
        """Generate a title for a chunk based on its summary.

        Returns "Untitled Chunk" when the model answers with nothing or the
        request raises.
        """
        try:
            title = self._ask_llm(
                "Generate a brief, descriptive title (5-7 words) for this research summary.",
                "Summary: {summary}",
                summary=summary,
            )
            return title or "Untitled Chunk"

        except Exception as e:
            print(f"Error generating title: {e}")
            return "Untitled Chunk"

    def _generate_metadata(self, proposition: str) -> Dict:
        """Generate metadata for a chunk based on its content.

        The values are fixed apart from ``creation_time``; ``proposition`` is
        currently not inspected.
        """
        return {
            'source': 'research_paper',
            'content_type': 'scientific_finding',
            # NOTE: this is a uuid1 string, not a formatted timestamp.
            'creation_time': str(uuid.uuid1()),
        }

    def _find_relevant_chunk(self, proposition: str) -> Optional[str]:
        """Find the first existing chunk the model judges related to a proposition.

        Chunks are tried in insertion order, one model call per chunk, and the
        id of the first chunk that gets an affirmative answer is returned.
        Returns None when no chunk matches or when a request raises.
        """
        try:
            if not self.chunks:
                return None

            # Compare proposition with existing chunks
            for chunk_id, chunk in self.chunks.items():
                existing_props = ' '.join(chunk['propositions'])

                answer = self._ask_llm(
                    "Determine if these statements are closely related (yes/no).",
                    "Statement 1: {existing_props}\nStatement 2: {proposition}",
                    existing_props=existing_props,
                    proposition=proposition,
                )

                if self._is_affirmative(answer):
                    return chunk_id

            return None

        except Exception as e:
            print(f"Error finding relevant chunk: {e}")
            return None

    def get_chunks(self, get_type: str = 'dict') -> Union[Dict, List[str]]:
        """Get chunks in the specified format.

        Args:
            get_type: ``'dict'`` returns the internal chunk mapping itself (not
                a copy); ``'list_of_strings'`` returns one string per chunk
                with its propositions joined by single spaces.

        Raises:
            ValueError: If ``get_type`` is neither of the two values above.
        """
        if get_type == 'dict':
            return self.chunks
        elif get_type == 'list_of_strings':
            return [" ".join(chunk['propositions']) for chunk in self.chunks.values()]
        else:
            raise ValueError("Invalid get_type. Use 'dict' or 'list_of_strings'")

    def save_chunks_to_file(self, file_path: str) -> None:
        """Save chunks to a file with error handling.

        The file is written as UTF-8 text, one block per chunk separated by a
        ``---`` line. Errors are reported and then re-raised.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                for chunk_id, chunk in self.chunks.items():
                    file.write(f"Chunk ID: {chunk_id}\n")
                    file.write(f"Title: {chunk['title']}\n")
                    file.write(f"Summary: {chunk['summary']}\n")
                    file.write("Propositions:\n")
                    for prop in chunk['propositions']:
                        file.write(f"- {prop}\n")
                    file.write(f"Metadata: {chunk['metadata']}\n")
                    file.write("\n---\n\n")

        except Exception as e:
            print(f"Error saving chunks to file: {e}")
            raise

    def extract_and_process_pdf(self, pdf_file_path: str) -> Dict[str, Union[str, List[str]]]:
        """Extract text and categorize into sections: Text, Graphical, and Tabular.

        The full PDF text is added as a single proposition, after which all
        chunks held by this instance are categorized.

        Returns:
            The section dict produced by ``categorize_chunks``, or an empty
            dict if any step raises (the error is printed).
        """
        try:
            # Extract raw text
            text = self.extract_text_from_pdf(pdf_file_path)

            # Process the text with chunking
            self.add_propositions([text])  # Add extracted text as a proposition

            chunks = self.get_chunks(get_type='dict')
            sections = self.categorize_chunks(chunks)

            return sections

        except Exception as e:
            print(f"Error processing PDF: {e}")
            return {}

    def extract_text_from_pdf(self, pdf_file_path: str) -> str:
        """Extract raw text from the PDF.

        Page texts are concatenated in page order with no separator; a page
        that yields nothing contributes an empty string. Returns "" if the
        file cannot be opened or parsed (the error is printed).
        """
        try:
            with open(pdf_file_path, 'rb') as file:
                reader = PdfReader(file)
                full_text = "".join(page.extract_text() or "" for page in reader.pages)
            return full_text
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""

    def categorize_chunks(self, chunks: Dict) -> Dict[str, str]:
        """Categorize chunks into Text, Graphical, and Tabular data.

        A chunk goes to "Graphical Data" if its title contains "figure" or its
        summary contains "graph" (case-insensitive), to "Tabular Data" if its
        title contains "table", and to "Text Data" otherwise.

        Returns:
            A dict with the three section names as keys. Each value holds
            every proposition of the matching chunks, one per line.
        """
        grouped = {
            "Text Data": [],
            "Graphical Data": [],
            "Tabular Data": []
        }

        for chunk in chunks.values():
            title = chunk['title'].lower()
            if "figure" in title or "graph" in chunk['summary'].lower():
                label = "Graphical Data"
            elif "table" in title:
                label = "Tabular Data"
            else:
                label = "Text Data"
            # Keep every proposition of the chunk, not only the first one.
            grouped[label].extend(chunk['propositions'])

        return {
            label: "".join(f"{prop}\n" for prop in props)
            for label, props in grouped.items()
        }


# Main function to divide the research paper into sections
def divide_research_paper(pdf_path: str):
    """Main function to extract and categorize the research paper.

    Builds an AgenticChunker, lets it process the PDF, and prints the three
    resulting sections. Nothing is returned. If the chunker reports failure
    by returning an empty result, a dedicated error line is printed. Any
    exception is caught and printed rather than raised.

    Args:
        pdf_path: Path of the PDF file to process.
    """
    try:
        print(f"[DEBUG] Processing research paper: {pdf_path}")

        # Initialize AgenticChunker and process the paper
        chunker = AgenticChunker()
        sections = chunker.extract_and_process_pdf(pdf_path)

        # An empty dict is how extract_and_process_pdf signals that it failed;
        # indexing it would only surface as an opaque KeyError message.
        if not sections:
            print("[ERROR] No sections could be extracted from the PDF.")
            return

        # Display the results
        print("\n[Sections]:")
        for label in ("Text Data", "Graphical Data", "Tabular Data"):
            print(f"{label}:", sections.get(label, ""))

    except Exception as e:
        print(f"[ERROR] An error occurred during processing: {e}")


# Example usage
# Runs the full pipeline on one paper; the results are printed, not returned.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Replace with your PDF path
divide_research_paper(pdf_path)
