In [4]:
import os
import re
import fitz  # PyMuPDF for PDFs
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    print(f"Text extracted from {pdf_path}")
    return text

def clean_text(text):
    # Remove References section (if applicable)
    text = re.sub(r'References.*', '', text, flags=re.DOTALL)

    # Remove headers/footers (example pattern, adjust as needed)
    text = re.sub(r'Header text pattern.*', '', text, flags=re.MULTILINE)
    text = re.sub(r'Footer text pattern.*', '', text, flags=re.MULTILINE)

    # Remove non-alphanumeric characters (if necessary) and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s.,?!:;\'"-]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    text = text.strip()
    
    # Optional: Convert to lowercase to standardize
    text = text.lower()
    
    return text

def chunk_text(text, chunk_size=500):
    # Process the text with SpaCy
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    chunks = []
    chunk = ""

    for sentence in sentences:
        if len(chunk) + len(sentence) > chunk_size:
            chunks.append(chunk)
            chunk = sentence
        else:
            chunk += " " + sentence

    if chunk:
        chunks.append(chunk)
    print(f"Chunks extracted")
    return chunks

def process_files_in_folder(folder_path):
    combined_text = ""
    
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        print(f"{file_name} is under processing...")
        if file_name.endswith(".pdf"):
            raw_text = extract_text_from_pdf(file_path)
        else:
            continue  # Skip non-supported file types
        
        cleaned_text = clean_text(raw_text)
        chunks = chunk_text(cleaned_text)
        
        # Combine all chunks into one text (can save to individual files or a combined file)
        combined_text += "\n".join(chunks) + "\n"
    
    return combined_text

# Specify the folder containing your research files
folder_path = "resistant_research_papers"

# Process the files in the folder and get the combined cleaned text
combined_cleaned_text = process_files_in_folder(folder_path)

# Save the combined cleaned and chunked text to a file
output_file = "resistant.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(combined_cleaned_text)

print(f"Combined text has been saved to {output_file}")

ijerph-16-04897.pdf is under processing...
Text extracted from resistant_research_papers/ijerph-16-04897.pdf
Chunks extracted
msse-53-1206.pdf is under processing...
Text extracted from resistant_research_papers/msse-53-1206.pdf
Chunks extracted
2102.00836v2.pdf is under processing...
Text extracted from resistant_research_papers/2102.00836v2.pdf
Chunks extracted
fphys-12-791999.pdf is under processing...
Text extracted from resistant_research_papers/fphys-12-791999.pdf
Chunks extracted
fspor-04-949021.pdf is under processing...
Text extracted from resistant_research_papers/fspor-04-949021.pdf
Chunks extracted
jfmk-09-00009.pdf is under processing...
Text extracted from resistant_research_papers/jfmk-09-00009.pdf
Chunks extracted
Combined text has been saved to resistant.txt


Using Gemini for advanced agentic chunking:

1. Attempting Section based chunking/ seperating

    Problems:
    
        a. Loss of data

In [None]:
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel, Field
from agentic_chunker import AgenticChunker
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
import PyPDF2
import os
from typing import List

# Load environment variables
load_dotenv()

print("Libraries loaded successfully")

# Initialize Gemini client
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF file using PyPDF2.
    """
    try:
        print(f"[DEBUG] Attempting to extract text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text from PDF: {e}")
        return ""

def create_prompt(text: str) -> str:
    """
    Create the agentic chunking prompt for research papers.
    """
    print("[DEBUG] Creating the agentic chunking prompt...")
    prompt = f"""
    You are an AI model trained to analyze and extract entire data from scientific research papers.
    Your task is to extract and organize the content of the provided paper into the following sections:

    1. Objectives: Clearly describe the purpose or aim of the study.
    2. Methods: Summarize the methodology, including design, participants, and analysis techniques.
    3. Results: Provide key findings or outcomes of the study.
    4. Discussion: Highlight the relevance and implications of the findings.
    5. Conclusion: Summarize the main takeaways.
    6. Practical Applications: Explain how the findings can be applied in real-world contexts.
    7. References: Automatically detect and include the references or citations for this research paper.

    Include all text from the paper in the response, ensuring no details are omitted. 

    Here is the content of the research paper:

    {text}
    """
    print("[DEBUG] Prompt successfully created.")
    return prompt

def parse_response(response: str) -> dict:
    """
    Dynamically parse the response into structured sections.
    """
    print("[DEBUG] Parsing response from the LLM...")
    sections = {}
    current_section = None

    for line in response.splitlines():
        line = line.strip()
        if line.startswith("**") and line.endswith(":**"):
            # Detect section headers like "**Objectives:**"
            current_section = line.strip("*:").strip()
            sections[current_section] = ""
        elif current_section:
            # Append content to the current section
            sections[current_section] += line + " "

    print("[DEBUG] Parsing complete.")
    return {k: v.strip() for k, v in sections.items()}

def process_research_paper(pdf_path: str):
    """
    Process a research paper PDF file and extract structured information.
    """
    try:
        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if not text:
            print("[ERROR] No text extracted. Exiting...")
            return

        # Step 2: Create the prompt
        prompt = create_prompt(text)

        # Step 3: Use the LLM to process the prompt
        print("[DEBUG] Sending prompt to the LLM...")
        message = HumanMessage(content=prompt)
        response = llm.invoke([message])
        print(f"[DEBUG] Response received:\n{response}")

        # Step 4: Parse the response into structured sections
        sections = parse_response(response)
        print("[DEBUG] Structured sections extracted.")

        # Step 5: Display the extracted information
        print("\nExtracted Information:")
        for section, content in sections.items():
            print(f"\n**{section}:**")
            print(content if content else "No information extracted.")


    except Exception as e:
        print(f"[ERROR] An error occurred: {e}")

# Example Usage
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Path to your PDF
process_research_paper(pdf_path)


In [58]:
from langchain import hub
from langchain_core.pydantic_v1 import BaseModel, Field
from agentic_chunker import AgenticChunker
from langchain_core.messages import HumanMessage
from langchain_google_genai import GoogleGenerativeAI
from dotenv import load_dotenv
import PyPDF2
import os

# Load environment variables
load_dotenv()

print("Libraries loaded successfully")

# Initialize Gemini client
llm = GoogleGenerativeAI(
    model="gemini-1.0-pro",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.3
)

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF file using PyPDF2.
    """
    try:
        print(f"[DEBUG] Attempting to extract text from PDF: {pdf_path}")
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
        print("[DEBUG] Text extraction successful.")
        return text
    except Exception as e:
        print(f"[ERROR] Failed to extract text from PDF: {e}")
        return ""

def create_chunking_prompt(text: str) -> str:
    """
    Create a prompt for chunking the text into meaningful parts.
    """
    print("[DEBUG] Creating the chunking prompt...")
    prompt = f"""
    You are an AI assistant skilled in analyzing and chunking research paper text. Your task is to break the text into meaningful chunks. 
    Ensure the chunks has the original text from the paper. NO summarization; focus on grouping related sentences together.
    Remove any reference number added in the sentences.
    Here is the content of the research paper to chunk, do not create your own text, rather grough the text below together which have sementic similarities:

    {text}
    """
    print("[DEBUG] Prompt successfully created.")
    return prompt

def parse_chunking_response(response: str) -> list:
    """
    Parse the chunking response into a list of chunks.
    """
    print("[DEBUG] Parsing the chunked response...")
    chunks = response.split("\n\n")
    parsed_chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
    print("[DEBUG] Parsing complete.")
    return parsed_chunks

def process_research_paper_with_chunking(pdf_path: str):
    """
    Process a research paper PDF file and chunk its content into meaningful parts.
    """
    try:
        # Step 1: Extract text from the PDF
        text = extract_text_from_pdf(pdf_path)
        if not text:
            print("[ERROR] No text extracted. Exiting...")
            return

        # Step 2: Create the chunking prompt
        prompt = create_chunking_prompt(text)

        # Step 3: Use the LLM to process the prompt
        print("[DEBUG] Sending prompt to the LLM...")
        message = HumanMessage(content=prompt)
        response = llm.invoke([message])
        print(f"[DEBUG] Response received:\n{response}")

        # Step 4: Parse the response into chunks
        chunks = parse_chunking_response(response)
        print("[DEBUG] Chunking complete. Total chunks extracted:", len(chunks))

        # Step 5: Display the extracted chunks
        print("\nExtracted Chunks:")
        for i, chunk in enumerate(chunks, 1):
            print(f"\n[Chunk {i}]:")
            print(chunk)

    except Exception as e:
        print(f"[ERROR] An error occurred: {e}")

# Example Usage
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Path to your PDF
process_research_paper_with_chunking(pdf_path)


Libraries loaded successfully
[DEBUG] Attempting to extract text from PDF: resistant_research_papers/2102.00836v2.pdf
[DEBUG] Text extraction successful.
[DEBUG] Creating the chunking prompt...
[DEBUG] Prompt successfully created.
[DEBUG] Sending prompt to the LLM...
[DEBUG] Response received:
**Chunk 1:**
- Introduces the topic of muscle growth and hypertrophy
- Explains the role of titin mechanosensing in muscle growth
- Provides an overview of the model developed by the authors

**Chunk 2:**
- Describes the structure and function of titin kinase (TK)
- Explains how TK opens under force and can be phosphorylated
- Introduces the concept of a metastable mechanosensitive switch

**Chunk 3:**
- Discusses the role of signaling molecules in muscle growth
- Explains how the model incorporates the activation of signaling molecules
- Introduces the concept of ribosome biogenesis and its role in muscle growth

**Chunk 4:**
- Explains how the model incorporates the feedback loop between muscle