Agentic Segmentation:

In [None]:
import os
import json
import re
import PyPDF2
from langchain_google_genai import GoogleGenerativeAI

def initialize_model():
    """
    Build the Gemini text model used by the rest of this script.

    The API key is read from the GOOGLE_API_KEY environment variable
    (None is passed through when the variable is unset).

    Returns:
        A GoogleGenerativeAI instance configured for "gemini-1.0-pro"
        with temperature 0.1.
    """
    settings = {
        "model": "gemini-1.0-pro",
        "google_api_key": os.getenv("GOOGLE_API_KEY"),
        "temperature": 0.1,
    }
    return GoogleGenerativeAI(**settings)

def clean_response(response_str):
    """
    Strip Markdown code-fence markers from a model reply and parse it as JSON.

    Args:
        response_str: Raw reply text. It may be wrapped in a fenced code
            block (opening fence with or without a "json" tag) and may have
            surrounding whitespace or CRLF line endings.

    Returns:
        The decoded JSON value, or {"findings": [], "metadata": {}} when the
        cleaned text is not valid JSON.
    """
    # Trim first: the patterns below are anchored, so stray whitespace before
    # or after the fence would otherwise leave the markers in place.
    cleaned = response_str.strip()

    # Opening fence: optional "json" tag (any case) plus whatever whitespace
    # or newline style follows it.
    cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
    # Closing fence, together with the whitespace that precedes it.
    cleaned = re.sub(r'\s*```$', '', cleaned)

    # Attempt to parse the cleaned string into a JSON object
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("Error parsing the response as JSON, response was:", cleaned)
        return {"findings": [], "metadata": {}}

def chunk_and_clean_text(model, raw_text):
    """
    Ask the model to extract findings and a title from one piece of text.

    Args:
        model: The language model. Its ``invoke`` method is used when
            present; otherwise the object itself is called with the prompt.
        raw_text: Text to analyse (one PDF page in this script).

    Returns:
        A dict with the keys "findings" (list) and "metadata" (dict). Both
        are empty when the input has no text or the reply is unusable.
    """
    # Nothing to analyse (None, "", or whitespace only): skip the model call.
    if not raw_text or not raw_text.strip():
        return {"findings": [], "metadata": {}}

    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """
    prompt_text = prompt.format(raw_text=raw_text)

    # Get response from Gemini model (in string format). Prefer invoke();
    # fall back to calling the object so plain callables keep working.
    invoke = getattr(model, "invoke", None)
    response_str = invoke(prompt_text) if callable(invoke) else model(prompt_text)

    # Clean and parse the response string into a JSON object
    response_json = clean_response(response_str)

    # The reply can be valid JSON without being an object (e.g. a bare list);
    # treat that as "no usable result" instead of failing on .get().
    if not isinstance(response_json, dict):
        return {"findings": [], "metadata": {}}

    # `or` also covers keys that are present but null in the reply.
    findings = response_json.get("findings") or []
    metadata = response_json.get("metadata") or {}

    return {"findings": findings, "metadata": metadata}

def process_pages(pages):
    """
    Run every page through the model and merge the per-page results.

    Args:
        pages: List of page texts, in document order.

    Returns:
        A dict with "findings" (all findings from all pages, in page order)
        and "metadata" (the first non-empty metadata dict returned for any
        page).
    """
    full_response = {"findings": [], "metadata": {}}

    # No pages means no work, so do not construct a model client at all.
    if not pages:
        return full_response

    model = initialize_model()

    # enumerate gives the true position; looking the page up by value would
    # report the first match when two pages have identical text.
    for page_number, page in enumerate(pages, start=1):
        print(f"Processing page {page_number}...")
        response = chunk_and_clean_text(model, page)

        # Print the response for debugging purposes
        print("Response:", response)  # Print the response to verify it's in the correct format

        if not isinstance(response, dict):
            print("Response is not in the expected format:", response)
            continue

        # Append this page's findings to the running list
        full_response["findings"].extend(response.get("findings", []))

        # Keep the first non-empty metadata; later pages do not override it
        if not full_response["metadata"]:
            full_response["metadata"] = response.get("metadata", {})

    return full_response

# Read a PDF with PyPDF2 and collect its text page by page
def extract_text_from_pdf(file_path):
    """
    Extract the text of every page in a PDF file.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``extract_text()`` returned for that page.
    """
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction must happen while the file is still open.
        return [pdf_page.extract_text() for pdf_page in reader.pages]


# Driver: extract the text of one PDF, run every page through the model,
# and print the merged result.

# Path to the PDF file to process (relative to the working directory)
file_path = "nutrition_research_papers/nutrients-11-01136.pdf"
# Extract the text of each page from the PDF using PyPDF2
print("Extracting text from PDF using PyPDF2...")
pages = extract_text_from_pdf(file_path)
# Number of pages that were extracted
print(len(pages))
# Send each page to the model and merge the per-page findings and metadata
print("Cleaning and chunking text from each page...")
final_response = process_pages(pages)

# Output the merged dict with its "findings" and "metadata" keys
print(final_response)
