In [1]:
from google.cloud import aiplatform
import os

# Read runtime settings from environment variables. os.getenv returns None for
# any variable that is not set. Only project_id and location are used below in
# this cell.
gemini = os.getenv("GEMINI_API_KEY")
location = os.getenv("location")
location_processor = os.getenv("location_processor")
project_id = os.getenv("project_id")
# Set GOOGLE_APPLICATION_CREDENTIALS for this process to a relative key-file
# path; any value already present in the environment is overwritten.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'

# Initialise the aiplatform client library with the project and location read above.
aiplatform.init(project=project_id, location=location)

Agentic Segmentation:

Testing on 1 file:

In [None]:
import os
import json
import re
import PyPDF2
from langchain_google_genai import GoogleGenerativeAI

# Configuration: read the API key and project settings from environment
# variables. os.getenv returns None for any variable that is not set.

gemini = os.getenv("GEMINI_API_KEY")
location = os.getenv("location")
location_processor = os.getenv("location_processor")
project_id = os.getenv("project_id")
# Set GOOGLE_APPLICATION_CREDENTIALS for this process to a relative key-file
# path; any value already present in the environment is overwritten.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'


def initialize_model():
    """Build the Gemini LLM client used by this cell.

    Returns:
        A ``GoogleGenerativeAI`` instance for the "gemini-1.0-pro" model,
        configured with the module-level ``gemini`` API key and a
        temperature of 0.1.
    """
    settings = {
        "model": "gemini-1.0-pro",
        "google_api_key": gemini,
        "temperature": 0.1,
    }
    return GoogleGenerativeAI(**settings)

def clean_response(response_str):
    """
    Strip Markdown code-fence markers from a model reply and parse it as JSON.

    The reply is trimmed before the fences are matched, so fences surrounded
    by blank lines or spaces, fences without a language tag, and CRLF line
    endings are all handled.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The parsed JSON object (a dict). If the text is not valid JSON, or
        the top-level JSON value is not an object, the fallback
        ``{"findings": [], "metadata": {}}`` is returned so callers can
        always use ``.get`` on the result.
    """
    # Trim first: both fence patterns are anchored to the very start / end of the text.
    response_str = (response_str or "").strip()
    # Opening fence with an optional language tag (for example "json").
    response_str = re.sub(r'^```[A-Za-z0-9_-]*\s*', '', response_str)
    # Closing fence, together with the whitespace that precedes it.
    response_str = re.sub(r'\s*```$', '', response_str).strip()

    # Attempt to parse the cleaned string into a JSON object
    try:
        response_json = json.loads(response_str)
    except json.JSONDecodeError:
        print("Error parsing the response as JSON, response was:", response_str)
        return {"findings": [], "metadata": {}}

    # Valid JSON that is not an object (a bare list, string, number...) cannot
    # carry the expected keys, so it is handled like a parse failure.
    if not isinstance(response_json, dict):
        print("Error parsing the response as JSON, response was:", response_str)
        return {"findings": [], "metadata": {}}
    return response_json

def chunk_and_clean_text(model, raw_text):
    """Ask the model to extract findings and a title from one piece of text.

    Args:
        model: The LLM client. Its ``invoke`` method is used when present
            (calling a LangChain LLM object directly is deprecated); a plain
            callable that takes the prompt string is still accepted.
        raw_text: Text to analyse, e.g. the text of one PDF page. Empty,
            whitespace-only or ``None`` input is not sent to the model.

    Returns:
        dict with the keys "findings" (list) and "metadata" (dict). Missing
        keys in the model reply default to an empty list / empty dict.
    """
    # Nothing to analyse (e.g. a page without extractable text): skip the API call.
    if not raw_text or not raw_text.strip():
        return {"findings": [], "metadata": {}}

    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc, this claims should be unique and shall not be a simple fact, it should be worth getting from a "secietific" research paper.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """
    request = prompt.format(raw_text=raw_text)

    # Get response from Gemini model (in string format); prefer invoke() over
    # the deprecated direct call, but keep supporting plain callables.
    invoke = getattr(model, "invoke", None)
    response_str = invoke(request) if callable(invoke) else model(request)

    # Clean and parse the response string into a JSON object
    response_json = clean_response(response_str)
    # Valid JSON is not necessarily an object; only a dict has the keys read below.
    if not isinstance(response_json, dict):
        response_json = {}

    findings = response_json.get("findings", [])
    metadata = response_json.get("metadata", {})

    return {"findings": findings, "metadata": metadata}

def process_pages(pages, max_pages=None):
    """Run the extraction prompt over PDF pages and merge the results.

    Args:
        pages: Sequence of page texts, in document order.
        max_pages: Optional cap on how many leading pages are sent to the
            model (handy for a quick smoke test). ``None`` (the default)
            processes every page.

    Returns:
        dict with "findings" (the findings of all processed pages, in page
        order) and "metadata" (the first non-empty metadata dict returned
        for any page, or an empty dict).
    """
    model = initialize_model()
    full_response = {"findings": [], "metadata": {}}

    selected_pages = pages if max_pages is None else pages[:max_pages]

    # enumerate() yields the real page position; list.index() would report the
    # first page with equal text and rescan the list on every iteration.
    for page_number, page in enumerate(selected_pages, start=1):
        print(f"Processing page {page_number}...")
        response = chunk_and_clean_text(model, page)
        print(page)
        # Print the response for debugging purposes
        print("Response:", response)  # Print the response to verify it's in the correct format

        # Merge findings from the response
        if isinstance(response, dict):
            # Append findings to full_response['findings'] (a null value counts as none)
            full_response["findings"].extend(response.get("findings") or [])

            # Merge metadata if it's not already set
            if not full_response["metadata"]:
                full_response["metadata"] = response.get("metadata") or {}
        else:
            print("Response is not in the expected format:", response)

    return full_response

# Read a PDF with PyPDF2 and collect the text of each page
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page of the PDF at ``file_path``.

    The file is opened in binary mode and stays open while ``extract_text()``
    is called on each page; the results are returned in page order.
    """
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        texts = [sheet.extract_text() for sheet in reader.pages]
    return texts


# Path to your PDF file (relative, so it is resolved against the current working directory)
file_path = "resistant_research_papers/2102.00836v2.pdf"
# Extract text from the PDF using PyPDF2; `pages` holds one entry per page
print("Extracting text from PDF using PyPDF2...")
pages = extract_text_from_pdf(file_path)
# Show how many pages were read
print(len(pages))
# Perform chunking and cleaning
print("Cleaning and chunking text from each page...")
final_response = process_pages(pages)

# Output the final response (the merged dict returned by process_pages)
print(final_response)


Applying the above to all the files in the folder:

In [None]:
import os
import json
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAI
import PyPDF2

# Initialize the Gemini model
def initialize_model():
    """Create the Gemini LLM client used for findings extraction.

    The API key is read from the GEMINI_API_KEY environment variable.

    Returns:
        A ``GoogleGenerativeAI`` instance for "gemini-2.0-flash-exp" with a
        temperature of 0.2.

    Raises:
        RuntimeError: If the client cannot be constructed. The original
            exception is chained as ``__cause__``.
    """
    try:
        gemini = os.getenv("GEMINI_API_KEY")
        model = GoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            google_api_key=gemini,
            temperature=0.2
        )
    except Exception as e:
        print(f"Error initializing model: {e}")
        # Raise instead of calling exit(1): exit() is an interactive-shell
        # helper, and inside a notebook kernel it does not reliably abort the
        # running cell, which would let the caller continue without a model.
        raise RuntimeError("Could not initialize the Gemini model") from e
    print("Model initialized successfully.")
    return model

def clean_response(response_str):
    """Strip Markdown code fences from a model reply and parse it as a JSON object.

    The reply is trimmed before the fences are matched, so surrounding blank
    lines, fences without a language tag and CRLF line endings are handled.
    If the text does not end with "}", it is assumed to be cut off and a
    small set of closing sequences is tried in turn.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The parsed dict, or ``{"findings": []}`` when no attempt produces a
        JSON object.
    """
    # Trim first: both fence patterns are anchored to the very start / end of the text.
    text = (response_str or "").strip()
    # Remove code block markers if present (the language tag is optional)
    text = re.sub(r'^```[A-Za-z0-9_-]*\s*', '', text)
    text = re.sub(r'\s*```$', '', text).strip()

    candidates = [text]
    # Ensure response ends with proper JSON structure
    if not text.endswith("}"):
        print("Warning: Response appears to be incomplete. Attempting to fix...")
        # "}" repairs a reply cut right before the final brace; "]}" repairs one
        # cut inside the findings list after a complete element.
        candidates.extend(text + closer for closer in ("}", "]}"))

    first_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e
            continue
        if isinstance(parsed, dict):
            return parsed
        # Valid JSON, but not an object: callers need a dict with "findings".
        print(f"Error parsing response as JSON: {text[:500]}")  # Log first 500 characters
        print(f"Expected a JSON object, got {type(parsed).__name__}")
        return {"findings": []}

    print(f"Error parsing response as JSON: {text[:500]}")  # Log first 500 characters
    print(f"JSONDecodeError: {first_error}")
    return {"findings": []}


# Chunk text and send each chunk to the Gemini model
def chunk_and_clean_text(model, raw_text, title="Unknown Document"):
    """Split a document into overlapping chunks and collect the model's findings.

    Args:
        model: The LLM client. Its ``invoke`` method is used when present
            (calling a LangChain LLM object directly is deprecated); a plain
            callable that takes the prompt string is still accepted.
        raw_text: Full text of the document.
        title: Document name. Currently not used by the function; kept so
            existing callers that pass it continue to work.

    Returns:
        dict with one key, "findings": the findings of all chunks in chunk
        order. A chunk whose request or parsing fails is reported on stdout
        and contributes nothing.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,  # Adjust to Gemini's input limit
        chunk_overlap=200  # Overlap between chunks for context preservation
    )
    chunks = text_splitter.split_text(raw_text)
    combined_response = {"findings": []}

    # Prefer invoke() over the deprecated direct call; fall back to calling the object.
    invoke = getattr(model, "invoke", None)
    ask = invoke if callable(invoke) else model

    for idx, chunk in enumerate(chunks):
        print(f"Processing chunk {idx + 1}/{len(chunks)}...")
        prompt = f"""
        You are a highly capable AI assisting with extracting relevant information.
        - Extract valid claims or facts related to muscle training, nutrition, gym, biology, etc.
        - Return the response as a **valid JSON object** having one key:
          - "findings": List of claims/facts that should be not too simple and hold very relevant info.
        Here is the input text: {chunk}
        """
        try:
            response_str = ask(prompt)
            response = clean_response(response_str)
            # A null "findings" value counts as no findings.
            combined_response["findings"].extend(response.get("findings") or [])

        except Exception as e:
            # Best effort: one failing chunk must not discard the rest of the document.
            print(f"Error processing chunk {idx + 1}: {e}")

    return combined_response

# Read the text of every page of a PDF; any failure yields an empty list
def extract_text_from_pdf(file_path):
    """Return the extracted text of each page of the PDF at ``file_path``.

    Any exception raised while opening or reading the file is reported on
    stdout and an empty list is returned instead of partial results.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as handle:
            for sheet in PyPDF2.PdfReader(handle).pages:
                page_texts.append(sheet.extract_text())
    except Exception as err:
        print(f"Error extracting text from {file_path}: {err}")
        return []
    return page_texts

# Process all PDFs in the specified folder
def process_folder(folder_path):
    """Extract findings from every PDF directly inside ``folder_path``.

    Files are visited in sorted name order so repeated runs produce the
    findings in the same order. The ".pdf" extension is matched
    case-insensitively.

    Args:
        folder_path: Directory containing the PDF files.

    Returns:
        dict with one key, "findings": the findings of all files, in the
        order the files were processed. Files that yield no pages are
        skipped and listed in the final summary printed to stdout.
    """
    model = initialize_model()
    aggregated_results = {"findings": []}
    failed_files = []

    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(folder_path)):
        # Accept ".PDF" / ".Pdf" as well as ".pdf".
        if not file_name.lower().endswith(".pdf"):
            continue

        print(f"\nProcessing file: {file_name}")
        file_path = os.path.join(folder_path, file_name)
        pages = extract_text_from_pdf(file_path)

        if not pages:
            print(f"Skipping {file_name}: No readable pages found.")
            failed_files.append(file_name)
            continue

        # A page without extractable text may come back as None; join needs strings.
        full_text = "\n".join(page or "" for page in pages)
        file_response = chunk_and_clean_text(model, full_text, title=file_name)
        aggregated_results["findings"].extend(file_response.get("findings", []))
        print(f"Finished processing {file_name}.")
        print(f"Total findings so far: {len(aggregated_results['findings'])}")

    print("\nProcessing complete.")
    print(f"Failed to process {len(failed_files)} files: {failed_files}")
    return aggregated_results

# Main execution: run the extraction over one folder and write the findings to a JSON file
folder_path = "nutrition_research_papers"  # Folder containing PDF files (relative to the working directory)
print("Starting processing for folder:", folder_path)

# process_folder returns a dict whose "findings" list is read below
final_results = process_folder(folder_path)
print(final_results)
# Format for embedding-friendly output: one record per finding, with ids starting at 1
formatted_results = [
    {
        "id": idx + 1,
        "text": finding,
        "embedding": None,  # Placeholder; replace after embedding.
    }
    for idx, finding in enumerate(final_results["findings"])
]
print(formatted_results)
# Save the formatted results to JSON (opening with "w" overwrites an existing file of the same name)
output_file_path = "formatted_embeddings_input_nutrition.json"
with open(output_file_path, "w") as output_file:
    json.dump(formatted_results, output_file, indent=4)

# Final summary
print(f"\nResults saved to {output_file_path}.")
print(f"Total findings: {len(formatted_results)}")
