Agentic Segmentation:

In [None]:
import os
import json
import re
import PyPDF2
from langchain_google_genai import GoogleGenerativeAI

# Read the API key and Google Cloud settings from the environment.
# NOTE(review): `location`, `location_processor` and `project_id` are not
# referenced anywhere else in this cell - confirm whether they are still needed.

gemini = os.getenv("GEMINI_API_KEY")
location = os.getenv("location")
location_processor = os.getenv("location_processor")
project_id = os.getenv("project_id")
# Service-account key file; the path is relative to the current working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'


def initialize_model():
    """Build the Gemini client used to clean and chunk page text.

    Returns:
        A ``GoogleGenerativeAI`` instance for the ``gemini-1.0-pro`` model at
        temperature 0.1, authenticated with the module-level ``gemini`` key.
    """
    settings = {
        "model": "gemini-1.0-pro",
        "google_api_key": gemini,
        "temperature": 0.1,
    }
    return GoogleGenerativeAI(**settings)

def clean_response(response_str):
    """
    Parse the model's reply into a dict, tolerating Markdown code fences.

    The reply is expected to be a JSON object. It may be wrapped in a fenced
    code block (with or without a ``json`` language tag) and may be surrounded
    by whitespace or a short sentence of explanation.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The decoded JSON object as a dict. When the reply cannot be decoded,
        or decodes to something other than a JSON object, the empty result
        ``{"findings": [], "metadata": {}}`` is returned so callers can always
        use ``.get``.
    """
    empty_result = {"findings": [], "metadata": {}}
    text = "" if response_str is None else str(response_str)
    text = text.strip()

    # Remove an opening fence (optionally followed by a language tag) and a
    # closing fence. Stripping first makes the anchors work even when the
    # model pads its reply with whitespace.
    text = re.sub(r'^```(?:\w+[ \t]*\r?\n)?', '', text)
    text = re.sub(r'```$', '', text).strip()

    # Try the whole reply first; if that fails, try the outermost {...} span,
    # which covers replies that put a sentence before or after the JSON.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    print("Error parsing the response as JSON, response was:", text)
    return empty_result

def chunk_and_clean_text(model, raw_text):
    """Ask the model to extract findings and metadata from one piece of text.

    Args:
        model: LLM client. Its ``invoke`` method is used when it has one;
            otherwise the object is called directly with the prompt.
        raw_text: Text to analyse (the caller passes one PDF page at a time).

    Returns:
        dict with the keys ``"findings"`` (list) and ``"metadata"`` (dict).
        Both are empty when the text is blank or the reply is not a JSON
        object with the expected value types.
    """
    empty_result = {"findings": [], "metadata": {}}

    # Nothing can be extracted from a blank page, so do not spend a request on it.
    if not raw_text or not raw_text.strip():
        return empty_result

    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """
    request = prompt.format(raw_text=raw_text)

    # Prefer `invoke`: calling a LangChain LLM object directly is deprecated.
    # Plain callables are still supported for backward compatibility.
    invoke = getattr(model, "invoke", None)
    response_str = invoke(request) if callable(invoke) else model(request)

    # Clean and parse the response string into a JSON object
    response_json = clean_response(response_str)
    if not isinstance(response_json, dict):
        # A JSON array or scalar has no "findings"/"metadata" keys to read.
        return empty_result

    findings = response_json.get("findings", [])
    metadata = response_json.get("metadata", {})

    # Normalise the value types so callers can safely extend/merge them.
    if isinstance(findings, str):
        findings = [findings]
    elif not isinstance(findings, list):
        findings = []
    if not isinstance(metadata, dict):
        metadata = {}

    return {"findings": findings, "metadata": metadata}

def process_pages(pages):
    """Run every page through the model and merge the per-page results.

    Args:
        pages: Iterable of page texts, in document order.

    Returns:
        dict with ``"findings"`` (all findings from all pages, in page order)
        and ``"metadata"`` (the first non-empty metadata dict returned for
        any page).
    """
    model = initialize_model()
    full_response = {"findings": [], "metadata": {}}

    # enumerate() gives the real position. The previous pages.index(page)
    # rescanned the list for every page and returned the first match, so
    # pages with identical text (e.g. blank ones) were all logged with the
    # same number.
    for page_number, page in enumerate(pages, start=1):
        print(f"Processing page {page_number}...")
        response = chunk_and_clean_text(model, page)

        # Print the response for debugging purposes
        print("Response:", response)

        if not isinstance(response, dict):
            print("Response is not in the expected format:", response)
            continue

        full_response["findings"].extend(response.get("findings", []))

        # Keep the first metadata we get; later pages do not override it.
        if not full_response["metadata"]:
            full_response["metadata"] = response.get("metadata", {})

    return full_response

# Extract text from PDF using PyPDF2
def extract_text_from_pdf(file_path):
    """Return the extracted text of every page of the PDF at ``file_path``.

    The result is a list with one entry per page, in document order; each
    entry is whatever ``page.extract_text()`` produced for that page.
    """
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction must finish while the file is still open.
        texts = [pdf_page.extract_text() for pdf_page in reader.pages]
    return texts


# Single-file demo: extract one paper, send each page to the model, print the merged result.
# Path to your PDF file
file_path = "nutrition_research_papers/jmir-2023-1-e37667.pdf"
# Extract text from the PDF using PyPDF2 (one list entry per page)
print("Extracting text from PDF using PyPDF2...")
pages = extract_text_from_pdf(file_path)
# Number of pages that were extracted
print(len(pages))
# Perform chunking and cleaning; every page is sent to the model separately
print("Cleaning and chunking text from each page...")
final_response = process_pages(pages)

# Output the final response: a dict with the merged "findings" and "metadata"
print(final_response)


Applying the above to all the files in the folder:

In [17]:
import os
import json
import re
import PyPDF2
from langchain_google_genai import GoogleGenerativeAI

# Initialize the Gemini model
def initialize_model():
    """Create the Gemini client, terminating the run if that is impossible.

    The API key is read from the ``GEMINI_API_KEY`` environment variable.

    Returns:
        A ``GoogleGenerativeAI`` instance for ``gemini-2.0-flash-exp`` at
        temperature 0.2.

    Raises:
        SystemExit: with exit code 1 when the client cannot be constructed.
    """
    try:
        gemini = os.getenv("GEMINI_API_KEY")
        model = GoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            google_api_key=gemini,
            temperature=0.2
        )
    except Exception as e:
        print(f"Error initializing model: {e}")
        # Raise SystemExit directly: the bare `exit` name is a convenience
        # injected by the `site` module (and replaced by IPython in
        # notebooks), so it is not a dependable way to stop a program.
        raise SystemExit(1) from e

    print("Model initialized successfully.")
    return model

# Clean the response from the Gemini model
def clean_response(response_str):
    """Parse the model's reply into a dict, tolerating Markdown code fences.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The decoded JSON object as a dict, or the empty result
        ``{"findings": [], "metadata": {}}`` when the reply cannot be decoded
        or is not a JSON object.
    """
    empty_result = {"findings": [], "metadata": {}}
    text = "" if response_str is None else str(response_str)
    text = text.strip()

    # Strip an opening fence (with or without a language tag) and a closing
    # fence; stripping whitespace first keeps the anchors reliable.
    text = re.sub(r'^```(?:\w+[ \t]*\r?\n)?', '', text)
    text = re.sub(r'```$', '', text).strip()

    # Whole reply first, then the outermost {...} span for replies that wrap
    # the JSON in a sentence of explanation.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    print(f"Error parsing response as JSON: {text[:500]}")  # Log first 500 characters
    return empty_result

# Truncate the text if needed and send it to the Gemini model
def chunk_and_clean_text(model, raw_text, max_length=3000):
    """Ask the model to extract findings and metadata from one piece of text.

    Args:
        model: LLM client. Its ``invoke`` method is used when it has one;
            otherwise the object is called directly with the prompt.
        raw_text: Text to analyse (the caller passes one PDF page at a time).
        max_length: Maximum number of characters of ``raw_text`` sent to the
            model. Longer text is cut and a warning is printed. Pass ``None``
            to disable truncation.

    Returns:
        The parsed reply as a dict, normally with the keys ``"findings"`` and
        ``"metadata"``. The empty result ``{"findings": [], "metadata": {}}``
        is returned for blank text, for a failed request, and for a reply that
        is not a JSON object.
    """
    empty_result = {"findings": [], "metadata": {}}

    # Blank pages have nothing to extract; skipping them also saves quota.
    if not raw_text or not raw_text.strip():
        return empty_result

    truncated_text = raw_text
    if max_length is not None and len(raw_text) > max_length:
        # Make the data loss visible instead of dropping text silently.
        print(f"Warning: truncating input from {len(raw_text)} to {max_length} characters.")
        truncated_text = raw_text[:max_length]

    prompt = f"""
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {truncated_text}
    """
    try:
        # Prefer `invoke`: calling a LangChain LLM object directly is deprecated.
        invoke = getattr(model, "invoke", None)
        response_str = invoke(prompt) if callable(invoke) else model(prompt)
        parsed = clean_response(response_str)
    except Exception as e:
        print(f"Error processing text chunk: {e}")
        return empty_result

    # Callers use .get on the result, so never hand back a list or scalar.
    if not isinstance(parsed, dict):
        return empty_result
    return parsed

# Process each page of a PDF
def process_pages(model, pages):
    """Send each page to the model and merge the results for one document.

    Returns a dict with ``"findings"`` (findings of all pages, in page order)
    and ``"metadata"`` (the first non-empty metadata returned for any page).
    """
    all_findings = []
    document_metadata = {}

    page_number = 0
    for page_text in pages:
        page_number += 1
        print(f"Processing page {page_number}...")
        result = chunk_and_clean_text(model, page_text)

        page_findings = result.get("findings", [])
        all_findings.extend(page_findings)

        # Only the first non-empty metadata is kept.
        if not document_metadata:
            document_metadata = result.get("metadata", {})

        print(f"Page {page_number} findings: {len(page_findings)}")

    return {"findings": all_findings, "metadata": document_metadata}

# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return one text string per page, or ``[]`` if the file cannot be read.

    Any error while opening or parsing the file is printed and swallowed so
    that a single bad PDF does not stop a batch run.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            for pdf_page in reader.pages:
                page_texts.append(pdf_page.extract_text())
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
        return []
    return page_texts

# Process all PDFs in the specified folder
def process_folder(folder_path):
    """Extract findings from every PDF in ``folder_path``.

    Args:
        folder_path: Directory that contains the PDF files.

    Returns:
        dict with ``"findings"`` (findings of all files, concatenated) and
        ``"metadata"`` (one dict per processed file, always containing
        ``"file_name"`` plus whatever metadata the model returned).
    """
    model = initialize_model()
    aggregated_results = {"findings": [], "metadata": []}
    failed_files = []

    # Sort so runs are reproducible; os.listdir() order is arbitrary.
    for file_name in sorted(os.listdir(folder_path)):
        # Case-insensitive so "paper.PDF" is not silently skipped.
        if not file_name.lower().endswith(".pdf"):
            continue

        print(f"\nProcessing file: {file_name}")
        file_path = os.path.join(folder_path, file_name)
        pages = extract_text_from_pdf(file_path)

        if not pages:
            print(f"Skipping {file_name}: No readable pages found.")
            failed_files.append(file_name)
            continue

        file_response = process_pages(model, pages)
        aggregated_results["findings"].extend(file_response.get("findings", []))

        # The metadata comes from the model, so it may not be a dict and it
        # may contain its own "file_name"; the real file name must win.
        file_metadata = file_response.get("metadata", {})
        if not isinstance(file_metadata, dict):
            file_metadata = {}
        entry = {"file_name": file_name}
        entry.update(
            {key: value for key, value in file_metadata.items() if key != "file_name"}
        )
        aggregated_results["metadata"].append(entry)

        print(f"Finished processing {file_name}. Metadata: {file_response.get('metadata', {})}")
        print(f"Total findings so far: {len(aggregated_results['findings'])}")

    print("\nProcessing complete.")
    print(f"Failed to process {len(failed_files)} files: {failed_files}")
    return aggregated_results

# Main execution: process every PDF in the folder and write the combined result to disk
if __name__ == "__main__":
    folder_path = "nutrition_research_papers"  # Folder containing PDF files
    print("Starting processing for folder:", folder_path)

    # Dict with "findings" (flat list) and "metadata" (one entry per processed file)
    final_results = process_folder(folder_path)

    # Save the aggregated results to a JSON file (overwrites any existing file)
    output_file_path = "aggregated_results.json"
    with open(output_file_path, "w") as output_file:
        json.dump(final_results, output_file, indent=4)

    # Short summary of what was written
    print(f"\nResults saved to {output_file_path}.")
    print(f"Total findings: {len(final_results['findings'])}")
    print(f"Metadata collected for {len(final_results['metadata'])} files.")


Page 4 findings: 12
Processing page 5...


Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Page 5 findings: 15
Processing page 6...


Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


Page 6 findings: 14
Processing page 7...
Page 7 findings: 14
Processing page 8...
Page 8 findings: 15
Processing page 9...
Page 9 findings: 16
Processing page 10...
Page 10 findings: 11
Processing page 11...
Page 11 findings: 14
Processing page 12...
Page 12 findings: 25
Processing page 13...
Page 13 findings: 28
Processing page 14...
Page 14 findings: 17
Processing page 15...
Page 15 findings: 35
Processing page 16...
Page 16 findings: 40
Processing page 17...
Page 17 findings: 5
Processing page 18...
Page 18 findings: 17
Processing page 19...
Page 19 findings: 13
Processing page 20...
Page 20 findings: 13
Processing page 21...
Page 21 findings: 10
Processing page 22...
Page 22 findings: 9
Processing page 23...
Page 23 findings: 11
Processing page 24...
Page 24 findings: 8
Processing page 25...
Page 25 findings: 11
Processing page 26...
Page 26 findings: 12
Processing page 27...
Page 27 findings: 11
Processing page 28...
Page 28 findings: 8
Finished processing nutrients-13-01763.pdf. 

Generating embeddings for the findings of all files and building the data for the vector DB:

In [None]:
import os
import json
import re
import PyPDF2
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from google.cloud import aiplatform
from datetime import datetime
import numpy as np

# Load environment variables
load_dotenv()
# The other cells of this notebook read the Gemini key from GEMINI_API_KEY.
# GOOGLE_API_KEY keeps priority here, but fall back to GEMINI_API_KEY so that
# a single .env file works for every cell.
gemini_api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
aiplatform_project = os.getenv("PROJECT_ID")
aiplatform_location = os.getenv("LOCATION_ID")
# NOTE(review): processor_id is not used anywhere in this cell - confirm it is still needed.
processor_id = os.getenv("PROCESSOR_ID")
# Service-account key file; the path is relative to the current working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "preprocessing_credentials.json"

# Initialize Vertex AI
aiplatform.init(project=aiplatform_project, location=aiplatform_location)

# Function to initialize Gemini model
def initialize_model():
    """Create and return the ``gemini-1.0-pro`` client (temperature 0.1).

    Uses the module-level ``gemini_api_key`` and prints a message before and
    after the client is constructed.
    """
    print("Initializing Gemini model...")
    llm = GoogleGenerativeAI(model="gemini-1.0-pro", google_api_key=gemini_api_key, temperature=0.1)
    print("Gemini model initialized successfully.")
    return llm

# Embedding generation (the model handle is cached between calls)
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name="textembedding-gecko@003"):
    """Return the named Vertex AI embedding model, loading it on first use only."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = TextEmbeddingModel.from_pretrained(model_name)
    return _EMBEDDING_MODELS[model_name]


def generate_embeddings(text):
    """Return the embedding vector for ``text``, or ``None`` if unavailable.

    Args:
        text: The finding to embed. Findings come from model output, so
            non-string values are tolerated and skipped.

    Returns:
        The ``values`` of the first embedding returned by the model, or
        ``None`` when ``text`` is not a string, is blank, or the request fails.
    """
    # Checked up front: a non-string used to raise a second time inside the
    # except handler (text[:50]), which aborted the whole file.
    if not isinstance(text, str):
        print(f"Skipped embedding: expected text, got {type(text).__name__}.")
        return None
    if not text.strip():
        print(f"Skipped embedding: Text is empty or whitespace.")
        return None

    try:
        print(f"Generating embedding for text: {text[:50]}...")
        # Loaded once and reused instead of once per finding.
        model = _get_embedding_model()
        input_obj = TextEmbeddingInput(text=text)
        embeddings = model.get_embeddings([input_obj])
        print("Embedding generated successfully.")
        return embeddings[0].values
    except Exception as e:
        print(f"Error generating embedding for text: {text[:50]}... Error: {e}")
        return None

# Function to clean Gemini response
def clean_response(response_str):
    """Parse the model's reply into a dict, tolerating Markdown code fences.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The decoded JSON object as a dict, or the empty result
        ``{"findings": [], "metadata": {}}`` when the reply cannot be decoded
        or is not a JSON object.
    """
    print("Cleaning Gemini response...")
    empty_result = {"findings": [], "metadata": {}}
    text = "" if response_str is None else str(response_str)
    text = text.strip()

    # Strip an opening fence (with or without a language tag) and a closing
    # fence; stripping whitespace first keeps the anchors reliable.
    text = re.sub(r'^```(?:\w+[ \t]*\r?\n)?', '', text)
    text = re.sub(r'```$', '', text).strip()

    # Whole reply first, then the outermost {...} span for replies that wrap
    # the JSON in a sentence of explanation.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            response_json = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(response_json, dict):
            print("Response cleaned and converted to JSON.")
            return response_json

    print("Error parsing the response as JSON, response was:", text)
    return empty_result

# Function to process text with Gemini
def chunk_and_clean_text(model, raw_text):
    """Ask the model to extract findings and metadata from one piece of text.

    Args:
        model: LLM client. Its ``invoke`` method is used when it has one;
            otherwise the object is called directly with the prompt.
        raw_text: Text to analyse (the caller passes one PDF page at a time).

    Returns:
        dict with the keys ``"findings"`` (list) and ``"metadata"`` (dict).
        Both are empty when the text is blank or the reply is not a JSON
        object with the expected value types.
    """
    empty_result = {"findings": [], "metadata": {}}

    # Blank pages have nothing to extract; skip the request entirely.
    if not raw_text or not raw_text.strip():
        print("Skipping empty text chunk.")
        return empty_result

    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """

    print(f"Processing text chunk: {raw_text[:100]}...")
    request = prompt.format(raw_text=raw_text)

    # Prefer `invoke`: calling a LangChain LLM object directly is deprecated.
    invoke = getattr(model, "invoke", None)
    response_str = invoke(request) if callable(invoke) else model(request)

    response_json = clean_response(response_str)
    if not isinstance(response_json, dict):
        # A JSON array or scalar has no "findings"/"metadata" keys to read.
        return empty_result

    findings = response_json.get("findings", [])
    metadata = response_json.get("metadata", {})

    # The caller extends a list with the findings and calls .get on the
    # metadata, so normalise unexpected value types here.
    if isinstance(findings, str):
        findings = [findings]
    elif not isinstance(findings, list):
        findings = []
    if not isinstance(metadata, dict):
        metadata = {}

    print(f"Processed text with findings: {len(findings)} items.")
    return {"findings": findings, "metadata": metadata}

# Function to process pages of a PDF
def process_pages(model, pages):
    """Group the findings of every page under the title the model reported.

    Returns a dict mapping title -> list of findings. A page whose metadata
    has no ``"title"`` key is filed under ``"Untitled Page <n>"``, where
    ``<n>`` is the 1-based page number.
    """
    print("Starting to process PDF pages...")
    findings_by_title = {}

    for page_number, page_text in enumerate(pages, start=1):
        print(f"Processing page {page_number}...")
        result = chunk_and_clean_text(model, page_text)

        fallback_title = f"Untitled Page {page_number}"
        heading = result["metadata"].get("title", fallback_title)
        # Create the bucket on first sight of a title, then append to it.
        findings_by_title.setdefault(heading, []).extend(result["findings"])

    print("All pages processed.")
    return findings_by_title

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return a list with the extracted text of each page of the PDF.

    Errors from opening or parsing the file are not handled here; they
    propagate to the caller.
    """
    print(f"Extracting text from PDF: {file_path}...")
    pages = []
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for pdf_page in reader.pages:
            pages.append(pdf_page.extract_text())
    print(f"Extracted {len(pages)} pages from PDF.")
    return pages

# Function to process findings with embeddings
def process_findings_with_embeddings(findings_by_title):
    """Attach an embedding to every finding, keeping the grouping by title.

    Returns a dict mapping each title to a list of
    ``{"text": finding, "embedding": vector}`` entries. Findings for which no
    embedding could be produced are reported and left out.
    """
    print("Processing findings with embeddings...")
    result = {}

    for title, findings in findings_by_title.items():
        print(f"Processing title: {title} with {len(findings)} findings...")
        embeddings_list = []

        for finding in findings:
            vector = generate_embeddings(finding)
            if not vector:
                print(f"Embedding failed for finding: '{finding}' under title: '{title}'")
                continue
            embeddings_list.append({"text": finding, "embedding": vector})

        result[title] = embeddings_list
        print(f"Processed title: {title} with {len(embeddings_list)} embeddings.")

    return result

# Main pipeline function
def _load_progress(progress_path):
    """Return the progress saved at ``progress_path``, or a fresh structure."""
    fresh = {"processed": {}, "completed_files": []}
    if not os.path.exists(progress_path):
        return fresh
    try:
        with open(progress_path, "r") as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Could not read saved progress ({e}); starting from scratch.")
        return fresh
    if (
        not isinstance(saved, dict)
        or not isinstance(saved.get("processed"), dict)
        or not isinstance(saved.get("completed_files"), list)
    ):
        print("Saved progress has an unexpected shape; starting from scratch.")
        return fresh
    return saved


def process_folder(folder_path, progress_path="progress_with_embeddings.json"):
    """Extract and embed findings for every PDF in ``folder_path``.

    Progress is written to ``progress_path`` after each file and read back at
    start-up, so an interrupted run resumes where it stopped. Delete that file
    to force a full re-run.

    Args:
        folder_path: Directory that contains the PDF files.
        progress_path: JSON file used to persist and resume progress.

    Returns:
        dict with ``"processed"`` (title -> list of ``{"text", "embedding"}``
        entries) and ``"completed_files"`` (names of files fully processed).
    """
    model = initialize_model()
    # Previously this always started empty, so the "already completed" check
    # below could never be true and every run redid all files.
    total_data = _load_progress(progress_path)

    # Sorted for reproducible runs; os.listdir() order is arbitrary.
    for file_name in sorted(os.listdir(folder_path)):
        # Only PDFs can be parsed; skip anything else found in the folder.
        if not file_name.lower().endswith(".pdf"):
            continue
        if file_name in total_data["completed_files"]:
            print(f"Skipping already completed file: {file_name}")
            continue

        try:
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}...")
            pages = extract_text_from_pdf(file_path)
            findings_by_title = process_pages(model, pages)
            embeddings_data = process_findings_with_embeddings(findings_by_title)

            # Merge per title instead of dict.update(): two files that yield
            # the same title (e.g. "Untitled Page 1") must not overwrite
            # each other's findings.
            for title, items in embeddings_data.items():
                total_data["processed"].setdefault(title, []).extend(items)
            total_data["completed_files"].append(file_name)
            print(f"File {file_name} processed successfully.")

            # Save progress after each file
            with open(progress_path, "w") as f:
                json.dump(total_data, f, indent=4)
            print("Progress saved.")
        except Exception as e:
            # Best effort: one bad file must not stop the batch.
            print(f"Error processing file {file_name}: {e}")

    return total_data

# Entry point: run the extraction + embedding pipeline over the folder and time it.
if __name__ == "__main__":
    folder_path = "nutrition_research_papers"
    print("Starting pipeline...")
    start_time = datetime.now()
    # Dict with "processed" (title -> embedded findings) and "completed_files"
    result = process_folder(folder_path)
    print(f"Pipeline completed in {datetime.now() - start_time}.")
    # NOTE(review): this prints every embedding vector - the output can be very large.
    print("Final result:", result)


In [None]:
import os
import json
import re
import PyPDF2
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from google.cloud import aiplatform
from datetime import datetime

# Load environment variables (load_dotenv is called before the os.getenv reads below)
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")
aiplatform_project = os.getenv("PROJECT_ID")
aiplatform_location = os.getenv("LOCATION_ID")
# NOTE(review): processor_id is not used anywhere in this cell - confirm it is still needed.
processor_id = os.getenv("PROCESSOR_ID")
# Service-account key file; the path is relative to the current working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "preprocessing_credentials.json"

# Initialize Vertex AI with the project and location read above
aiplatform.init(project=aiplatform_project, location=aiplatform_location)

# Initialize Gemini model
def initialize_model():
    """Return a ``gemini-1.0-pro`` client configured with temperature 0.1.

    The client is authenticated with the module-level ``gemini_api_key``;
    a status message is printed before and after construction.
    """
    print("Initializing Gemini model...")
    client_options = {
        "model": "gemini-1.0-pro",
        "google_api_key": gemini_api_key,
        "temperature": 0.1,
    }
    client = GoogleGenerativeAI(**client_options)
    print("Gemini model initialized successfully.")
    return client

# Generate embeddings using Vertex AI's text embedding model
# (the model handle is cached between calls)
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name="textembedding-gecko@003"):
    """Return the named Vertex AI embedding model, loading it on first use only."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = TextEmbeddingModel.from_pretrained(model_name)
    return _EMBEDDING_MODELS[model_name]


def generate_embeddings(text):
    """Return the embedding vector for ``text``, or ``None`` if unavailable.

    Args:
        text: The finding to embed. Findings come from model output, so
            non-string values are tolerated and skipped.

    Returns:
        The ``values`` of the first embedding returned by the model, or
        ``None`` when ``text`` is not a string, is blank, or the request fails.
    """
    # Checked up front: a non-string used to raise a second time inside the
    # except handler (text[:50]), which aborted the whole file.
    if not isinstance(text, str):
        print(f"Skipped embedding: expected text, got {type(text).__name__}.")
        return None
    if not text.strip():
        print(f"Skipped embedding: Text is empty or whitespace.")
        return None

    try:
        print(f"Generating embedding for text: {text[:50]}...")
        # Loaded once and reused instead of once per finding.
        model = _get_embedding_model()
        input_obj = TextEmbeddingInput(text=text)
        embeddings = model.get_embeddings([input_obj])
        print("Embedding generated successfully.")
        return embeddings[0].values
    except Exception as e:
        print(f"Error generating embedding for text: {text[:50]}... Error: {e}")
        return None

# Clean the Gemini model's response
def clean_response(response_str):
    """Parse the model's reply into a dict, tolerating Markdown code fences.

    Args:
        response_str: Raw text returned by the model. ``None`` is treated as
            an empty reply.

    Returns:
        The decoded JSON object as a dict, or the empty result
        ``{"findings": [], "metadata": {}}`` when the reply cannot be decoded
        or is not a JSON object.
    """
    print("Cleaning Gemini response...")
    empty_result = {"findings": [], "metadata": {}}
    text = "" if response_str is None else str(response_str)
    text = text.strip()

    # Strip an opening fence (with or without a language tag) and a closing
    # fence; stripping whitespace first keeps the anchors reliable.
    text = re.sub(r'^```(?:\w+[ \t]*\r?\n)?', '', text)
    text = re.sub(r'```$', '', text).strip()

    # Whole reply first, then the outermost {...} span for replies that wrap
    # the JSON in a sentence of explanation.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            response_json = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(response_json, dict):
            print("Response cleaned and converted to JSON.")
            return response_json

    print("Error parsing the response as JSON, response was:", text)
    return empty_result

# Chunk and clean text from the raw input
def chunk_and_clean_text(model, raw_text):
    """Ask the model to extract findings and metadata from one piece of text.

    Args:
        model: LLM client. Its ``invoke`` method is used when it has one;
            otherwise the object is called directly with the prompt.
        raw_text: Text to analyse (the caller passes one PDF page at a time).

    Returns:
        dict with the keys ``"findings"`` (list) and ``"metadata"`` (dict).
        Both are empty when the text is blank or the reply is not a JSON
        object with the expected value types.
    """
    empty_result = {"findings": [], "metadata": {}}

    # Blank pages have nothing to extract; skip the request entirely.
    if not raw_text or not raw_text.strip():
        print("Skipping empty text chunk.")
        return empty_result

    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """

    print(f"Processing text chunk: {raw_text[:100]}...")
    request = prompt.format(raw_text=raw_text)

    # Prefer `invoke`: calling a LangChain LLM object directly is deprecated.
    invoke = getattr(model, "invoke", None)
    response_str = invoke(request) if callable(invoke) else model(request)

    response_json = clean_response(response_str)
    if not isinstance(response_json, dict):
        # A JSON array or scalar has no "findings"/"metadata" keys to read.
        return empty_result

    findings = response_json.get("findings", [])
    metadata = response_json.get("metadata", {})

    # The caller extends a list with the findings and calls .get on the
    # metadata, so normalise unexpected value types here.
    if isinstance(findings, str):
        findings = [findings]
    elif not isinstance(findings, list):
        findings = []
    if not isinstance(metadata, dict):
        metadata = {}

    print(f"Processed text with findings: {len(findings)} items.")
    return {"findings": findings, "metadata": metadata}

# Process all pages of a PDF
def process_pages(model, pages):
    """Collect the findings of every page, keyed by the reported paper title.

    Pages whose metadata carries no ``"title"`` are stored under
    ``"Untitled Page <n>"`` (1-based page number). Returns the resulting
    title -> findings dict.
    """
    print("Starting to process PDF pages...")
    grouped = {}

    page_number = 0
    for page_text in pages:
        page_number += 1
        print(f"Processing page {page_number}...")
        page_result = chunk_and_clean_text(model, page_text)

        default_title = f"Untitled Page {page_number}"
        page_title = page_result["metadata"].get("title", default_title)
        bucket = grouped.setdefault(page_title, [])
        bucket.extend(page_result["findings"])

    print("All pages processed.")
    return grouped

# Extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Read the PDF at ``file_path`` and return its text, one string per page.

    No exceptions are caught here: a missing or unreadable file raises to the
    caller.
    """
    print(f"Extracting text from PDF: {file_path}...")
    with open(file_path, "rb") as pdf_handle:
        document = PyPDF2.PdfReader(pdf_handle)
        # Materialise the text while the file is still open.
        pages = list(map(lambda pdf_page: pdf_page.extract_text(), document.pages))
    print(f"Extracted {len(pages)} pages from PDF.")
    return pages

# Process findings with embeddings
def process_findings_with_embeddings(findings_by_title):
    """Pair each finding with its embedding, preserving the grouping by title.

    For every title the output holds a list of
    ``{"text": finding, "embedding": vector}`` dicts. A finding without a
    usable embedding is logged and omitted.
    """
    print("Processing findings with embeddings...")
    output = {}

    for title, findings in findings_by_title.items():
        print(f"Processing title: {title} with {len(findings)} findings...")
        embeddings_list = []

        for finding in findings:
            vector = generate_embeddings(finding)
            if not vector:
                print(f"Embedding failed for finding: '{finding}' under title: '{title}'")
                continue
            record = {"text": finding, "embedding": vector}
            embeddings_list.append(record)

        output[title] = embeddings_list
        print(f"Processed title: {title} with {len(embeddings_list)} embeddings.")

    return output

# Main function to process all PDFs in a folder
def _load_progress(progress_path):
    """Return the progress saved at ``progress_path``, or a fresh structure."""
    fresh = {"processed": {}, "completed_files": []}
    if not os.path.exists(progress_path):
        return fresh
    try:
        with open(progress_path, "r") as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Could not read saved progress ({e}); starting from scratch.")
        return fresh
    if (
        not isinstance(saved, dict)
        or not isinstance(saved.get("processed"), dict)
        or not isinstance(saved.get("completed_files"), list)
    ):
        print("Saved progress has an unexpected shape; starting from scratch.")
        return fresh
    return saved


def process_folder(folder_path, progress_path="progress_with_embeddings.json"):
    """Extract and embed findings for every PDF in ``folder_path``.

    Progress is written to ``progress_path`` after each file and read back at
    start-up, so an interrupted run resumes where it stopped. Delete that file
    to force a full re-run.

    Args:
        folder_path: Directory that contains the PDF files.
        progress_path: JSON file used to persist and resume progress.

    Returns:
        dict with ``"processed"`` (title -> list of ``{"text", "embedding"}``
        entries) and ``"completed_files"`` (names of files fully processed).
    """
    model = initialize_model()
    # Previously this always started empty, so the "already completed" check
    # below could never be true and every run redid all files.
    total_data = _load_progress(progress_path)

    # Sorted for reproducible runs; os.listdir() order is arbitrary.
    for file_name in sorted(os.listdir(folder_path)):
        # Only PDFs can be parsed; skip anything else found in the folder.
        if not file_name.lower().endswith(".pdf"):
            continue
        if file_name in total_data["completed_files"]:
            print(f"Skipping already completed file: {file_name}")
            continue

        try:
            file_path = os.path.join(folder_path, file_name)
            print(f"Processing file: {file_name}...")
            pages = extract_text_from_pdf(file_path)
            findings_by_title = process_pages(model, pages)
            embeddings_data = process_findings_with_embeddings(findings_by_title)

            # Merge per title instead of dict.update(): two files that yield
            # the same title (e.g. "Untitled Page 1") must not overwrite
            # each other's findings.
            for title, items in embeddings_data.items():
                total_data["processed"].setdefault(title, []).extend(items)
            total_data["completed_files"].append(file_name)
            print(f"File {file_name} processed successfully.")

            # Save progress after each file
            with open(progress_path, "w") as f:
                json.dump(total_data, f, indent=4)
            print("Progress saved.")
        except Exception as e:
            # Best effort: one bad file must not stop the batch.
            print(f"Error processing file {file_name}: {e}")

    return total_data

# Entry point: run the extraction + embedding pipeline over the folder and time it.
if __name__ == "__main__":
    folder_path = "nutrition_research_papers"  # Replace with your folder path
    print("Starting pipeline...")
    start_time = datetime.now()
    # Dict with "processed" (title -> embedded findings) and "completed_files"
    result = process_folder(folder_path)
    print(f"Pipeline completed in {datetime.now() - start_time}.")
    # NOTE(review): this prints every embedding vector - the output can be very large.
    print("Final result:", result)
