In [None]:
import os
from google.cloud import documentai_v1 as documentai
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel

# Set your environment variables for authentication
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "preprocessing_credentials.json"

# Document AI: Extract Text from PDF
def extract_text_from_pdf(project_id, location, processor_id, file_path):
    client = documentai.DocumentProcessorServiceClient()

    # Read the file
    with open(file_path, "rb") as file:
        content = file.read()

    # Configure the request
    document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/us/processors/{processor_id}"
    request = {"name": name, "raw_document": document}

    # Process the document
    response = client.process_document(request=request)
    document_text = response.document.text

    return document_text

# Vertex AI: Semantic Chunking with Gemini
def perform_semantic_chunking_with_gemini(project_id, location, text_input):
    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=location)

    # Load Gemini model
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

    # Generate embeddings for semantic chunking
    embeddings = model.get_embeddings([text_input])

    # Dummy implementation for chunking (semantic embeddings clustering)
    # For demonstration: Split text into smaller chunks by semantic similarity.
    # Replace this with advanced clustering logic as per your use case.
    chunk_size = 500  # Characters per chunk
    chunks = [text_input[i:i+chunk_size] for i in range(0, len(text_input), chunk_size)]

    return chunks

# Main Workflow
def main():
    # GCP Configuration
    project_id = "athlyze-446917"
    location = "us-central1"
    processor_id = "a370be5d003f980f"
    file_path = "resistant_research_papers/2102.00836v2.pdf"

    # Step 1: Extract text using Document AI
    print("Extracting text from PDF using Document AI...")
    extracted_text = extract_text_from_pdf(project_id, location, processor_id, file_path)
    print(extracted_text)
    # Step 2: Perform semantic chunking with Gemini
    print("Performing semantic chunking with Vertex AI...")
    semantic_chunks = perform_semantic_chunking_with_gemini(project_id, location, extracted_text)

    # Step 3: Print the semantic chunks
    print("\nSemantic Chunks:")
    for i, chunk in enumerate(semantic_chunks, 1):
        print(f"Chunk {i}:\n{chunk}\n")

if __name__ == "__main__":
    main()


More refined approach:

In [36]:
import os
from google.cloud import documentai_v1 as documentai
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel
from vertexai.language_models import ChatModel
from langchain_google_genai import GoogleGenerativeAI

# Set your environment variables for authentication
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "preprocessing_credentials.json"

# Document AI: Extract Text from PDF
def extract_text_from_pdf(project_id, location, processor_id, file_path):
    client = documentai.DocumentProcessorServiceClient()

    # Read the file
    with open(file_path, "rb") as file:
        content = file.read()

    # Configure the request
    document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/us/processors/{processor_id}"
    request = {"name": name, "raw_document": document}

    # Process the document
    response = client.process_document(request=request)
    document_text = response.document.text

    return document_text

# Vertex AI: Semantic Chunking with Gemini (Refinement Step)
def perform_semantic_chunking_with_gemini(project_id, location, text_input):
    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=location)

    # Load Gemini model
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

    # Generate embeddings for semantic chunking
    embeddings = model.get_embeddings([text_input])
                
    # Refine chunking process based on more detailed analysis
    refined_chunks = refine_chunks_with_gemini(text_input)

    return refined_chunks

# Refining chunks with detailed prompt
def refine_chunks_with_gemini(text_input):
    prompt = f"""
    You are an AI model tasked with analyzing research papers on scientific nutrition and muscle building.
    1. Chunk the provided text into semantically meaningful sections.
    2. Group similar ideas together and ensure the chunks are coherent.
    3. Focus on extracting relevant information about:
        - Protein intake and muscle growth
        - Carbohydrate and fat roles
        - Optimal nutrient timing
        - Exercise strategies
        - Supplements for muscle building
        - Recovery techniques
        - Demographic-specific insights
    Ignore unrelated or generic content.
    Input Text: {text_input}
    Output Format:
    - Chunk 1: <Chunk text>
    - Chunk 2: <Chunk text>
    - ...
    """
    # Initialize GEMINI model
    model = GoogleGenerativeAI(
        model="gemini-1.0-pro", 
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1  # Adjust temperature for response variability
    )
    model = GoogleGenerativeAI(
        model="gemini-1.0-pro",  # Specify GEMINI model
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1  # Adjust temperature for response variability
    )

    # Generate the response using the model
    response = model.invoke(prompt)  # Call the model with the prompt
    

    return response

# GCP Configuration
project_id = "athlyze-446917"
location = "us-central1"
processor_id = "a370be5d003f980f"
file_path = "resistant_research_papers/2102.00836v2.pdf"

In [None]:
# Step 1: Extract text using Document AI
print("Extracting text from PDF using Document AI...")
extracted_text = extract_text_from_pdf(project_id, location, processor_id, file_path)
print(extracted_text)

In [None]:
model = GoogleGenerativeAI(
        model="gemini-1.0-pro", 
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1  # Adjust temperature for response variability
    )

help(model)

In [38]:
# Step 2: Perform semantic chunking with Gemini
print("Performing semantic chunking with Vertex AI...")
refined_chunks = perform_semantic_chunking_with_gemini(project_id, location, extracted_text)

# Step 3: Print the refined semantic chunks
print("\nRefined Semantic Chunks:", refined_chunks)

Performing semantic chunking with Vertex AI...

Refined Semantic Chunks: ['- Chunk 1: Muscles sense internally generated and externally applied forces, responding to these in a coordinated hier-', 'archical manner at different timescales.', '- Chunk 2: At the fastest timescales (tens or hundreds of milliseconds), skeletal muscles can produce near maximal force for', 'jumping or for the fight-or-flight response. Most muscles also go through cycles of shortening and lengthening with a period', 'of the order of a second in the vast majority of sprint or endurance exercise (running, climbing, etc.) At a much longer', 'timescale of many days, a muscle must also be able to measure changes in its overall use in order to effect adaptive mus-', 'cle hypertrophy/atrophy - ultimately helping to prevent injury on the scale of months and years.', '- Chunk 3: How the muscle cell keeps track of the history of its load and stress inputs within a number of intracellular output sig-', 'nals (which then 

Agentic Segmentation:

In [None]:
import os
from langchain_google_genai import GoogleGenerativeAI

def chunk_and_clean_text(raw_text):
    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Refer to this Input Text: {raw_text}
    Based on the provided text, please remove all the references, reference number such as "[1, 3]", peoples name, and any other irrelevant information. Then, chunk the text into meaningful segments. Each segment should be a coherent piece of information that can be understood independently. Ensure that the chunks are logically organized and provide a clear overview of the content.
    Return a response as python dictionary "response" containing: "findings" which should contain all the proven claims/ facts in this paper pertaining muscle training, nutrition, gym, biology etc, "metadata" which should contain the title of the paper.
    """

    model = GoogleGenerativeAI(
        model="gemini-1.0-pro",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1
    )

    response = model(prompt.format(raw_text=raw_text))

    print(response)



# Perform text cleaning and chunking
print("Cleaning and chunking text...")
refined_chunks = chunk_and_clean_text(extracted_text)

Cleaning and chunking text...
```python
response = {
    "findings": [
        "Muscles sense internally generated and externally applied forces, responding to these in a coordinated hier-archical manner at different timescales.",
        "At the fastest time scales (tens or hundreds of milliseconds), skeletal muscles can produce near maximal force for jumping or for the fight-or-flight response.",
        "Most muscles also go through cycles of shortening and lengthening with a period of the order of a second in the vast majority of sprint or endurance exercise (running, climbing, etc.)",
        "How the muscle cell keeps track of the history of its load and stress inputs within a number of intracellular output signals (which then go on to stimulate or inhibit muscle protein synthesis), is inherently an incredibly complex biochemical question.",
        "Cells sense and respond to the mechanical properties of their environment using two main classes of force receptors.",
        "The

In [42]:
print(extracted_text)

arXiv:2102.00836v2 [q-bio.TO] 5 May 2021
Why exercise builds muscles: Titin mechanosensing controls skeletal muscle growth under load
Neil Ibata and Eugene M. Terentjev*
(Dated: May 6, 2021)
Muscles sense internally generated and externally applied forces, responding to these in a coordinated hier-
archical manner at different time scales. The center of the basic unit of the muscle, the sarcomeric M-band, is
perfectly placed to sense the different types of load to which the muscle is subjected. In particular, the kinase
domain (TK) of titin located at the M-band is a known candidate for mechanical signaling. Here, we develop
the quantitative mathematical model that describes the kinetics of TK-based mechanosensitive signaling, and
predicts trophic changes in response to exercise and rehabilitation regimes. First, we build the kinetic model
for TK conformational changes under force: opening, phosphorylation, signaling and autoinhibition. We find
that TK opens as a metastable mechanosens

In [53]:
import os
from langchain_google_genai import GoogleGenerativeAI
from google.cloud import documentai_v1 as documentai
from google.cloud import aiplatform
import sys

# Set your environment variables for authentication
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "preprocessing_credentials.json"

# Initialize the AI model
def initialize_model():
    return GoogleGenerativeAI(
        model="gemini-1.0-pro",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1
    )

# Function to clean and chunk the text
def chunk_and_clean_text(model, raw_text):
    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Refer to this Input Text: {raw_text}
    Based on the provided text, please remove all the references, reference numbers such as "[1, 3]", people's names, and any other irrelevant information. 
    Then, chunk the text into meaningful segments. Each segment should be a coherent piece of information that can be understood independently.
    Ensure that the chunks are logically organized and provide a clear overview of the content.
    Return a response as a Python dictionary "response" containing:
    - "findings": which should contain all the proven claims/facts in this paper pertaining to muscle training, nutrition, gym, biology, etc.
    - "metadata": which should contain the title of the paper.
    """
    
    # Send the request to the model
    response = model(prompt.format(raw_text=raw_text))
    
    return response

def extract_text_from_pdf(project_id, location, processor_id, file_path):
    client = documentai.DocumentProcessorServiceClient()

    with open(file_path, "rb") as file:
        content = file.read()

    document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/us/processors/{processor_id}"
    request = {"name": name, "raw_document": document}

    response = client.process_document(request=request)
    document_text = response.document.text
    
    # Loop through all pages in the response
    pages = []
    for page in response.document.pages:
        page_text = ""
        # Loop through all blocks in the page
        for block in page.blocks:
            print(help(page.blocks))
            sys.exit()
            page_text += block.text + "\n"  # Add new line after each block
        pages.append(page_text.strip())  # Remove trailing newline
    sys.exit()
    return pages

# GCP Configuration
project_id = "athlyze-446917"
location = "us-central1"
processor_id = "a370be5d003f980f"
file_path = "resistant_research_papers/2102.00836v2.pdf"

# Step 1: Extract text using Document AI (split into pages)
print("Extracting text from PDF using Document AI...")
pages = extract_text_from_pdf(project_id, location, processor_id, file_path)

# Step 2: Process each page one by one and accumulate results
def process_pages(pages):
    model = initialize_model()
    full_response = {"findings": [], "metadata": {}}
    
    for page in pages:
        print(f"Processing page {pages.index(page) + 1}...")
        response = chunk_and_clean_text(model, page)
        
        # Append findings from the current page to the full response
        full_response["findings"].extend(response.get("findings", []))
        
        # Metadata can be taken from the first page or as provided
        if not full_response["metadata"]:
            full_response["metadata"] = response.get("metadata", {})
    
    return full_response

# Step 3: Perform text cleaning and chunking on the pages
print("Cleaning and chunking text from each page...")
final_response = process_pages(pages)

# Output the final response
print(final_response)


Extracting text from PDF using Document AI...
Help on RepeatedComposite in module proto.marshal.collections.repeated object:

class RepeatedComposite(Repeated)
 |  RepeatedComposite(sequence, *, marshal, proto_type=None)
 |  
 |  A view around a mutable sequence of messages in protocol buffers.
 |  
 |  This implements the full Python MutableSequence interface, but all methods
 |  modify the underlying field container directly.
 |  
 |  Method resolution order:
 |      RepeatedComposite
 |      Repeated
 |      collections.abc.MutableSequence
 |      collections.abc.Sequence
 |      collections.abc.Reversible
 |      collections.abc.Collection
 |      collections.abc.Sized
 |      collections.abc.Iterable
 |      collections.abc.Container
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __getitem__(self, key)
 |      Return the given item.
 |  
 |  __setitem__(self, key, value)
 |  
 |  insert(self, index: int, val

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [67]:
import os
import json
import re
import PyPDF2
from langchain_google_genai import GoogleGenerativeAI

def initialize_model():
    model = GoogleGenerativeAI(
        model="gemini-1.0-pro",
        google_api_key=os.getenv("GOOGLE_API_KEY"),
        temperature=0.1
    )
    return model

def clean_response(response_str):
    """
    Cleans the response string by removing the code block markers and then attempts to convert it to JSON.
    """
    # Remove the code block markers (start and end)
    response_str = re.sub(r'^```json\n', '', response_str)
    response_str = re.sub(r'```$', '', response_str).strip()
    
    # Print the cleaned response string for debugging purposes
    print("Cleaned Response String:", response_str)
    
    # Attempt to parse the cleaned string into a JSON object
    try:
        response_json = json.loads(response_str)
        return response_json
    except json.JSONDecodeError:
        print("Error parsing the response as JSON, response was:", response_str)
        return {"findings": [], "metadata": {}}

def chunk_and_clean_text(model, raw_text):
    prompt = """
    You are a highly capable AI model tasked with cleaning and chunking the provided text.
    Please return the response in JSON format with two keys:
    - "findings": A list of valid claims or facts related to muscle training, nutrition, gym, biology, etc.
    - "metadata": A dictionary containing the "title" key with the paper's title.
    Here is the input text: {raw_text}
    """

    # Get response from Gemini model (in string format)
    response_str = model(prompt.format(raw_text=raw_text))
    
    # Print the raw response string for debugging
    print("Raw Response:", response_str)
    
    # Clean and parse the response string into a JSON object
    response_json = clean_response(response_str)
    
    findings = response_json.get("findings", [])
    metadata = response_json.get("metadata", {})
    
    return {"findings": findings, "metadata": metadata}

def process_pages(pages):
    model = initialize_model()
    full_response = {"findings": [], "metadata": {}}
    
    for page in pages:
        print(f"Processing page {pages.index(page) + 1}...")
        response = chunk_and_clean_text(model, page)
        
        # Print the response for debugging purposes
        print("Response:", response)  # Print the response to verify it's in the correct format
        
        # Merge findings from the response
        if isinstance(response, dict):
            # Append findings to full_response['findings']
            full_response["findings"].extend(response.get("findings", []))
            
            # Merge metadata if it's not already set
            if not full_response["metadata"]:
                full_response["metadata"] = response.get("metadata", {})
        else:
            print("Response is not in the expected format:", response)
    
    return full_response

# Extract text from PDF using PyPDF2
def extract_text_from_pdf(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = []
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            pages.append(page_text)
    return pages


# Path to your PDF file
file_path = "resistant_research_papers/2102.00836v2.pdf"
# Extract text from the PDF using PyPDF2
print("Extracting text from PDF using PyPDF2...")
pages = extract_text_from_pdf(file_path)
print(len(pages))
# Perform chunking and cleaning
print("Cleaning and chunking text from each page...")
final_response = process_pages(pages)

# Output the final response
print(final_response)


Extracting text from PDF using PyPDF2...
14
Cleaning and chunking text from each page...
Processing page 1...
Raw Response: ```json
{
  "findings": [
    "Muscles sense internally generated and externally applied forces, responding to these in a coordinated hierarchical manner at different time scales.",
    "The center of the basic unit of the muscle, the sarcomeric M-band, is perfectly placed to sense the different types of load to which the muscle is subjected.",
    "The kinase domain (TK) of titin located at the M-band is a known candidate for mechanical signaling.",
    "TK opens as a metastable mechanosensitive switch, which naturally produces a much greater signal after high-load resistance exercise than an equally energetically costly endurance effort.",
    "The full model yields a steady-state solution (homeostasis) for muscle cross-sectional area and tension, and a quantitatively plausible hypertrophic response to training as well as atrophy following an extended reduction 

In [68]:
final_response

{'findings': ['Muscles sense internally generated and externally applied forces, responding to these in a coordinated hierarchical manner at different time scales.',
  'The center of the basic unit of the muscle, the sarcomeric M-band, is perfectly placed to sense the different types of load to which the muscle is subjected.',
  'The kinase domain (TK) of titin located at the M-band is a known candidate for mechanical signaling.',
  'TK opens as a metastable mechanosensitive switch, which naturally produces a much greater signal after high-load resistance exercise than an equally energetically costly endurance effort.',
  'The full model yields a steady-state solution (homeostasis) for muscle cross-sectional area and tension, and a quantitatively plausible hypertrophic response to training as well as atrophy following an extended reduction in tension.',
  'The overall muscle is characterised by its cross-section area (CSA), which contains a certain number ( Nc) of mucle ﬁbers (the musc