In [5]:
import os
from google.cloud import documentai_v1 as documentai
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel

# Authentication: export the path of the credentials file through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this process.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "preprocessing_credentials.json"})

# Document AI: Extract Text from PDF
def extract_text_from_pdf(project_id, location, processor_id, file_path):
    """Run a local PDF through a Document AI processor and return its text.

    Args:
        project_id: GCP project ID used in the processor resource name.
        location: Accepted for interface compatibility but currently unused;
            the processor resource name is always built with the ``us``
            location. NOTE(review): the caller in this file passes its
            Vertex AI region here, so wiring this value into the resource
            name would change which processor is addressed -- confirm the
            processor's actual location before doing so.
        processor_id: ID of the Document AI processor to invoke.
        file_path: Path of the PDF file to read from local disk.

    Returns:
        The ``text`` field of the document returned by the processor.
    """
    client = documentai.DocumentProcessorServiceClient()

    # The PDF is sent inline, so the whole file is read into memory as bytes.
    with open(file_path, "rb") as pdf_file:
        content = pdf_file.read()

    # Configure the request
    document = documentai.RawDocument(content=content, mime_type="application/pdf")
    name = f"projects/{project_id}/locations/us/processors/{processor_id}"
    request = {"name": name, "raw_document": document}

    # Process the document
    response = client.process_document(request=request)
    return response.document.text

# Vertex AI: Semantic Chunking with Gemini
def perform_semantic_chunking_with_gemini(project_id, location, text_input, *, chunk_size=500):
    """Split ``text_input`` into consecutive fixed-width character chunks.

    This is still a placeholder for real semantic chunking: an embedding of
    the full text is requested from Vertex AI, but chunk boundaries are
    chosen purely by character count. Replace the slicing below with
    embedding-based clustering as required by the use case.

    Args:
        project_id: GCP project passed to ``aiplatform.init``.
        location: Region passed to ``aiplatform.init``.
        text_input: Text to split.
        chunk_size: Maximum number of characters per chunk. Defaults to 500.

    Returns:
        A list of substrings of ``text_input`` in original order, each at
        most ``chunk_size`` characters long. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    # A zero step makes range() raise and a negative step silently produces
    # no chunks at all, so reject both up front with a clear message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Nothing to chunk: skip Vertex AI initialisation and the embedding
    # request entirely rather than sending empty content to the service.
    if not text_input:
        return []

    # Initialize Vertex AI
    aiplatform.init(project=project_id, location=location)

    # Load the text-embedding model (this is an embedding model, not Gemini).
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko")

    # NOTE: the embedding is requested but not yet consumed by the chunking
    # below; it is kept as the hook for the future semantic implementation.
    _embeddings = model.get_embeddings([text_input])

    return [
        text_input[start:start + chunk_size]
        for start in range(0, len(text_input), chunk_size)
    ]

# Main Workflow
def main():
    """Extract the text of one PDF and print it as numbered chunks.

    Uses the hard-coded GCP settings below: the PDF is sent through
    ``extract_text_from_pdf``, the raw text is printed, the text is split by
    ``perform_semantic_chunking_with_gemini``, and every chunk is printed
    with a 1-based index.
    """
    # GCP Configuration
    gcp_project = "athlyze-446917"
    gcp_location = "us-central1"
    docai_processor = "a370be5d003f980f"
    pdf_path = "resistant_research_papers/2102.00836v2.pdf"

    # Step 1: Document AI text extraction (raw text is echoed for inspection)
    print("Extracting text from PDF using Document AI...")
    document_text = extract_text_from_pdf(
        gcp_project, gcp_location, docai_processor, pdf_path
    )
    print(document_text)

    # Step 2: split the extracted text into chunks
    print("Performing semantic chunking with Vertex AI...")
    chunk_list = perform_semantic_chunking_with_gemini(
        gcp_project, gcp_location, document_text
    )

    # Step 3: report each chunk, numbered from 1
    print("\nSemantic Chunks:")
    for number, piece in enumerate(chunk_list, start=1):
        print(f"Chunk {number}:\n{piece}\n")

# Run the workflow only when this module is executed as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


Extracting text from PDF using Document AI...
arXiv:2102.00836v2 [q-bio.TO] 5 May 2021
Why exercise builds muscles: Titin mechanosensing controls skeletal muscle growth under load
Neil Ibata and Eugene M. Terentjev*
(Dated: May 6, 2021)
Muscles sense internally generated and externally applied forces, responding to these in a coordinated hier-
archical manner at different time scales. The center of the basic unit of the muscle, the sarcomeric M-band, is
perfectly placed to sense the different types of load to which the muscle is subjected. In particular, the kinase
domain (TK) of titin located at the M-band is a known candidate for mechanical signaling. Here, we develop
the quantitative mathematical model that describes the kinetics of TK-based mechanosensitive signaling, and
predicts trophic changes in response to exercise and rehabilitation regimes. First, we build the kinetic model
for TK conformational changes under force: opening, phosphorylation, signaling and autoinhibition. We 