In [64]:
import os
import json

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'

In [21]:
from google.cloud import documentai_v1 as documentai

def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    # Initialize the Document AI client
    client = documentai.DocumentProcessorServiceClient()

    # Specify the processor name (replace with your actual processor ID)
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the PDF file
    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    # Create the raw document request
    raw_document = documentai.RawDocument(content=pdf_content)

    # Create the process request
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    # Process the document
    result = client.process_document(request=request)

    # Extract and return the text
    document = result.document
    text = document.text
    return text


In [22]:
from google.cloud import language_v1

def filter_relevant_content(text, categories_to_keep):
    client = language_v1.LanguageServiceClient()

    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = client.classify_text(document=document)

    filtered_text = []
    for category in response.categories:
        if any(cat in category.name for cat in categories_to_keep):
            filtered_text.append(text)

    return " ".join(filtered_text)


In [23]:
from langchain.chains import load_summarize_chain
from langchain.docstore.document import Document

def agentic_chunking(text, embedding_model):
    # Split text into smaller parts for processing
    doc = Document(page_content=text)

    # Use a summarization chain to group text into chunks
    summarize_chain = load_summarize_chain(embedding_model)
    output = summarize_chain.run([doc])

    return output


In [24]:
from vertexai.language_models import TextEmbedding

def generate_embeddings(text):
    embedding_model = TextEmbedding.from_pretrained("textembedding-gecko")
    return embedding_model.embed(text)


In [25]:
import json

def save_chunks_to_file(chunks, file_path):
    with open(file_path, "w") as f:
        json.dump(chunks, f)


In [26]:
def process_research_paper(file_path, project_id, categories_to_keep, output_file):
    # Extract text from PDF
    text = extract_text_from_pdf(file_path, project_id)

    # Filter useful content
    filtered_text = filter_relevant_content(text, categories_to_keep)

    # Perform agentic chunking
    chunks = agentic_chunking(filtered_text, embedding_model="gemini-text-embedding")

    # Generate embeddings
    chunk_data = [{"chunk": chunk, "embedding": generate_embeddings(chunk)} for chunk in chunks]

    # Save chunks to file
    save_chunks_to_file(chunk_data, output_file)

    print(f"Processing complete. Chunks saved to {output_file}")


In [27]:
process_research_paper(
    "nutrition_research_papers/nutrients-11-01136.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)


InvalidArgument: 400 Request contains an invalid argument. [field_violations {
  field: "raw_document.mime_type"
  description: "Unsupported mime type."
}
]

In [40]:
import os
import json
from google.cloud import documentai_v1 as documentai
from langchain.chains import load_summarize_chain
from langchain.docstore.document import Document
from vertexai.language_models import TextEmbedding

# Set the environment variable for GCP credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'preprocessing_credentials.json'

def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    # Initialize the Document AI client
    client = documentai.DocumentProcessorServiceClient()

    # Specify the processor name (replace with your actual processor ID)
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the PDF file
    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    # Create the raw document request
    raw_document = documentai.RawDocument(content=pdf_content)

    # Create the process request
    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    # Process the document
    result = client.process_document(request=request)

    # Extract and return the text
    document = result.document
    text = document.text
    return text

def split_document_into_chunks(file_path, max_pages=15):
    # This function splits the PDF into smaller chunks based on max_pages
    from PyPDF2 import PdfReader

    # Read the PDF document
    reader = PdfReader(file_path)
    num_pages = len(reader.pages)

    chunks = []
    for start_page in range(0, num_pages, max_pages):
        end_page = min(start_page + max_pages, num_pages)
        chunk = ""
        
        # Combine text from the pages in the current chunk
        for page_num in range(start_page, end_page):
            page = reader.pages[page_num]
            chunk += page.extract_text()
        
        chunks.append(chunk)
    
    return chunks

def agentic_chunking(text, embedding_model):
    # Split text into smaller parts for processing
    doc = Document(page_content=text)

    # Use a summarization chain to group text into chunks
    summarize_chain = load_summarize_chain(embedding_model)
    output = summarize_chain.run([doc])

    return output

def generate_embeddings(text):
    # Generate embeddings using the Gemini model
    embedding_model = TextEmbedding.from_pretrained("textembedding-gecko")
    return embedding_model.embed(text)

def save_chunks_to_file(chunks, file_path):
    # Save the chunks and embeddings to a JSON file
    with open(file_path, "w") as f:
        json.dump(chunks, f)

def filter_relevant_content(text, categories_to_keep):
    # Filter the text based on the categories provided
    # You can implement a more specific filtering method here if needed
    filtered_text = "\n".join([line for line in text.splitlines() if any(category in line for category in categories_to_keep)])
    return filtered_text

def process_research_paper(file_path, project_id, categories_to_keep, output_file, max_pages=15):
    # Split the document into smaller chunks if it's too large
    chunks = split_document_into_chunks(file_path, max_pages)
    
    # Combine extracted text from all chunks
    combined_text = ""
    for chunk in chunks:
        # Filter useful content based on categories
        filtered_text = filter_relevant_content(chunk, categories_to_keep)

        # Combine the filtered text
        combined_text += filtered_text

    # Perform agentic chunking
    chunked_text = agentic_chunking(combined_text, embedding_model="gemini-text-embedding")

    # Generate embeddings for each chunk
    chunk_data = [{"chunk": chunk, "embedding": generate_embeddings(chunk)} for chunk in chunked_text]

    # Save chunks and embeddings to a file
    save_chunks_to_file(chunk_data, output_file)

    print(f"Processing complete. Chunks saved to {output_file}")

# Example usage
process_research_paper(
    "nutrition_research_papers/nutrients-11-01136.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)


ImportError: cannot import name 'DocumentProcessorServiceClient' from 'google.cloud.aiplatform.gapic' (/opt/anaconda3/envs/athlyze/lib/python3.10/site-packages/google/cloud/aiplatform/gapic/__init__.py)

In [69]:
import os
import json
from google.cloud import aiplatform
from google.cloud import documentai_v1 as documentai

# Set up the API credentials (ensure your Google Cloud credentials are set)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'
aiplatform.init(project="athlyze-446917", location="us-central1")  # Replace with your project ID and region

# Function to extract text from PDF using Document AI
def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    client = documentai.DocumentProcessorServiceClient()
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    raw_document = documentai.RawDocument(
        content=pdf_content,
        mime_type="application/pdf"
    )

    request = documentai.ProcessRequest(
        name=processor_name, raw_document=raw_document
    )

    result = client.process_document(request=request)
    document = result.document
    return document.text

# Function to filter relevant content
def filter_relevant_content(text, categories_to_keep):
    """Filter the content based on categories (customize as needed)."""
    return text if any(cat in text for cat in categories_to_keep) else ""

# Function to generate embeddings using Vertex AI
def generate_embeddings(text):
    """Generate embeddings using Vertex AI."""
    # Replace with your actual model and endpoint ID for embeddings
    endpoint = aiplatform.Endpoint("projects/athlyze-446917/locations/us-central1/endpoints/embedding-endpoint-id")
    response = endpoint.predict(instances=[{"content": text}])
    return response.predictions

# Function to summarize text using Vertex AI
def summarize_text(text):
    """Summarize the text using Vertex AI."""
    # Replace with your actual model and endpoint ID for summarization
    endpoint = aiplatform.Endpoint("projects/athlyze-446917/locations/us-central1/endpoints/text-bison-endpoint-id")
    response = endpoint.predict(instances=[{"content": text}])
    return response.predictions[0]['summary']

# Function to save the chunks with embeddings to a file
def save_chunks_to_file(chunks, file_path):
    """Save chunked data with embeddings to a JSON file."""
    with open(file_path, "w") as f:
        json.dump(chunks, f)

# Main function to process research paper and generate vector database
def process_research_paper(file_path, project_id, categories_to_keep, output_file):
    """Process a research paper, clean, chunk, summarize, and generate embeddings."""
    # Extract text from the paper
    text = extract_text_from_pdf(file_path, project_id)

    # Filter and clean text based on categories
    filtered_text = filter_relevant_content(text, categories_to_keep)

    # Summarize the filtered text
    summarized_text = summarize_text(filtered_text)

    # Generate embeddings for the summarized text
    embeddings = generate_embeddings(summarized_text)

    # Create chunks with embeddings
    chunk_data = [{"chunk": summarized_text, "embedding": embeddings}]

    # Save chunks with embeddings to file
    save_chunks_to_file(chunk_data, output_file)

    print(f"Processing complete. Chunks saved to {output_file}")

# Example usage
process_research_paper(
    "resistant_research_papers/2102.00836v2.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)

NotFound: 404 Endpoint `projects/athlyze-446917/locations/us-central1/endpoints/text-bison-endpoint-id` not found.