In [1]:
import os
import json

# Service-account key file for the Google Cloud clients used in the cells below.
# The path is relative, so it is resolved against the notebook's working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'

In [21]:
from google.cloud import documentai_v1 as documentai

def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    """Run a PDF through a Document AI processor and return the extracted text.

    Args:
        file_path: Path of the PDF file to read.
        project_id: Google Cloud project that owns the processor.
        location: Location segment of the processor resource name.
        processor_id: ID of the Document AI processor to call.

    Returns:
        The ``text`` field of the processed document.
    """
    client = documentai.DocumentProcessorServiceClient()
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    # Inline content carries no file name, so the MIME type has to be stated
    # explicitly for the processor to know it is looking at a PDF.
    raw_document = documentai.RawDocument(
        content=pdf_content,
        mime_type="application/pdf",
    )
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)
    return result.document.text


In [22]:
from google.cloud import language_v1

def filter_relevant_content(text, categories_to_keep):
    """Keep ``text`` only if the Natural Language API classifies it into a wanted category.

    Args:
        text: Plain text to classify.
        categories_to_keep: Strings matched as substrings against each returned
            category name.

    Returns:
        ``text`` (exactly once) when at least one returned category name contains
        one of ``categories_to_keep``; otherwise an empty string.
    """
    client = language_v1.LanguageServiceClient()

    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = client.classify_text(document=document)

    # The classification applies to the whole document, so the decision is a
    # single yes/no. Appending the text once per matching category (as before)
    # duplicated the content whenever several categories matched.
    is_relevant = any(
        wanted in category.name
        for category in response.categories
        for wanted in categories_to_keep
    )
    return text if is_relevant else ""


In [23]:
from langchain.chains import load_summarize_chain
from langchain.docstore.document import Document

def agentic_chunking(text, embedding_model):
    """Run a LangChain summarize chain over ``text`` and return the chain's output.

    ``embedding_model`` is forwarded unchanged as the only argument of
    ``load_summarize_chain``.
    NOTE(review): that argument is presumably expected to be an LLM object rather
    than an embedding model or a model name - confirm against the caller.
    """
    chain = load_summarize_chain(embedding_model)
    wrapped_text = Document(page_content=text)
    return chain.run([wrapped_text])


In [24]:
from vertexai.language_models import TextEmbedding

def generate_embeddings(text):
    """Embed ``text`` with the Vertex AI ``textembedding-gecko`` model.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector as a plain list of numbers, so the result can be
        written out with ``json.dump`` by the caller.
    """
    # ``TextEmbedding`` is the per-input result type; the class that can be
    # loaded with ``from_pretrained`` is ``TextEmbeddingModel``.
    from vertexai.language_models import TextEmbeddingModel

    embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
    # get_embeddings() takes a batch and returns one result per input text.
    return embedding_model.get_embeddings([text])[0].values


In [25]:
import json

def save_chunks_to_file(chunks, file_path):
    """Serialise ``chunks`` as JSON into ``file_path``, replacing any existing file."""
    with open(file_path, mode="w") as out_handle:
        json.dump(chunks, fp=out_handle)


In [26]:
def process_research_paper(file_path, project_id, categories_to_keep, output_file):
    """Extract, filter, chunk and embed one research paper, then save the result as JSON.

    Args:
        file_path: Path of the PDF to process.
        project_id: Google Cloud project passed to ``extract_text_from_pdf``.
        categories_to_keep: Category strings passed to ``filter_relevant_content``.
        output_file: Path of the JSON file to write.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(file_path, project_id)

    # Filter useful content
    filtered_text = filter_relevant_content(text, categories_to_keep)

    # NOTE(review): a model *name* is passed here and agentic_chunking hands it
    # straight to load_summarize_chain - presumably an LLM object is required; confirm.
    chunks = agentic_chunking(filtered_text, embedding_model="gemini-text-embedding")

    # A single string would otherwise be iterated character by character below,
    # producing one embedding request per character.
    if isinstance(chunks, str):
        chunks = [chunks]

    # Generate embeddings
    chunk_data = [{"chunk": chunk, "embedding": generate_embeddings(chunk)} for chunk in chunks]

    # Save chunks to file
    save_chunks_to_file(chunk_data, output_file)

    print(f"Processing complete. Chunks saved to {output_file}")


In [None]:
# Run the pipeline on one sample paper. Positional arguments, in order:
# PDF path, GCP project ID, categories to keep, output JSON path.
process_research_paper(
    "nutrition_research_papers/nutrients-11-01136.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)


In [None]:
import os
import json
from google.cloud import documentai_v1 as documentai
from langchain.chains import load_summarize_chain
from langchain.docstore.document import Document
from vertexai.language_models import TextEmbedding

# Set the environment variable for GCP credentials.
# The key-file path is relative, so it is resolved against the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'preprocessing_credentials.json'

def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    """Run a PDF through a Document AI processor and return the extracted text.

    Args:
        file_path: Path of the PDF file to read.
        project_id: Google Cloud project that owns the processor.
        location: Location segment of the processor resource name.
        processor_id: ID of the Document AI processor to call.

    Returns:
        The ``text`` field of the processed document.
    """
    client = documentai.DocumentProcessorServiceClient()
    processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    with open(file_path, "rb") as pdf_file:
        pdf_content = pdf_file.read()

    # Inline content carries no file name, so the MIME type has to be stated
    # explicitly for the processor to know it is looking at a PDF.
    raw_document = documentai.RawDocument(
        content=pdf_content,
        mime_type="application/pdf",
    )
    request = documentai.ProcessRequest(name=processor_name, raw_document=raw_document)

    result = client.process_document(request=request)
    return result.document.text

def split_document_into_chunks(file_path, max_pages=15):
    """Extract a PDF's text with PyPDF2, grouped into runs of at most ``max_pages`` pages.

    Args:
        file_path: Path of the PDF to read.
        max_pages: Maximum number of consecutive pages whose text is concatenated
            into one chunk. Must be at least 1.

    Returns:
        A list of strings, one per page group, in document order.

    Raises:
        ValueError: If ``max_pages`` is less than 1.
    """
    if max_pages < 1:
        raise ValueError(f"max_pages must be >= 1, got {max_pages}")

    from PyPDF2 import PdfReader

    pages = PdfReader(file_path).pages
    num_pages = len(pages)

    chunks = []
    for start_page in range(0, num_pages, max_pages):
        end_page = min(start_page + max_pages, num_pages)
        # A page can yield no text at all; treat that as an empty string instead
        # of letting ``str + None`` abort the whole document.
        chunks.append(
            "".join((pages[page_num].extract_text() or "") for page_num in range(start_page, end_page))
        )

    return chunks

def agentic_chunking(text, embedding_model):
    """Run a LangChain summarize chain over ``text`` and return the chain's output.

    ``embedding_model`` is forwarded unchanged as the only argument of
    ``load_summarize_chain``.
    NOTE(review): that argument is presumably expected to be an LLM object rather
    than an embedding model or a model name - confirm against the caller.
    """
    chain = load_summarize_chain(embedding_model)
    wrapped_text = Document(page_content=text)
    return chain.run([wrapped_text])

def generate_embeddings(text):
    """Embed ``text`` with the Vertex AI ``textembedding-gecko`` model.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector as a plain list of numbers, so the result can be
        written out with ``json.dump`` by the caller.
    """
    # ``TextEmbedding`` is the per-input result type; the class that can be
    # loaded with ``from_pretrained`` is ``TextEmbeddingModel``.
    from vertexai.language_models import TextEmbeddingModel

    embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko")
    # get_embeddings() takes a batch and returns one result per input text.
    return embedding_model.get_embeddings([text])[0].values

def save_chunks_to_file(chunks, file_path):
    """Write ``chunks`` (chunk text plus embedding records) to ``file_path`` as JSON."""
    with open(file_path, mode="w") as json_out:
        json.dump(chunks, fp=json_out)

def filter_relevant_content(text, categories_to_keep):
    """Keep only the lines of ``text`` that contain one of ``categories_to_keep``.

    Matching is a plain, case-sensitive substring test per line. The surviving
    lines are returned joined by newlines, in their original order.
    """
    kept_lines = []
    for line in text.splitlines():
        for keyword in categories_to_keep:
            if keyword in line:
                kept_lines.append(line)
                break
    return "\n".join(kept_lines)

def process_research_paper(file_path, project_id, categories_to_keep, output_file, max_pages=15):
    """Split, filter, chunk and embed one research paper, then save the result as JSON.

    Args:
        file_path: Path of the PDF to process.
        project_id: Accepted for interface compatibility; not used by this version.
        categories_to_keep: Keywords passed to ``filter_relevant_content``.
        output_file: Path of the JSON file to write.
        max_pages: Page-group size passed to ``split_document_into_chunks``.
    """
    # Split the document into smaller page groups if it's too large
    page_groups = split_document_into_chunks(file_path, max_pages)

    # Filter each group, dropping groups where nothing survived.
    filtered_groups = []
    for group_text in page_groups:
        filtered_text = filter_relevant_content(group_text, categories_to_keep)
        if filtered_text:
            filtered_groups.append(filtered_text)

    # Join with a newline: plain concatenation fused the last line of one group
    # with the first line of the next.
    combined_text = "\n".join(filtered_groups)

    # NOTE(review): a model *name* is passed here and agentic_chunking hands it
    # straight to load_summarize_chain - presumably an LLM object is required; confirm.
    chunked_text = agentic_chunking(combined_text, embedding_model="gemini-text-embedding")

    # A single string would otherwise be iterated character by character below,
    # producing one embedding request per character.
    if isinstance(chunked_text, str):
        chunked_text = [chunked_text]

    # Generate embeddings for each chunk
    chunk_data = [{"chunk": chunk, "embedding": generate_embeddings(chunk)} for chunk in chunked_text]

    # Save chunks and embeddings to a file
    save_chunks_to_file(chunk_data, output_file)

    print(f"Processing complete. Chunks saved to {output_file}")

# Example usage: run the pipeline on one sample paper. Positional arguments, in
# order: PDF path, GCP project ID, keywords to keep, output JSON path.
process_research_paper(
    "nutrition_research_papers/nutrients-11-01136.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)


In [None]:
import os
import json
from google.cloud import aiplatform
from google.cloud import documentai_v1 as documentai

# Set up the API credentials (ensure your Google Cloud credentials are set).
# The key-file path is relative, so it is resolved against the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'
# Sets the project and region used by the aiplatform calls further down in this cell.
aiplatform.init(project="athlyze-446917", location="us-central1")  # Replace with your project ID and region

# Function to extract text from PDF using Document AI
def extract_text_from_pdf(file_path, project_id, location="us", processor_id="a370be5d003f980f"):
    """Send a PDF to a Document AI processor and return the recognised text.

    Args:
        file_path: Path of the PDF file to read.
        project_id: Google Cloud project that owns the processor.
        location: Location segment of the processor resource name.
        processor_id: ID of the Document AI processor to call.

    Returns:
        The ``text`` field of the processed document.
    """
    docai_client = documentai.DocumentProcessorServiceClient()
    resource_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    with open(file_path, "rb") as handle:
        pdf_bytes = handle.read()

    # The PDF is sent inline, tagged with its MIME type.
    inline_pdf = documentai.RawDocument(content=pdf_bytes, mime_type="application/pdf")
    process_request = documentai.ProcessRequest(name=resource_name, raw_document=inline_pdf)

    return docai_client.process_document(request=process_request).document.text

# Function to filter relevant content
def filter_relevant_content(text, categories_to_keep):
    """Return ``text`` unchanged if it contains any category string, else an empty string.

    The check is a case-sensitive substring test against the whole text.
    """
    for keyword in categories_to_keep:
        if keyword in text:
            return text
    return ""

# Function to generate embeddings using Vertex AI
def generate_embeddings(text):
    """Request embeddings for ``text`` from a Vertex AI endpoint.

    Sends a single instance of the form ``{"content": text}`` and returns the
    response's ``predictions`` as-is.
    """
    # Replace with your actual model and endpoint ID for embeddings
    embedding_endpoint = aiplatform.Endpoint(
        "projects/athlyze-446917/locations/us-central1/endpoints/embedding-endpoint-id"
    )
    payload = [{"content": text}]
    return embedding_endpoint.predict(instances=payload).predictions

# Function to summarize text using Vertex AI
def summarize_text(text):
    """Request a summary of ``text`` from a Vertex AI endpoint.

    Sends a single instance of the form ``{"content": text}`` and returns the
    ``'summary'`` entry of the first prediction.
    """
    # Replace with your actual model and endpoint ID for summarization
    summary_endpoint = aiplatform.Endpoint(
        "projects/athlyze-446917/locations/us-central1/endpoints/text-bison-endpoint-id"
    )
    predictions = summary_endpoint.predict(instances=[{"content": text}]).predictions
    first_prediction = predictions[0]
    return first_prediction['summary']

# Function to save the chunks with embeddings to a file
def save_chunks_to_file(chunks, file_path):
    """Dump the chunk records (text plus embeddings) to ``file_path`` as JSON."""
    with open(file_path, mode="w") as destination:
        json.dump(chunks, fp=destination)

# Main function to process research paper and generate vector database
def process_research_paper(file_path, project_id, categories_to_keep, output_file):
    """Turn one research paper into a single summarised, embedded record saved as JSON.

    Steps, in order: extract the PDF text, filter it by category keywords,
    summarise the filtered text, embed the summary, and write a one-element list
    ``[{"chunk": summary, "embedding": embeddings}]`` to ``output_file``.
    """
    raw_text = extract_text_from_pdf(file_path, project_id)
    relevant_text = filter_relevant_content(raw_text, categories_to_keep)
    summary = summarize_text(relevant_text)

    # The whole paper ends up as one record.
    record = {"chunk": summary, "embedding": generate_embeddings(summary)}
    save_chunks_to_file([record], output_file)

    print(f"Processing complete. Chunks saved to {output_file}")

# Example usage: run the pipeline on one sample paper. Positional arguments, in
# order: PDF path, GCP project ID, category keywords, output JSON path.
process_research_paper(
    "resistant_research_papers/2102.00836v2.pdf", 
    "athlyze-446917", 
    ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"], 
    "test_research_chunks.json"
)

In [None]:
import os
import uuid
from typing import List, Dict, Union, Optional
from dotenv import load_dotenv
from rich import print
from PyPDF2 import PdfReader
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

# Load environment variables from a local .env file before anything reads them;
# AthlyzeChunker.__init__ looks up GOOGLE_API_KEY via os.getenv.
load_dotenv()

class AthlyzeChunker:
    """Clean and categorise research-paper text with Gemini and collect the results as chunks.

    ``chunks`` maps a short chunk ID to a dict with the keys ``category``,
    ``content``, ``metadata`` and ``chunk_index``.
    """

    def __init__(self):
        """Initialize the AthlyzeChunker with configuration."""
        self.chunks: Dict[str, Dict] = {}
        self.id_truncate_limit = 5
        self.categories = ["Health & Fitness", "Nutrition", "Sports Science", "Physiology", "Medical Sciences"]
        self.print_logging = True

        # Initialize the Google Gemini LLM
        self.llm = GoogleGenerativeAI(
            model="gemini-1.0-pro",
            google_api_key=os.getenv("GOOGLE_API_KEY"),
            temperature=0.1
        )

    @staticmethod
    def _response_text(response) -> str:
        """Normalise an LLM response to stripped text.

        Accepts a plain string, an object exposing ``.content``, or a dict with a
        ``'content'`` key, so the caller does not depend on which of these the
        configured model wrapper returns.
        """
        if isinstance(response, str):
            text = response
        elif isinstance(response, dict):
            text = response.get("content", "")
        else:
            text = getattr(response, "content", "")
        if not isinstance(text, str):
            text = str(text)
        return text.strip()

    def _new_chunk_id(self) -> str:
        """Return a truncated UUID that is not already a key of ``self.chunks``."""
        # Truncated IDs can collide; retry a bounded number of times so an
        # existing chunk is never overwritten, then fall back to a full UUID.
        for _ in range(100):
            candidate = str(uuid.uuid4())[:self.id_truncate_limit]
            if candidate not in self.chunks:
                return candidate
        return str(uuid.uuid4())

    def extract_text_from_pdf(self, pdf_file_path: str) -> str:
        """Extract raw text from the PDF.

        Returns the page texts joined by single spaces, or an empty string when
        the file cannot be read (the error is printed, not raised).
        """
        try:
            with open(pdf_file_path, 'rb') as file:
                reader = PdfReader(file)
                return " ".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"[ERROR] Failed to extract text from PDF: {e}")
            return ""

    def process_and_categorize(self, text: str) -> None:
        """Clean ``text`` with the LLM, then store one chunk per category.

        Errors are printed rather than raised. Blank input is skipped without
        calling the LLM.
        """
        if not text or not text.strip():
            print("[ERROR] No text to process; skipping categorization.")
            return

        try:
            # Template variables let the prompt layer do the substitution, so
            # braces inside the paper text need no manual escaping.
            prompt = ChatPromptTemplate.from_messages([
                ("system", "You are an assistant categorizing research into sections."),
                (
                    "user",
                    "Process the following text, clean it (e.g., remove references like [1], figures, and irrelevant details), and categorize it into: {categories}. Provide key points for each category."
                ),
                ("user", "Text: {text}")
            ])
            messages = prompt.format_prompt(
                categories=', '.join(self.categories),
                text=text
            ).to_messages()
            cleaned_text = self._response_text(self.llm.invoke(messages))
            if not cleaned_text:
                print("[ERROR] Text processing returned no content.")
                return
            if self.print_logging:
                print("[INFO] Text processed successfully.")
            self.add_chunks_by_category(cleaned_text)
        except Exception as e:
            print(f"[ERROR] Text processing failed: {e}")

    def add_chunks_by_category(self, processed_text: str) -> None:
        """Store one chunk for every category that yields non-empty text."""
        for category in self.categories:
            category_text = self.extract_category_text(processed_text, category)
            if category_text:
                chunk_id = self._new_chunk_id()
                self.chunks[chunk_id] = {
                    "category": category,
                    "content": category_text,
                    "metadata": self._generate_metadata(category_text),
                    "chunk_index": len(self.chunks)
                }
                if self.print_logging:
                    print(f"[INFO] Added chunk for category '{category}': {chunk_id}")

    def extract_category_text(self, text: str, category: str) -> str:
        """Ask the LLM for the details of ``text`` relevant to ``category``.

        Returns the stripped reply, or an empty string on any error (the error
        is printed).
        """
        try:
            prompt = ChatPromptTemplate.from_messages([
                ("system", "Extract key details relevant to {category}."),
                ("user", "Text: {text}")
            ])
            messages = prompt.format_prompt(category=category, text=text).to_messages()
            return self._response_text(self.llm.invoke(messages))
        except Exception as e:
            print(f"[ERROR] Failed to extract category text for {category}: {e}")
            return ""

    def _generate_metadata(self, content: str) -> Dict:
        """Generate metadata for a chunk.

        ``creation_time`` is an ISO-8601 UTC timestamp; it previously held a
        UUID string, which is not a time.
        """
        from datetime import datetime, timezone

        return {
            "content_type": "scientific_finding",
            "creation_time": datetime.now(timezone.utc).isoformat(),
            "source": "research_paper"
        }

    def save_chunks_to_file(self, file_path: str) -> None:
        """Write all chunks to ``file_path`` as a plain-text report.

        Each chunk is written as ID, category, content and metadata, followed by
        a ``---`` separator. Errors are printed, not raised.
        """
        try:
            with open(file_path, "w", encoding="utf-8") as file:
                for chunk_id, chunk in self.chunks.items():
                    file.write(f"Chunk ID: {chunk_id}\n")
                    file.write(f"Category: {chunk['category']}\n")
                    file.write(f"Content:\n{chunk['content']}\n")
                    file.write(f"Metadata: {chunk['metadata']}\n")
                    file.write("\n---\n\n")
            if self.print_logging:
                print(f"[INFO] Chunks saved to file: {file_path}")
        except Exception as e:
            print(f"[ERROR] Failed to save chunks: {e}")

# Main Function for Athlyze
def process_research_paper_for_athlyze(pdf_path: str, output_file: str) -> None:
    """Extract and categorize research data into sections for Athlyze.

    Args:
        pdf_path: Path of the PDF to process.
        output_file: Path of the text report to write.

    Errors are printed rather than raised. If no text can be extracted from the
    PDF, the function stops before categorising or writing anything.
    """
    try:
        chunker = AthlyzeChunker()
        raw_text = chunker.extract_text_from_pdf(pdf_path)
        # The extractor reports failure by returning an empty string; going on
        # would write an empty report and print a misleading success message.
        if not raw_text or not raw_text.strip():
            print(f"[ERROR] No text extracted from: {pdf_path}")
            return
        chunker.process_and_categorize(raw_text)
        chunker.save_chunks_to_file(output_file)
        print(f"[INFO] Processing completed. Results saved to: {output_file}")
    except Exception as e:
        print(f"[ERROR] Failed to process research paper: {e}")

# Example Usage
# Input PDF and the plain-text report the chunker writes.
pdf_path = "resistant_research_papers/2102.00836v2.pdf"  # Replace with your PDF path
output_file = "athlyze_chunks.txt"
process_research_paper_for_athlyze(pdf_path, output_file)


In [25]:
from typing import Dict, List, Optional
from dataclasses import dataclass
import pandas as pd
import numpy as np
import PyPDF2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import vertexai
from vertexai.language_models import TextGenerationModel 
from rich.console import Console
from rich.table import Table
import os
import json
from dotenv import load_dotenv


# Service-account key file for the Google Cloud clients; the path is relative,
# so it is resolved against the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= 'preprocessing_credentials.json'
# Load any additional settings from a local .env file.
load_dotenv()

# Marker that the imports and setup above completed.
print("Libraries loaded successfully")

@dataclass
class ProcessedChunk:
    """One analysed chunk of a paper, as built by ``AcademicPaperProcessor.analyze_chunk``."""
    # Core scientific content reported by the model; the raw chunk text when analysis fails.
    text: str
    # Practical applications reported by the model; empty string when analysis fails.
    use_case: str
    # Paper-level metadata dict passed into analyze_chunk (shared by all chunks of a paper).
    metadata: Dict
    # Section type reported by the model; "unknown" when analysis fails.
    section_type: str
    # Score reported by the model; 0.0 when analysis fails. process_pdf keeps chunks scoring above 0.6.
    relevance_score: float
    # Findings reported by the model; empty list when analysis fails.
    key_findings: List[str]
    # Referenced papers reported by the model; empty list when analysis fails.
    citations: List[str]
    # Method details reported by the model; None when analysis fails.
    methodology_details: Optional[Dict]

class SemanticChunker:
    """Merge neighbouring text chunks whose TF-IDF vectors are similar."""

    def __init__(self):
        """Create the TF-IDF vectorizer that is re-fitted for every comparison."""
        self.tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=1000,
            ngram_range=(1, 2)
        )

    def get_semantic_similarity(self, text1: str, text2: str) -> float:
        """Calculate the similarity between two text chunks.

        The vectorizer is fitted on just these two texts and the dot product of
        their TF-IDF rows is returned (cosine similarity under the vectorizer's
        default L2 normalisation). Returns 0.0 when no vocabulary can be built,
        e.g. when both texts are empty or contain only stop words.
        """
        try:
            tfidf_matrix = self.tfidf.fit_transform([text1, text2])
        except ValueError:
            # Nothing to compare on; treat the texts as unrelated.
            return 0.0
        # ``@`` and ``.toarray()`` work for both sparse matrices and sparse arrays;
        # the ``.A`` shortcut is not available on newer SciPy results.
        return float((tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1])

    def merge_similar_chunks(self, chunks: List[str], similarity_threshold: float = 0.3) -> List[str]:
        """Merge consecutive chunks that are semantically similar.

        Walks the list once, comparing the chunk accumulated so far with the next
        one. When the similarity exceeds ``similarity_threshold`` the next chunk
        is appended (newline-separated); otherwise the accumulated chunk is
        emitted and a new one starts. An empty input returns an empty list.
        """
        if not chunks:
            return []

        merged_chunks = []
        current_chunk = chunks[0]

        for next_chunk in chunks[1:]:
            similarity = self.get_semantic_similarity(current_chunk, next_chunk)
            if similarity > similarity_threshold:
                current_chunk = f"{current_chunk}\n{next_chunk}"
            else:
                merged_chunks.append(current_chunk)
                current_chunk = next_chunk

        merged_chunks.append(current_chunk)
        return merged_chunks

class AcademicPaperProcessor:
    """Chunk an academic PDF, analyse each chunk with a Vertex AI text model, and shape the results for indexing.

    Model replies are parsed strictly as data (see ``_parse_llm_dict``); they are
    never executed.
    """

    def __init__(
        self,
        project_id: str,
        location: str = "us-central1",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        model_name: str = "text-bison@001"
    ):
        """Initialise Vertex AI, the text model, the splitter and the chunk merger.

        Args:
            project_id: Google Cloud project used for Vertex AI.
            location: Vertex AI region.
            chunk_size: Target chunk size handed to the text splitter.
            chunk_overlap: Overlap between neighbouring chunks handed to the splitter.
            model_name: Publisher model to load. Kept configurable because the
                recorded run of this notebook was refused access to the default.
        """
        self.project_id = project_id
        vertexai.init(project=project_id, location=location)
        self.model = TextGenerationModel.from_pretrained(model_name)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n## ", "\n\n", "\n", ". ", " ", ""]
        )
        self.semantic_chunker = SemanticChunker()

    @staticmethod
    def _parse_llm_dict(raw: str) -> Dict:
        """Parse a model reply that should contain a dictionary.

        Accepts a Python dict literal or a JSON object, optionally wrapped in a
        Markdown code fence. The reply is parsed as data only.

        Raises:
            ValueError: If the reply is not a dict literal / JSON object.
        """
        import ast
        import json

        cleaned = (raw or "").strip()
        fenced = re.match(r"^```[a-zA-Z]*\s*\n(.*?)\n?```\s*$", cleaned, re.DOTALL)
        if fenced:
            cleaned = fenced.group(1).strip()

        try:
            parsed = ast.literal_eval(cleaned)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. uses null/true/false); try JSON.
            try:
                parsed = json.loads(cleaned)
            except ValueError as exc:
                raise ValueError("model reply is not a dict literal or JSON object") from exc

        if not isinstance(parsed, dict):
            raise ValueError("model reply did not evaluate to a dictionary")
        return parsed

    def extract_metadata(self, text: str) -> Dict:
        """Extract metadata from paper header/title section.

        Only the first 3000 characters of ``text`` are sent to the model. Returns
        the parsed dictionary, or a minimal placeholder dict when the reply
        cannot be parsed.
        """
        prompt = """You are an expert academic research parser. Extract comprehensive metadata from this academic paper text.
        Focus on accuracy and completeness.

        Required fields:
        1. Title (exact paper title)
        2. Authors (full list with affiliations if available)
        3. Publication details:
           - Year
           - Journal/Conference
           - DOI
           - Volume/Issue
        4. Keywords (if present)
        5. Research domain/field

        Format the response as a valid Python dictionary.

        Text to analyze:
        {text}
        """

        response = self.model.predict(
            prompt.format(text=text[:3000]),
            temperature=0.1,
            max_output_tokens=1024,
        )

        try:
            return self._parse_llm_dict(response.text)
        except ValueError:
            return {"title": "Unknown", "authors": [], "year": None}

    def analyze_chunk(self, chunk: str, metadata: Dict) -> "ProcessedChunk":
        """Analyze chunk content with enhanced agentic understanding.

        Returns a ``ProcessedChunk`` built from the model's reply. If the reply
        cannot be parsed or lacks a required key, a fallback chunk holding the
        raw text with ``relevance_score`` 0.0 is returned instead.
        """
        prompt = """You are an expert research analyst. Analyze this academic paper excerpt.

        Task: Extract and structure the following components:

        1. Core Scientific Content:
           - Main findings or theoretical concepts
           - Methodologies or approaches
           - Evidence supporting claims

        2. Practical Applications:
           - How this information can be applied

        3. Critical Analysis:
           - Scientific validity (0-1)
           - Practical applicability (0-1)
           - Identify limitations or constraints

        Format response as a Python dictionary with keys:
        {
            'text': 'core scientific content',
            'use_case': 'practical applications',
            'section_type': 'type of section',
            'relevance_score': float,
            'key_findings': [list of findings],
            'citations': [list of referenced papers],
            'methodology_details': {dict of methods}
        }

        Text to analyze:
        {chunk}
        """

        # str.format() would try to interpret the literal braces of the example
        # dictionary above and raise, so only the placeholder is substituted.
        response = self.model.predict(
            prompt.replace("{chunk}", chunk),
            temperature=0.1,
            max_output_tokens=2048,
        )

        try:
            result = self._parse_llm_dict(response.text)
            return ProcessedChunk(
                text=result['text'],
                use_case=result['use_case'],
                metadata=metadata,
                section_type=result['section_type'],
                # The model may return the score as a string; coerce so the
                # numeric comparison in process_pdf cannot fail.
                relevance_score=float(result['relevance_score']),
                key_findings=list(result.get('key_findings') or []),
                citations=list(result.get('citations') or []),
                methodology_details=result.get('methodology_details')
            )
        except Exception as e:
            # Deliberate best-effort: one bad reply must not abort the whole paper.
            print(f"Error processing chunk: {e}")
            return ProcessedChunk(
                text=chunk,
                use_case="",
                metadata=metadata,
                section_type="unknown",
                relevance_score=0.0,
                key_findings=[],
                citations=[],
                methodology_details=None
            )

    def agentic_chunk_text(self, text: str) -> List[str]:
        """Split ``text`` at Markdown-style headings, then chunk and merge each section.

        Each section is split by the text splitter and neighbouring pieces are
        merged by the semantic chunker. Sections that produce no pieces are skipped.
        """
        section_pattern = r'\n#{1,3}\s+[A-Z].*?\n'

        chunks = []
        for section in re.split(section_pattern, text):
            pieces = self.text_splitter.create_documents([section])
            chunk_texts = [piece.page_content for piece in pieces]
            # re.split can yield empty sections (e.g. text that starts or ends
            # with a heading); there is nothing to merge for those.
            if not chunk_texts:
                continue
            chunks.extend(self.semantic_chunker.merge_similar_chunks(chunk_texts))

        return chunks

    def process_pdf(self, pdf_path: str, min_relevance: float = 0.6) -> List["ProcessedChunk"]:
        """Process a PDF file with enhanced chunking.

        Args:
            pdf_path: Path of the PDF to read.
            min_relevance: Chunks are kept only when their relevance score is
                strictly greater than this value.

        Returns:
            The analysed chunks that passed the relevance filter, in document order.
        """
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)  # Pass the file object, not just the path
            # A page can yield no text; treat that as an empty string.
            text = "".join((page.extract_text() or "") for page in reader.pages)

        metadata = self.extract_metadata(text)
        chunks = self.agentic_chunk_text(text)

        processed_chunks = []
        for chunk in chunks:
            processed_chunk = self.analyze_chunk(chunk, metadata)
            if processed_chunk.relevance_score > min_relevance:
                processed_chunks.append(processed_chunk)

        return processed_chunks


    def vectorize_chunks(self, chunks: List["ProcessedChunk"]) -> List[Dict]:
        """Build one ``{"text", "metadata"}`` record per chunk.

        ``text`` is a formatted block combining the chunk's content, findings,
        applications, methodology and section type. ``metadata`` is the paper
        metadata extended with the chunk's own fields. No embedding is computed here.
        """
        vectorized_chunks = []
        for chunk in chunks:
            # Findings come from the model and are not guaranteed to be strings.
            findings = ' | '.join(str(finding) for finding in (chunk.key_findings or []))
            methodology = chunk.methodology_details or 'Not specified'
            combined_text = f"""
            Content: {chunk.text}

            Key Findings: {findings}

            Practical Applications: {chunk.use_case}

            Methodology: {methodology}

            Section Type: {chunk.section_type}
            """

            metadata = {
                **chunk.metadata,
                "section_type": chunk.section_type,
                "relevance_score": chunk.relevance_score,
                "use_case": chunk.use_case,
                "key_findings": chunk.key_findings,
                "citations": chunk.citations,
                "methodology_details": chunk.methodology_details
            }

            vectorized_chunks.append({
                "text": combined_text,
                "metadata": metadata
            })

        return vectorized_chunks

class ChunkInspector:
    """Console and CSV reporting helper around an ``AcademicPaperProcessor``."""

    def __init__(self, processor: AcademicPaperProcessor):
        """Keep the processor to inspect and create the console used for output."""
        self.processor = processor
        self.console = Console()

    def inspect_chunks(self, pdf_path: str, max_chunks: int = 5):
        """Process ``pdf_path`` and print a summary plus a table of the first ``max_chunks`` records."""
        vectors = self.processor.vectorize_chunks(self.processor.process_pdf(pdf_path))

        console = self.console
        console.print("\n[bold blue]Chunk Processing Summary:[/bold blue]")
        console.print(f"Total chunks extracted: {len(vectors)}")
        console.print("\n[bold blue]Sample Chunks:[/bold blue]")

        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Chunk #", style="dim")
        for heading in ("Text Preview", "Relevance Score", "Section Type", "Key Findings Count"):
            table.add_column(heading)

        # Rows are numbered from 1; the preview is the first 100 characters of the text.
        for position, record in enumerate(vectors[:max_chunks], start=1):
            meta = record['metadata']
            preview = record['text'][:100] + "..."
            score = f"{meta['relevance_score']:.2f}"
            findings_count = str(len(meta.get('key_findings', [])))
            table.add_row(str(position), preview, score, meta['section_type'], findings_count)

        console.print(table)

    def export_to_csv(self, vectors: List[Dict], output_path: str):
        """Write a per-record summary (preview, section type, score, findings count) to a CSV file."""
        rows = [
            {
                'text_preview': record['text'][:200],
                'section_type': record['metadata']['section_type'],
                'relevance_score': record['metadata']['relevance_score'],
                'key_findings_count': len(record['metadata'].get('key_findings', []))
            }
            for record in vectors
        ]

        pd.DataFrame(rows).to_csv(output_path, index=False)
        self.console.print(f"\n[green]Exported analysis to {output_path}[/green]")

def main():
    """Build the processor and inspector, then print a chunk report for the sample paper."""
    paper_processor = AcademicPaperProcessor(project_id="athlyze-446917", location="us-central1")
    chunk_inspector = ChunkInspector(paper_processor)

    sample_pdf = "resistant_research_papers/2102.00836v2.pdf"
    chunk_inspector.inspect_chunks(sample_pdf)

# Run the inspection only when this code is executed as the main module.
if __name__ == "__main__":
    main()


Libraries loaded successfully


FailedPrecondition: 400 Project `543798683069` is not allowed to use Publisher Model `projects/athlyze-446917/locations/us-central1/publishers/google/models/text-bison@001`

In [32]:
from google.cloud import aiplatform

# The init() call is wrapped in print(); the recorded output of this cell is just "None".
print(aiplatform.init(project="athlyze-446917", location="us-central1"))
# Print the resource name and display name of every model returned for the project.
# NOTE(review): presumably this covers only models registered in the project, not
# publisher models such as text-bison - confirm.
models = aiplatform.Model.list()
for model in models:
    print(model.resource_name, model.display_name)


None
