In [1]:
!pip install langchain langchain-community tiktoken chromadb openai pypdf sentence-transformers --quiet


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.7/615.7 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m


In [5]:
!wget https://constitutioncenter.org/media/files/constitution.pdf

--2024-11-04 01:23:03--  https://constitutioncenter.org/media/files/constitution.pdf
Resolving constitutioncenter.org (constitutioncenter.org)... 104.22.22.181, 172.67.42.106, 104.22.23.181, ...
Connecting to constitutioncenter.org (constitutioncenter.org)|104.22.22.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 413949 (404K) [application/pdf]
Saving to: ‘constitution.pdf’


2024-11-04 01:23:03 (50.7 MB/s) - ‘constitution.pdf’ saved [413949/413949]



# Reading Legal Documents

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = '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'

# Step 1: Load the PDF and split it into manageable chunks
def load_and_split_pdf(pdf_file_path):
    loader = PyPDFLoader(pdf_file_path)
    documents = loader.load()

    # Split the document into smaller chunks for better processing
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    return texts

# Step 2: Store embeddings in ChromaDB
def store_embeddings_in_chromadb(documents):
    embeddings = OpenAIEmbeddings()  # You can also use SentenceTransformer embeddings if preferred
    vectorstore = Chroma.from_documents(documents, embeddings)
    return vectorstore

# Step 3: Setup a retriever and chain for querying and context retrieval
def create_retrieval_chain(vectorstore):
    llm = OpenAI(model="gpt-3.5-turbo-instruct",openai_api_key=os.environ["OPENAI_API_KEY"])
    retrieval_qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Optionally use "map_reduce" or others for more advanced chaining
        retriever=vectorstore.as_retriever()
    )
    return retrieval_qa

# Main Program
if __name__ == "__main__":
    # Sample PDF file path
    pdf_file_path = 'constitution.pdf'

    # Step 1: Load and split the PDF into chunks
    print("Loading and splitting PDF...")
    documents = load_and_split_pdf(pdf_file_path)

    # Step 2: Store document embeddings in ChromaDB
    print("Storing embeddings in ChromaDB...")
    vectorstore = store_embeddings_in_chromadb(documents)

    # Step 3: Create a retrieval-based chain with LLM
    retrieval_qa = create_retrieval_chain(vectorstore)

    # Step 4: User query input
    user_query = input("Enter your query: ")

    # Step 5: Retrieve context and generate answer using LLM
    answer = retrieval_qa.run(user_query)
    print(f"LLM Response: {answer}")


Loading and splitting PDF...
Storing embeddings in ChromaDB...


  embeddings = OpenAIEmbeddings()  # You can also use SentenceTransformer embeddings if preferred
  llm = OpenAI(model="gpt-3.5-turbo-instruct",openai_api_key=os.environ["OPENAI_API_KEY"])


Enter your query: WHat is the second ammendment?


  answer = retrieval_qa.run(user_query)


LLM Response:  The second amendment to the Constitution guarantees the right of citizens to keep and bear arms.


# Financial Reports Analyzer

In [9]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import torch
import re
from typing import List, Dict, Tuple, Optional, Set
from pathlib import Path
import logging
from datetime import datetime
from dataclasses import dataclass
from decimal import Decimal, InvalidOperation

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class FinancialMetric:
    """Data class to store financial metric information"""
    name: str
    value: float
    unit: str
    period: str
    context: str

class FinancialMetricsExtractor:
    """Class for extracting and calculating financial metrics"""

    # Define common financial metrics patterns
    METRIC_PATTERNS = {
        'revenue': r'(?:revenue|sales)(?:\s+of)?\s+\$?([\d,]+(?:\.\d+)?)\s*(?:billion|million|B|M)?',
        'profit_margin': r'(?:profit\s+margin|margin)\s+(?:of\s+)?(\d+(?:\.\d+)?)\%',
        'operating_margin': r'operating\s+margin\s+(?:of\s+)?(\d+(?:\.\d+)?)\%',
        'net_income': r'(?:net\s+income|profit)\s+(?:of\s+)?\$?([\d,]+(?:\.\d+)?)\s*(?:billion|million|B|M)?',
        'ebitda': r'(?:EBITDA|ebitda)\s+(?:of\s+)?\$?([\d,]+(?:\.\d+)?)\s*(?:billion|million|B|M)?',
        'eps': r'(?:EPS|earnings\s+per\s+share)\s+(?:of\s+)?\$?([\d,]+(?:\.\d+)?)',
        'pe_ratio': r'(?:P/E|price[/-]to[/-]earnings)\s+(?:ratio\s+)?(?:of\s+)?(\d+(?:\.\d+)?)',
        'debt_equity': r'debt[/-]to[/-]equity\s+(?:ratio\s+)?(?:of\s+)?(\d+(?:\.\d+)?)',
        'current_ratio': r'current\s+ratio\s+(?:of\s+)?(\d+(?:\.\d+)?)',
        'quick_ratio': r'quick\s+ratio\s+(?:of\s+)?(\d+(?:\.\d+)?)',
        'roe': r'(?:ROE|return\s+on\s+equity)\s+(?:of\s+)?(\d+(?:\.\d+)?)\%',
        'roa': r'(?:ROA|return\s+on\s+assets)\s+(?:of\s+)?(\d+(?:\.\d+)?)\%',
        'cash_flow': r'(?:cash\s+flow|operating\s+cash\s+flow)\s+(?:of\s+)?\$?([\d,]+(?:\.\d+)?)\s*(?:billion|million|B|M)?',
    }

    # Unit multipliers
    UNIT_MULTIPLIERS = {
        'billion': 1e9,
        'B': 1e9,
        'million': 1e6,
        'M': 1e6
    }

    @staticmethod
    def normalize_number(value: str, unit: str = '') -> float:
        """
        Normalize numerical values to a standard format

        Args:
            value (str): The numerical value as string
            unit (str): The unit of the value (billion, million, etc.)

        Returns:
            float: Normalized value
        """
        try:
            # Remove commas and convert to float
            clean_value = float(value.replace(',', ''))

            # Apply unit multiplier if present
            multiplier = FinancialMetricsExtractor.UNIT_MULTIPLIERS.get(unit, 1)
            return clean_value * multiplier
        except (ValueError, TypeError) as e:
            logger.error(f"Error normalizing number {value}: {str(e)}")
            return 0.0

    @staticmethod
    def extract_metrics(text: str) -> List[FinancialMetric]:
        """
        Extract financial metrics from text

        Args:
            text (str): Text to analyze

        Returns:
            List[FinancialMetric]: List of extracted financial metrics
        """
        metrics = []

        # Extract year or period information
        period_match = re.search(r'(?:fiscal\s+year|FY|Q[1-4])?\s*(?:20\d{2})', text)
        period = period_match.group(0) if period_match else "Unknown"

        for metric_name, pattern in FinancialMetricsExtractor.METRIC_PATTERNS.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)

            for match in matches:
                value_str = match.group(1)

                # Determine unit
                unit = ''
                for unit_name in FinancialMetricsExtractor.UNIT_MULTIPLIERS.keys():
                    if unit_name in text[match.start():match.end()]:
                        unit = unit_name
                        break

                # Normalize value
                value = FinancialMetricsExtractor.normalize_number(value_str, unit)

                # Get context (surrounding text)
                start_idx = max(0, match.start() - 50)
                end_idx = min(len(text), match.end() + 50)
                context = text[start_idx:end_idx].strip()

                metrics.append(FinancialMetric(
                    name=metric_name,
                    value=value,
                    unit=unit,
                    period=period,
                    context=context
                ))

        return metrics

class FinancialRatioCalculator:
    """Class for calculating financial ratios from extracted metrics"""

    @staticmethod
    def calculate_ratios(metrics: List[FinancialMetric]) -> Dict[str, float]:
        """
        Calculate financial ratios from extracted metrics

        Args:
            metrics (List[FinancialMetric]): List of extracted metrics

        Returns:
            Dict[str, float]: Calculated ratios
        """
        # Convert metrics to dictionary for easier access
        metric_dict = {metric.name: metric.value for metric in metrics}
        ratios = {}

        # Gross Margin
        if all(key in metric_dict for key in ['revenue', 'cost_of_goods']):
            ratios['gross_margin'] = (metric_dict['revenue'] - metric_dict['cost_of_goods']) / metric_dict['revenue']

        # Operating Margin
        if all(key in metric_dict for key in ['operating_income', 'revenue']):
            ratios['operating_margin'] = metric_dict['operating_income'] / metric_dict['revenue']

        # Return on Equity (ROE)
        if all(key in metric_dict for key in ['net_income', 'shareholders_equity']):
            ratios['roe'] = metric_dict['net_income'] / metric_dict['shareholders_equity']

        return ratios


class FinancialReportAnalyzer:
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        try:
            self.embedding_model = SentenceTransformer(model_name)
            self.document_chunks: List[str] = []
            self.embeddings: Optional[np.ndarray] = None
            self.metadata: Dict = {}
            self.metrics_extractor = FinancialMetricsExtractor()
            self.ratio_calculator = FinancialRatioCalculator()
            logger.info(f"Initialized FinancialReportAnalyzer with model: {model_name}")
        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise RuntimeError(f"Model initialization failed: {str(e)}")

    def preprocess_document(self, text: str, chunk_size: int = 500) -> List[str]:
        """
        Preprocess and chunk the financial report text.

        Args:
            text (str): Raw text from financial report
            chunk_size (int): Maximum size of each text chunk

        Returns:
            List[str]: List of preprocessed text chunks

        Raises:
            ValueError: If text is empty or chunk_size is invalid
        """
        if not text or not isinstance(text, str):
            raise ValueError("Input text must be a non-empty string")
        if chunk_size <= 0:
            raise ValueError("Chunk size must be positive")

        # Remove extra whitespace and normalize text
        text = re.sub(r'\s+', ' ', text)

        # Preserve important financial characters and formatting
        text = re.sub(r'[^\w\s.,;:()$%+-]', '', text)

        # Improved sentence splitting that preserves decimal numbers and currency
        sentences = re.split(r'(?<!\d)\.(?!\d)(?!\s*[a-z])', text)
        chunks = []
        current_chunk = ""

        # Enhanced chunking logic that preserves context and financial information
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # Check if adding this sentence would exceed chunk size
            if len(current_chunk) + len(sentence) < chunk_size:
                current_chunk += sentence + ". "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        if current_chunk:
            chunks.append(current_chunk.strip())

        # Validate chunks
        if not chunks:
            raise ValueError("No valid chunks were generated from the input text")

        logger.info(f"Created {len(chunks)} chunks from document")
        return chunks

    def create_embeddings(self, chunks: List[str]) -> None:
        """
        Create embeddings for all document chunks.

        Args:
            chunks (List[str]): List of text chunks to embed

        Raises:
            ValueError: If chunks list is empty
        """
        if not chunks:
            raise ValueError("Cannot create embeddings from empty chunks list")

        try:
            self.document_chunks = chunks
            self.embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
            logger.info(f"Created embeddings of shape: {self.embeddings.shape}")
        except Exception as e:
            logger.error(f"Failed to create embeddings: {str(e)}")
            raise

    def retrieve_relevant_chunks(self, query: str, n_chunks: int = 3,
                               similarity_threshold: float = 0.3) -> List[Tuple[str, float]]:
        """
        Retrieve the most relevant chunks for a given query.

        Args:
            query (str): User query about financial information
            n_chunks (int): Number of relevant chunks to retrieve
            similarity_threshold (float): Minimum similarity score to include chunk

        Returns:
            List[Tuple[str, float]]: List of (chunk, similarity_score) tuples
        """
        if not query.strip():
            raise ValueError("Query cannot be empty")
        if self.embeddings is None:
            raise ValueError("No embeddings available. Please load a document first.")

        try:
            query_embedding = self.embedding_model.encode([query])[0]
            similarities = cosine_similarity([query_embedding], self.embeddings)[0]

            # Filter by similarity threshold and get top chunks
            valid_indices = np.where(similarities >= similarity_threshold)[0]
            top_indices = valid_indices[np.argsort(similarities[valid_indices])[-n_chunks:][::-1]]

            results = [(self.document_chunks[i], similarities[i]) for i in top_indices]
            logger.info(f"Retrieved {len(results)} relevant chunks for query")
            return results
        except Exception as e:
            logger.error(f"Error retrieving chunks: {str(e)}")
            raise

    def analyze_query(self, query: str) -> Dict:
        """
        Analyze a specific query about the financial report.

        Args:
            query (str): User query about financial information

        Returns:
            Dict: Analysis results including relevant chunks, metrics, and ratios
        """
        if not query.strip():
            raise ValueError("Query cannot be empty")

        try:
            relevant_chunks = self.retrieve_relevant_chunks(query)

            # Extract metrics from relevant chunks
            all_metrics = []
            for chunk, _ in relevant_chunks:
                metrics = FinancialMetricsExtractor.extract_metrics(chunk)
                all_metrics.extend(metrics)

            # Calculate relevant ratios
            ratios = FinancialRatioCalculator.calculate_ratios(all_metrics)

            # Organize metrics by type
            metrics_by_type = {}
            for metric in all_metrics:
                if metric.name not in metrics_by_type:
                    metrics_by_type[metric.name] = []
                metrics_by_type[metric.name].append({
                    'value': metric.value,
                    'unit': metric.unit,
                    'period': metric.period,
                    'context': metric.context
                })

            analysis_result = {
                "query": query,
                "relevant_sections": [
                    {
                        "text": chunk,
                        "relevance_score": float(score),
                    }
                    for chunk, score in relevant_chunks
                ],
                "extracted_metrics": metrics_by_type,
                "calculated_ratios": ratios,
                "metadata": self.metadata
            }

            return analysis_result
        except Exception as e:
            logger.error(f"Error analyzing query: {str(e)}")
            raise



def main():
    """Example usage with financial metrics extraction"""
    try:
        analyzer = FinancialReportAnalyzer()

        sample_report = """
        In fiscal year 2023, our company achieved strong financial performance with revenue growth of 15% year-over-year,
        reaching $2.5 billion. Our operating margin improved to 28%, up from 25% in the previous year.
        The company's cash flow from operations was $500 million, representing a 20% increase from 2022.

        Our EBITDA reached $750 million with a margin of 30%. The P/E ratio stands at 22.5, reflecting strong market confidence.
        Net income increased to $400 million, resulting in earnings per share (EPS) of $2.45.

        Return on equity (ROE) improved to 18.5% while return on assets (ROA) reached 12.3%.
        Our debt-to-equity ratio decreased to 0.8, showing improved financial health.
        """

        chunks = analyzer.preprocess_document(sample_report)
        analyzer.create_embeddings(chunks)

        # Example queries focusing on financial metrics
        queries = [
            "What are the company's profitability metrics?",
            "What are the key financial ratios?",
            "How is the company's operational performance?"
        ]

        for query in queries:
            result = analyzer.analyze_query(query)
            print(f"\nQuery: {result['query']}")
            print("\nExtracted Metrics:")
            for metric_type, metrics in result['extracted_metrics'].items():
                print(f"\n{metric_type.upper()}:")
                for metric in metrics:
                    print(f"- Value: {metric['value']}")
                    print(f"  Period: {metric['period']}")
                    print(f"  Context: {metric['context']}")

            if result['calculated_ratios']:
                print("\nCalculated Ratios:")
                for ratio_name, value in result['calculated_ratios'].items():
                    print(f"- {ratio_name}: {value:.2f}")

    except Exception as e:
        logger.error(f"Error in main: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: What are the company's profitability metrics?

Extracted Metrics:

PROFIT_MARGIN:
- Value: 30.0
  Period: fiscal year 2023
  Context: from 2022. Our EBITDA reached $750 million with a margin of 30%. The PE ratio stands at 22.5, reflecting strong m

Query: What are the key financial ratios?

Extracted Metrics:

PROFIT_MARGIN:
- Value: 30.0
  Period: fiscal year 2023
  Context: from 2022. Our EBITDA reached $750 million with a margin of 30%. The PE ratio stands at 22.5, reflecting strong m

Query: How is the company's operational performance?

Extracted Metrics:

PROFIT_MARGIN:
- Value: 30.0
  Period: fiscal year 2023
  Context: from 2022. Our EBITDA reached $750 million with a margin of 30%. The PE ratio stands at 22.5, reflecting strong m


# Scientific Document Summary

In [17]:
from typing import Dict, List, Optional, Tuple
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import re
from dataclasses import dataclass
import torch
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class RelevantPassage:
    section: str
    text: str
    similarity: float

class ResearchPaperRAG:
    def __init__(self,
                 embedding_model: str = 'all-MiniLM-L6-v2',
                 summarizer_model: str = "facebook/bart-large-cnn"):
        """Initialize the RAG system with specified models."""
        try:
            # Initialize models without explicit device setting first
            self.embedding_model = SentenceTransformer(embedding_model)

            # Initialize summarizer pipeline
            self.summarizer = pipeline(
                "summarization",
                model=summarizer_model
            )

            logger.info("Models initialized successfully")

        except Exception as e:
            logger.error(f"Model initialization failed: {str(e)}")
            raise RuntimeError(f"Error initializing models: {str(e)}")

    def get_embeddings(self, text_chunks: List[str]) -> np.ndarray:
        """Generate embeddings for text chunks."""
        if not text_chunks:
            raise ValueError("Text chunks list cannot be empty")

        try:
            return self.embedding_model.encode(
                text_chunks,
                show_progress_bar=False,
                convert_to_numpy=True
            )
        except Exception as e:
            logger.error(f"Embedding generation failed: {str(e)}")
            raise RuntimeError(f"Error generating embeddings: {str(e)}")

    def preprocess_paper(self, paper_text: str) -> Dict[str, str]:
        """Extract sections from the research paper."""
        if not paper_text or not isinstance(paper_text, str):
            raise ValueError("Paper text must be a non-empty string")

        # Define section patterns with variations
        section_patterns = {
            'abstract': r'abstract|summary',
            'introduction': r'introduction|background',
            'methodology': r'methodology|methods|materials and methods|experimental procedure',
            'results': r'results|findings',
            'discussion': r'discussion|interpretation',
            'conclusion': r'conclusion|conclusions|final remarks',
            'limitations': r'limitations|study limitations|constraints'
        }

        sections = {k: '' for k in section_patterns.keys()}
        current_section = None
        current_text = []

        lines = paper_text.split('\n')
        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Check if line is a section header
            found_section = False
            for section, pattern in section_patterns.items():
                if re.match(f"^(?i)({pattern})\s*:?$", line):
                    if current_section and current_text:
                        sections[current_section] += ' '.join(current_text)
                    current_section = section
                    current_text = []
                    found_section = True
                    break

            if not found_section and current_section:
                current_text.append(line)

        # Add the last section
        if current_section and current_text:
            sections[current_section] += ' '.join(current_text)

        # Clean up empty sections and normalize whitespace
        return {k: ' '.join(v.split()) for k, v in sections.items() if v.strip()}

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks while preserving sentence boundaries."""
        if not text:
            return []

        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])|(?<=\n)\s*(?=[A-Z])', text)
        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_size = len(sentence)

            if sentence_size > chunk_size:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_size = 0

                clauses = re.split(r'(?<=[,;:])\s+', sentence)
                for clause in clauses:
                    if len(clause) > chunk_size:
                        chunks.append(clause)
                    elif current_size + len(clause) > chunk_size:
                        chunks.append(' '.join(current_chunk))
                        current_chunk = [clause]
                        current_size = len(clause)
                    else:
                        current_chunk.append(clause)
                        current_size += len(clause)
                continue

            if current_size + sentence_size > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_size = sentence_size
            else:
                current_chunk.append(sentence)
                current_size += sentence_size

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return [chunk for chunk in chunks if chunk.strip()]

    def retrieve_relevant_passages(self,
                                 query: str,
                                 sections: Dict[str, str],
                                 top_k: int = 3,
                                 similarity_threshold: float = 0.2) -> List[RelevantPassage]:
        """Retrieve most relevant passages based on query."""
        if not query or not sections:
            raise ValueError("Query and sections cannot be empty")

        relevant_passages = []
        query_embedding = self.embedding_model.encode([query])[0]

        for section_name, section_text in sections.items():
            chunks = self.chunk_text(section_text)
            if not chunks:
                continue

            chunk_embeddings = self.get_embeddings(chunks)
            similarities = cosine_similarity([query_embedding], chunk_embeddings)[0]

            for idx, chunk in enumerate(chunks):
                if similarities[idx] > similarity_threshold:
                    relevant_passages.append(RelevantPassage(
                        section=section_name,
                        text=chunk,
                        similarity=float(similarities[idx])
                    ))

        # Sort by similarity and return top_k
        relevant_passages.sort(key=lambda x: x.similarity, reverse=True)
        return relevant_passages[:top_k]

    def generate_summary(self, passages: List[RelevantPassage], query: Optional[str] = None) -> str:
        """Generate a summary based on retrieved passages."""
        if not passages:
            return "No relevant passages found to summarize."

        try:
            combined_text = " ".join([p.text for p in passages])
            input_length = len(combined_text.split())

            max_length = min(150, max(50, input_length // 2))
            min_length = max(25, input_length // 4)

            summary = self.summarizer(
                combined_text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True
            )[0]['summary_text']

            return summary
        except Exception as e:
            logger.error(f"Error in summary generation: {str(e)}")
            return "Error generating summary. Showing most relevant passage instead: " + passages[0].text

    def query_paper(self, paper_text: str, query: str, top_k: int = 3) -> Dict[str, any]:
        """Main method to query a paper."""
        if not paper_text or not query:
            raise ValueError("Paper text and query must be non-empty strings")

        try:
            sections = self.preprocess_paper(paper_text)

            if not sections:
                raise ValueError("No valid sections found in paper")

            relevant_passages = self.retrieve_relevant_passages(
                query=query,
                sections=sections,
                top_k=top_k
            )

            summary = self.generate_summary(relevant_passages, query)

            return {
                'summary': summary,
                'relevant_passages': relevant_passages
            }

        except Exception as e:
            logger.error(f"Error processing paper: {str(e)}")
            raise RuntimeError(f"Error processing paper: {str(e)}")

def main():
    """Example usage of the ResearchPaperRAG system."""
    try:
        # Sample paper with more detailed methodology section
        sample_paper = """
        Abstract:
        This comprehensive study investigates the effects of caffeine on cognitive performance and reaction time in healthy adults.

        Introduction:
        Caffeine is one of the most widely consumed psychoactive substances globally.

        Methodology:
        We conducted a double-blind, placebo-controlled study with 100 participants aged 18-65. Participants were randomly assigned to either the caffeine group (200mg) or placebo group. We used a computerized cognitive assessment battery to measure performance. Testing sessions occurred at baseline and 1 hour post-administration. Each participant completed three testing blocks: memory tasks, reaction time assessment, and attention span measurement.

        Results:
        The results showed significant improvement in reaction times (p<0.001).

        Discussion:
        Our findings suggest that moderate caffeine consumption enhances cognitive performance.

        Limitations:
        The study was limited by its single-dose design and short duration.

        Conclusion:
        This study demonstrates the positive effects of caffeine on cognitive performance.
        """

        rag = ResearchPaperRAG()

        queries = [
            "What were the main findings about caffeine's effects?",
            "Describe the methodology used in the study",
            "What were the limitations of the study?"
        ]

        for query in queries:
            print(f"\nQuery: {query}")
            result = rag.query_paper(sample_paper, query)
            print("\nSummary:", result['summary'])
            print("\nRelevant Passages:")
            for passage in result['relevant_passages']:
                print(f"\nFrom {passage.section} section (similarity: {passage.similarity:.2f}):")
                print(passage.text)

    except Exception as e:
        print(f"Error in main: {str(e)}")

if __name__ == "__main__":
    main()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 50, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)



Query: What were the main findings about caffeine's effects?

Summary: Caffeine is one of the most widely consumed psychoactive substances globally. This study demonstrates the positive effects of caffeine on cognitive performance.

Relevant Passages:

From conclusion section (similarity: 0.69):
This study demonstrates the positive effects of caffeine on cognitive performance.

From abstract section (similarity: 0.64):
This comprehensive study investigates the effects of caffeine on cognitive performance and reaction time in healthy adults.

From introduction section (similarity: 0.62):
Caffeine is one of the most widely consumed psychoactive substances globally.

Query: Describe the methodology used in the study

Summary: We conducted a double-blind, placebo-controlled study with 100 participants aged 18-65. Participants were randomly assigned to either the caffeine group (200mg) or placebo group. We used a computerized cognitive assessment battery to measure performance

Relevant Pa