In [1]:

# 2. Import all necessary packages
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import os

# 3. Document Loading with Metadata
def load_quarter_specific(folder_path, year="2023", quarter="Q2"):
    """Load documents for specific quarter with strict filtering"""
    quarter_docs = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf") and f"{year}" in file and f"Q{quarter[-1]}" in file:
            try:
                loader = PyPDFLoader(os.path.join(folder_path, file))
                pages = loader.load_and_split()
                for page in pages:
                    page.metadata = {
                        "source": file,
                        "page": page.metadata.get("page", ""),
                        "year": year,
                        "quarter": quarter
                    }
                quarter_docs.extend(pages)
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return quarter_docs

# Load documents
folder_path = r"D:\4-IntoCode\16_LangChain\AgilProjekt_multiModel\Raw_Data\Apple"
q2_2023_docs = load_quarter_specific(folder_path, "2023", "Q2")

# Split documents
# 1. Modify the text splitter for better chunking
q2_2023_docs = load_quarter_specific(folder_path, "2023", "Q2")
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,  # Smaller chunks for precision
    chunk_overlap=100,
    separators=["\n\n", "\n", r"(?<=\. )", " "]  # Better sentence preservation
)
q2_chunks = splitter.split_documents(q2_2023_docs)

# 4. Create Vector Store
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(
    documents=q2_chunks,
    embedding=embedding_model,
    persist_directory="./chroma_q2_2023"
)

# 5. Initialize QA System
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side="left")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def is_chunk_valid(text):
    tokens = tokenizer(text, return_tensors="pt").input_ids
    return tokens.shape[1] <= 512

# 3. Enhanced document loader with validation
def load_valid_chunks(folder_path, target_year="2023"):
    valid_chunks = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf") and str(target_year) in file:
            loader = PyPDFLoader(os.path.join(folder_path, file))
            pages = loader.load_and_split(text_splitter=splitter)
            for page in pages:
                if is_chunk_valid(page.page_content):
                    page.metadata = {
                        "source": file,
                        "page": page.metadata.get("page", ""),
                        "year": target_year
                    }
                    valid_chunks.append(page)
                else:
                    print(f"Oversized chunk in {file}, splitting further")
                    # Additional splitting if needed
                    sub_chunks = splitter.split_text(page.page_content)
                    for i, chunk in enumerate(sub_chunks):
                        if is_chunk_valid(chunk):
                            new_page = page.copy()
                            new_page.page_content = chunk
                            new_page.metadata["subchunk"] = i
                            valid_chunks.append(new_page)
    return valid_chunks

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    temperature=0.3,
    do_sample=True,
    truncation=True,
    no_repeat_ngram_size=2,  # Better than repetition_penalty for FLAN-T5
)


llm = HuggingFacePipeline(pipeline=pipe)

# 6. Configure QA Chain with proper prompt
# Use simpler 'stuff' chain type for reliability
# Define your QA chain with proper filter syntax and MMR retrieval
# Enhanced prompt template with strict formatting requirements
prompt_template = """You are a financial analyst specializing in Apple Inc. Generate a comprehensive Q2 2023 performance report using ONLY the provided context. Follow this exact structure:

# Apple Q2 2023 Performance Analysis

## Executive Summary
[3-4 sentence overview highlighting key performance metrics and trends]

## Financial Performance
### Revenue
- Total Revenue: $X.XB (X% change YoY)
  - iPhone: $X.XB (X%)
  - Mac: $X.XB (X%)
  - Services: $X.XB (X%)
  - Other Products: $X.XB (X%)

### Profitability
- Gross Margin: X.X%
- Operating Margin: X.X%
- Net Income: $X.XB

## Product Highlights
[Bullet points of key product announcements/updates]

## Market Context
[Brief analysis of market conditions and competitive landscape]

## Sources
[Cite all numbers using [Page X] format]

Context:
{context}"""

# Create the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 5,
            "score_threshold": 0.7,
            "filter": {
                "$and": [
                    {"quarter": "Q2"},
                    {"year": "2023"}
                ]
            }
        }
    ),
    chain_type="stuff",
    chain_type_kwargs={
        "prompt": PromptTemplate(
            template=prompt_template,
            input_variables=["context"]
        )
    },
    return_source_documents=True
)

# Execute with error handling
try:
    response = qa_chain.invoke({
        "query": "Generate detailed Apple Q2 2023 financial analysis"
    })
    
    # Improved display function
    def display_results(response):
        print("="*80)
        print(response["result"])
        print("\n" + "="*80 + "\nSOURCE DOCUMENTS:")
        for i, doc in enumerate(response["source_documents"][:3]):  # Show top 3 sources
            print(f"\n[Document {i+1}] {doc.metadata['source']} (Page {doc.metadata.get('page','N/A')})")
            print("-"*50)
            print(doc.page_content[:500] + "...")
    
    display_results(response)

except Exception as e:
    print(f"Error: {str(e)}")
    print("Check your vectorstore connection and LLM configuration")

Device set to use cpu


Apple Inc.

SOURCE DOCUMENTS:

[Document 1] 10-Q-Q2-2023.pdf (Page 21)
--------------------------------------------------
financial reporting.
Apple Inc. | Q2 2023 Form 10-Q | 19...

[Document 2] 10-Q-Q2-2023.pdf (Page 21)
--------------------------------------------------
financial reporting.
Apple Inc. | Q2 2023 Form 10-Q | 19...

[Document 3] 10-Q-Q2-2023.pdf (Page 21)
--------------------------------------------------
financial reporting.
Apple Inc. | Q2 2023 Form 10-Q | 19...
