# Task 1: 10-K Retrieval QA with RAG Pipeline

This notebook demonstrates a Retrieval-Augmented Generation (RAG) pipeline that answers questions about the latest SEC filings (10-K/10-Q) of ten companies. It covers data ingestion, chunking, embedding, vector storage, and retrieval-based question answering (`RetrievalQA`) with a Gemini LLM. Answers are concise and citation-aware: each one lists the source filing it was drawn from.

## 1. Imports and Random Seed Setup
All random seeds are fixed for reproducibility. Only public packages are used.

In [None]:
import os
import random
import numpy as np
import torch
import logging
from tqdm import tqdm
# Seed every random number generator the notebook touches so that reruns
# produce the same results. A single constant keeps the four calls in sync.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Only meaningful when CUDA is present; seeds all visible GPUs.
    torch.cuda.manual_seed_all(SEED)

# force=True replaces any handlers already attached to the root logger (for
# example from an earlier run of this cell or from the hosting environment).
# Without it basicConfig is a no-op in that situation and the INFO progress
# messages emitted by the later cells would not be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

## 2. Data Ingestion: Download Latest 10-K/10-Q Filings
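We use `sec_downloader` to look up the most recent 10-Q filing for each of the ten companies, then download its primary document with `requests` and save it under `./data/10k_filings/<ticker>/`.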

In [None]:
import requests
from sec_downloader import Downloader

companies = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "META", "NVDA", "BRK-B", "JNJ", "V"]
output_dir = "./data/10k_filings"
os.makedirs(output_dir, exist_ok=True)

# Contact address sent with every request. It is defined once so the
# Downloader and the raw HTTP request cannot drift apart, and it can be
# overridden through the environment without editing the notebook.
contact_email = os.environ.get("SEC_CONTACT_EMAIL", "ra.kuppasundarar@ufl.edu")
headers = {"User-Agent": contact_email, "Accept-Encoding": "gzip, deflate", "Host": "www.sec.gov"}
# Upper bound (seconds) for each HTTP request; without a timeout a stalled
# connection would block the whole notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

downloader = Downloader(output_dir, email_address=contact_email)
for ticker in companies:
    logging.info(f'Downloading 10-Q for {ticker}')
    try:
        metadatas = downloader.get_filing_metadatas(ticker)
        ten_qs = [m for m in metadatas if getattr(m, 'form_type', '').upper() == '10-Q']
        if not ten_qs:
            logging.error(f'No 10-Q filings found for {ticker}')
            continue
        # Only the newest filing is needed, so pick it directly instead of
        # sorting the whole list.
        latest = max(ten_qs, key=lambda m: getattr(m, 'filing_date', ''))
        accession_number = latest.accession_number
        filing_url = latest.primary_doc_url
        filing_response = requests.get(filing_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        if filing_response.status_code != 200:
            logging.error(f'Failed to download filing for {ticker} from {filing_url} (status {filing_response.status_code})')
            continue
        # One sub-directory per ticker; later cells rely on the ticker being
        # a directory component of the saved file's path.
        ticker_dir = os.path.join(output_dir, ticker)
        os.makedirs(ticker_dir, exist_ok=True)
        file_path = os.path.join(ticker_dir, f"{accession_number}.txt")
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(filing_response.text)
        logging.info(f'Downloaded and saved filing for {ticker} to {file_path}')
    except Exception as e:
        # Best effort: a failure for one ticker (including a timeout) is
        # logged and must not stop the remaining downloads.
        logging.error(f'Failed to download 10-Q for {ticker}: {e}')

## 3. Chunking Filings
We split each filing with `RecursiveCharacterTextSplitter`, using a chunk size of 2000 characters and an overlap of 200 characters between consecutive chunks.

In [None]:
import glob
from langchain_community.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load every saved filing as one Document, remembering the file it came from
# so answers can cite it later.
documents = []
# glob returns paths in arbitrary order; sorting keeps the document (and
# therefore chunk) order identical between runs.
for file_path in sorted(glob.glob(os.path.join(output_dir, "**/*.txt"), recursive=True)):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    if not text.strip():
        # An empty file contributes nothing useful to the index.
        logging.warning(f'Skipping empty filing: {file_path}')
        continue
    # NOTE(review): the files are saved from each filing's primary document
    # URL, which may be HTML; consider stripping markup before chunking.
    documents.append(Document(page_content=text, metadata={'source': file_path}))

if not documents:
    # Fail here with a clear message instead of letting the embedding step
    # fail on an empty corpus.
    raise RuntimeError(f'No filings found under {output_dir}; run the download cell first.')

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
logging.info(f'Number of document chunks: {len(chunks)}')

## 4. Embedding and Vector Store Creation
We use `sentence-transformers/all-mpnet-base-v2` and FAISS for vector storage.

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Embed on the GPU when one is available, otherwise fall back to the CPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": embedding_device},
)

faiss_dir = "./data/faiss_index_10k"
os.makedirs(faiss_dir, exist_ok=True)

# Build the index from all chunks, then persist it below faiss_dir so the
# QA cell can reload it from the same location.
vector_store = FAISS.from_documents(chunks, embeddings)
index_path = os.path.join(faiss_dir, "final_index.faiss")
vector_store.save_local(index_path)
logging.info(f'Final FAISS vector store saved to {faiss_dir}')

## 5. RetrievalQA Chain with Gemini LLM
We use Gemini LLM for answer generation. Answers are citation-aware and concise.

In [None]:
import toml
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI

# The API key is kept out of the notebook and read from a local secrets file.
SECRETS_PATH = 'config/secrets.toml'
secrets = toml.load(SECRETS_PATH)
gemini_api_key = secrets.get('GEMINI_API_KEY')
if not gemini_api_key:
    # Without this check a missing key surfaces as an unrelated TypeError
    # when None is assigned into os.environ below.
    raise ValueError(f"'GEMINI_API_KEY' is missing or empty in {SECRETS_PATH}")
os.environ['GEMINI_API_KEY'] = gemini_api_key

llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash', google_api_key=gemini_api_key)

# SECURITY: allow_dangerous_deserialization=True enables pickle-based loading.
# That is acceptable here only because the index was written by this notebook
# in the previous step; never load an index from an untrusted source this way.
vector_store = FAISS.load_local(
    os.path.join(faiss_dir, 'final_index.faiss'),
    embeddings,
    allow_dangerous_deserialization=True,
)

# return_source_documents=True makes the chain return the retrieved chunks
# alongside the answer so they can be cited.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(),
    return_source_documents=True
)

## 6. Demo Q&A: Company Questions with Citations
For each company, we answer two demo questions and cite the source.

In [None]:
# Ticker -> display name substituted for "[Company]" in the demo questions.
TICKER_TO_NAME = dict([
    ("AAPL", "Apple"),
    ("MSFT", "Microsoft"),
    ("GOOGL", "Google"),
    ("AMZN", "Amazon"),
    ("TSLA", "Tesla"),
    ("META", "Meta"),
    ("NVDA", "Nvidia"),
    ("BRK-B", "Berkshire Hathaway"),
    ("JNJ", "Johnson & Johnson"),
    ("V", "Visa"),
])

# Tickers in demo order; derived from the mapping so the two cannot disagree.
COMPANIES = list(TICKER_TO_NAME)

# Question templates; "[Company]" is replaced with the display name.
DEMO_QUESTIONS = [
    'What does [Company] list as its three primary sources of revenue?',
    'Summarize the biggest risk [Company] cites about supply chain concentration.',
]
def get_company_retriever(vector_store, company, fetch_k=200):
    """Build a retriever restricted to chunks from one company's filing.

    A chunk is kept when its ``source`` metadata path contains the ticker as
    a directory component (``.../<company>/...``).

    Args:
        vector_store: Object exposing ``as_retriever(search_kwargs=...)``.
        company: Ticker that must appear as a directory in the source path.
        fetch_k: Number of nearest neighbours fetched *before* the filter is
            applied. The index mixes chunks from every company, so a small
            candidate pool can leave few or no chunks for the requested
            company once the filter has run.

    Returns:
        The retriever returned by ``vector_store.as_retriever``.
    """
    path_component = f'/{company}/'

    def company_filter(metadata):
        # Normalise Windows separators so the same check works on every OS;
        # a missing or None source is treated as "no match".
        src = str(metadata.get('source') or '').replace('\\', '/')
        return path_component in src

    return vector_store.as_retriever(
        search_kwargs={'filter': company_filter, 'fetch_k': fetch_k}
    )
def format_answer(answer, sources):
    """Render an answer followed by its de-duplicated source citations.

    Args:
        answer: Answer text to show after the ``A:`` prefix.
        sources: Retrieved documents, each with a ``metadata`` mapping; may
            be empty or None.

    Returns:
        A newline-terminated string: the answer line, then either a
        ``Citations:`` list with one entry per distinct source (in first-seen
        order) or a note that nothing was retrieved.
    """
    header = f'A: {answer}\n'
    if not sources:
        return header + '[No source documents retrieved.]\n'

    # dict.fromkeys keeps first-seen order while dropping repeated sources.
    distinct_sources = dict.fromkeys(
        document.metadata.get('source', 'N/A') for document in sources
    )
    parts = [header, 'Citations:\n']
    parts.extend(f'  [Source]: {origin}\n' for origin in distinct_sources)
    return ''.join(parts)
# Ask every demo question about every company and print the cited answers.
for company in COMPANIES:
    display_name = TICKER_TO_NAME[company]
    print(f'\n=== {company} ===')
    # The retriever depends only on the company, so build and attach it once
    # per company instead of once per question.
    retriever = get_company_retriever(vector_store, company)
    qa_chain.retriever = retriever
    for question in DEMO_QUESTIONS:
        q = question.replace('[Company]', display_name)
        result = qa_chain.invoke({'query': q})
        # Fall back to placeholders when the chain omits a key.
        answer = result.get('result', '[No answer returned]')
        sources = result.get('source_documents', [])
        print(f'Q: {q}')
        print(format_answer(answer, sources))

## 7. Reflection
- The reflection is provided in a separate markdown file: `Task 1/reflection.md`.