Task 1

In [None]:
from sec_edgar_downloader import Downloader

dl = Downloader(email_address="svyoma0604@gmail.com", company_name="Uppsala Student")
tickers = ["AAPL", "MSFT", "GOOG", "AMZN", "META", "TSLA", "NVDA", "V", "JPM", "JNJ"]

for ticker in tickers:
    dl.get("10-K", ticker, limit=1)

In [37]:
import re
import glob

filings_dir = "./sec-edgar-filings"
tickers = ["AAPL", "MSFT", "GOOG", "AMZN", "META", "TSLA", "NVDA", "V", "JPM", "JNJ"]
parsed_docs = []

def extract_10k_text(file_content):
    """Extracts <TEXT> section from the 10-K document block"""
    # Split into <DOCUMENT> blocks
    docs = re.findall(r"<DOCUMENT>(.*?)</DOCUMENT>", file_content, re.DOTALL | re.IGNORECASE)
    for doc in docs:
        doc_type_match = re.search(r"<TYPE>\s*10-K", doc, re.IGNORECASE)
        if doc_type_match:
            text_match = re.search(r"<TEXT>(.*?)</TEXT>", doc, re.DOTALL | re.IGNORECASE)
            if text_match:
                raw_text = text_match.group(1)
                return raw_text
    return None

def clean_text(text):
    """Remove HTML tags and extra spaces"""
    # Optional: remove all HTML tags if it's HTML
    text = re.sub(r"<[^>]+>", " ", text)  # crude tag stripper
    text = re.sub(r"\s+", " ", text)      # collapse whitespace
    return text.strip()

for ticker in tickers:
    try:
        glob_path = f"{filings_dir}/{ticker}/10-K/0*/full-submission.txt"
        matches = glob.glob(glob_path)
        if not matches:
            print(f"❌ No 10-K filing found for {ticker}")
            continue

        file_path = matches[0]
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            file_content = f.read()

        extracted = extract_10k_text(file_content)
        if not extracted:
            print(f"⚠️ 10-K <TEXT> block not found for {ticker}")
            continue

        cleaned = clean_text(extracted)
        parsed_docs.append({"ticker": ticker, "text": cleaned})
        print(f"✅ Parsed {ticker}: {len(cleaned)} characters")

    except Exception as e:
        print(f"💥 Error parsing {ticker}: {e}")


✅ Parsed AAPL: 230963 characters
✅ Parsed MSFT: 462372 characters
✅ Parsed GOOG: 396386 characters
✅ Parsed AMZN: 322545 characters
✅ Parsed META: 536345 characters
✅ Parsed TSLA: 430601 characters
✅ Parsed NVDA: 381765 characters
✅ Parsed V: 452899 characters
✅ Parsed JPM: 1543510 characters
✅ Parsed JNJ: 535435 characters


In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

docs = []

for item in parsed_docs:
    chunks = text_splitter.create_documents([item["text"]], metadatas=[{"source": item["ticker"]}])
    docs.extend(chunks)


In [67]:
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [68]:
from langchain.vectorstores import FAISS
vectorstore = FAISS.from_documents(docs, embeddings)

In [69]:
vectorstore.save_local("faiss_index_10k")

In [76]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})  # adjust `k` based on recall vs. precision

In [88]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

llm_pipeline = pipeline("summarization", model="t5-base", device=0, max_new_tokens=512)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

Device set to use cuda:0


In [89]:
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm)
compressed_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compressed_retriever,
    return_source_documents=True  # for cite-aware answers
)


In [91]:
query = "What does google list as its three primary sources of revenue?"
query_2 = "Summarize the biggest risk Google cites about supply chain concentration."
result = qa_chain.invoke({"query": query_2})

print("Answer:", result["result"])
for doc in result["source_documents"]:
    print(f"\nSource: {doc.metadata['source']}\nChunk:\n{doc.page_content[:300]}...")

Answer: disasters could affect demand for the Company&#8217;s products and services . armed conflicts and terrorist attacks could affect u.s. and other parts of world . failure to maintain and enhance our brands could harm our business, reputation .

Source: JNJ
Chunk:
remember, *DO NOT* edit the extracted parts of the context . disasters could affect demand for the Company&#8217;s products and services . armed conflicts and terrorist attacks could affect u.s. and other parts of world ....

Source: NVDA
Chunk:
extract any part of context *AS IS* that is relevant to answer the question . if none of the context is relevant return NO_OUTPUT . other risks, trends and uncertainties may also harm our business ....

Source: GOOG
Chunk:
extract any part of context *AS IS* that is relevant to answer the question . if none of the context is relevant return NO_OUTPUT . failure to maintain and enhance our brands could harm our business, reputation ....
