In [14]:
import re
import os
import json
import boto3
import requests
import chromadb
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain_aws import BedrockEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
os.environ['TAVILY_API_KEY'] = "YOUR-API_KEY"
os.environ['GROQ_API_KEY'] = "YOUR-API_KEY"
os.environ['AWS_ACCESS_KEY_ID'] = "YOUR-API_KEY"
os.environ['AWS_SECRET_ACCESS_KEY'] = "YOUR-API_KEY"
os.environ['AWS_DEFAULT_REGION'] = 'us-west-2'

In [None]:
# getting all links for a given company CIK from SEC website  
def SEC_filing_URLS(CIK):
  BASE_URL = f"https://data.sec.gov/submissions/CIK{CIK}.json"
  HEADERS = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
  }
  skip_filings = ["4", "NO ACT", "UPLOAD", "CORRESP", "IRANNOTICE", "CERTNYS"]
  filing_dict = {}
  response = requests.get(BASE_URL, headers=HEADERS)

  if response.status_code == 200:
      data = response.json()
      filings = data.get("filings", {}).get("recent", {})

      filing_counts = {}  

      for i in range(len(filings.get("accessionNumber", []))):
          filing_type = filings["form"][i]

          if filing_type in skip_filings:
              continue  

          if filing_type not in filing_counts:
              filing_counts[filing_type] = 0

          if filing_counts[filing_type] >= 1:
              continue  

          filing_counts[filing_type] += 1
          filing_link = f'https://www.sec.gov/Archives/edgar/data/{CIK}/{filings["accessionNumber"][i].replace("-", "")}/{filings["primaryDocument"][i]}'
          if filing_type not in filing_dict:
              filing_dict[filing_type] = []

          filing_dict[filing_type].append(filing_link)
      return filing_dict
  else:
      print("Failed to retrieve data from SEC EDGAR API")

In [17]:
# Apple Inc. CIK
CIK = "0000320193"
SEC_filing_URLS = SEC_filing_URLS(CIK)

In [18]:
SEC_filing_URLS

{'PX14A6G': ['https://www.sec.gov/Archives/edgar/data/0000320193/000109690625000152/bowy_px14a6g.htm'],
 '144': ['https://www.sec.gov/Archives/edgar/data/0000320193/000192109425000087/xsl144X01/primary_doc.xml'],
 '10-Q': ['https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000008/aapl-20241228.htm'],
 '8-K': ['https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000007/aapl-20250130.htm'],
 '3': ['https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000002/xslF345X02/wk-form3_1736551805.xml'],
 'DEFA14A': ['https://www.sec.gov/Archives/edgar/data/0000320193/000130817925000009/aapl4359751-defa14a.htm'],
 'DEF 14A': ['https://www.sec.gov/Archives/edgar/data/0000320193/000130817925000008/aapl4359751-def14a.htm'],
 'S-3ASR': ['https://www.sec.gov/Archives/edgar/data/0000320193/000114036124044880/ny20033611x4_s3asr.htm'],
 '10-K': ['https://www.sec.gov/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm'],
 '5': ['https://www.sec.gov/Archives

In [19]:
filings_to_download = ['PX14A6G', '144', '10-Q', '8-K', '3', 'DEFA14A', 'DEF 14A', 'S-3ASR', '10-K', '5', 'SD', 'SC 13G/A', '25-NSE', '424B2', 'FWP', '4/A', 'S-8', 'S-8 POS', 'PX14A6N', 'CERT', '8-A12B', '3/A', '25', 'SC 13G', '8-K/A']


In [None]:
# getting links for tags to download corresponding data
def get_sec_filing_urls(a, SEC_filing_URLS):
    return [SEC_filing_URLS[tag][0] for tag in a if tag in SEC_filing_URLS and SEC_filing_URLS[tag]]


In [21]:
result = get_sec_filing_urls(filings_to_download, SEC_filing_URLS)

In [22]:
result

['https://www.sec.gov/Archives/edgar/data/0000320193/000109690625000152/bowy_px14a6g.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000192109425000087/xsl144X01/primary_doc.xml',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000008/aapl-20241228.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000007/aapl-20250130.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019325000002/xslF345X02/wk-form3_1736551805.xml',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000130817925000009/aapl4359751-defa14a.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000130817925000008/aapl4359751-def14a.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000114036124044880/ny20033611x4_s3asr.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019324000123/aapl-20240928.htm',
 'https://www.sec.gov/Archives/edgar/data/0000320193/000032019324000102/xslF345X05/wk-form5_1727822122.xml',
 'https://www.sec.gov/A

In [None]:
# Using amazon titan model to create embds
bedrock_client = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-west-2'
)
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock_client, region_name='us-west-2', model_id='amazon.titan-embed-text-v2:0'
)

In [None]:
# Creating collections and saving embds to it
for i in range(len(result)):
    s = str()
    headers = {
    "User-Agent": "PersonalScript/1.0 (your@email.com)"
    }
    filing_url = result[i]
    filing_tag = filings_to_download[i]
    combined_string = re.sub(r'[^a-zA-Z0-9_-]', '_', f"{CIK}_{filings_to_download[i]}")
    print(combined_string)

    filing_response = requests.get(filing_url, headers=headers)

    soup = BeautifulSoup(filing_response.text, "html.parser")
    for script_or_style in soup(["script", "style", "a"]):
        script_or_style.decompose()
    clean_text = soup.get_text(separator="\n", strip=True)
    cleaned_data = re.sub(r'[0-9$#@!*&^%$€\.\,;:<>_+=|\\/\[\]{}()-]', '', clean_text)
    # cleaned_data = "\n".join(line.strip() for line in cleaned_data.splitlines() if len(re.findall(r'\w+', line.strip())) > 3)
    docs = [Document(page_content=cleaned_data)]
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
    chunked_docs = splitter.split_documents(docs)
    print(f"Total chunks created: {len(chunked_docs)}")
    
    batch_size = 166
    batches = [chunked_docs[i:i + batch_size] for i in range(0, len(chunked_docs), batch_size)]

    for batch in batches:
        chroma_db = Chroma.from_documents(documents=batch,
                                        collection_name=combined_string,
                                        embedding=bedrock_embeddings,
                                        persist_directory="./SEC_DATA")




0000320193_PX14A6G
Total chunks created: 7
0000320193_144
Total chunks created: 2
0000320193_10-Q
Total chunks created: 41
0000320193_8-K
Total chunks created: 4
0000320193_3
Total chunks created: 3
0000320193_DEFA14A
Total chunks created: 4
0000320193_DEF_14A
Total chunks created: 192
0000320193_S-3ASR
Total chunks created: 67
0000320193_10-K
Total chunks created: 126
0000320193_5
Total chunks created: 2
0000320193_SD
Total chunks created: 1
0000320193_SC_13G_A
Total chunks created: 24
0000320193_25-NSE
Total chunks created: 1
0000320193_424B2
Total chunks created: 107
0000320193_FWP
Total chunks created: 16
0000320193_4_A
Total chunks created: 4
0000320193_S-8
Total chunks created: 6
0000320193_S-8_POS
Total chunks created: 16
0000320193_PX14A6N
Total chunks created: 9
0000320193_CERT
Total chunks created: 59
0000320193_8-A12B
Total chunks created: 2
0000320193_3_A
Total chunks created: 2
0000320193_25
Total chunks created: 1
0000320193_SC_13G
Total chunks created: 32
0000320193_8-K_

In [None]:
# To print all collections in ChromaDb

# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./SEC_DATA")
# List all collections
collections = client.list_collections()
# Print collection names
for collection in collections:
    print(collection.name)


0000320193_DEFA14A
0000320193_PX14A6G
0000320193_3
0000320193_FWP
0000320193_CERT
0000320193_8-K
0000320193_S-3ASR
0000320193_S-8
0000320193_10-Q
0000320193_25
0000320193_5
test_set_1
0000320193_DEF_14A
0000320193_4_A
0000320193_144
0000320193_1
0000320193_2
0000320193_424B2
0000320193_3_A
0000320193_0
0000320193_SC_13G
0000320193_10-K
0000320193_8-A12B
0000320193_SC_13G_A
0000320193_SD
0000320193_25-NSE
0000320193_S-8_POS
0000320193_PX14A6N
0000320193_8-K_A


In [None]:
# Method to extract data from ChromaDB

In [27]:
chroma_db = Chroma(persist_directory="./SEC_DATA",
                    collection_name="test_set_1",
                    embedding_function=bedrock_embeddings)

similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 2,"score_threshold": 0.2}
                                                        )
question = "Apple filing say about Earnings Per Share?"
top_docs = similarity_threshold_retriever.invoke(question)

No relevant docs were retrieved using the relevance score threshold 0.2
