In [1]:
import re
import json
import pandas
import io

from urllib.request import urlopen
from pypdf import PdfReader

In [2]:
DOCID = "20024468"
YEAR = 2024
DOCURL = f"https://disclosures-clerk.house.gov/public_disc/ptr-pdfs/{YEAR}/{DOCID}.pdf"
ERROR_HOOK = "https://discord.com/api/webhooks/1316932982417657989/O7QMv2k9dmf0PsfViAAw3jjJmX25_IYEHqrPCpcryniqigQC0bxsmBQA7h2Bc4qxobG4"

In [3]:
# cleaning the document
thead_pattern = r'\$\d+(?:\.\d+)*'
endTx_pattern = r'^F S: (.*)$'
description_pattern = r'D:\s*'

# parse the tx
asset_class_pattern = r'\[(.*?)\]'
owner_pattern = r'^(DC|JT|SP)\s'
stockCode = "ST"
ticker_pattern = r'\([A-Z]{1,5}\)'
tx_pattern = r"^(P|S|E)"
date_pattern = r'(\d{2}/\d{2}/\d{4})'
amount_pattern = r'\$(\d{1,3}(,\d{3})*(\.\d{2})?)\s*-\s*\$(\d{1,3}(,\d{3})*(\.\d{2})?)|over\s*\$50,000,000'

# edge cases
filingInTx = "Filing ID #"
# orderTypeFcked = r"[A-Za-z]+[PSE](?=\s\d)"
orderTypeFcked = r"[A-Za-z]+[PSE](?=\s\d)|\)?[PSE](?=\s\d)"
# orderTypeFckedParse = r"(\w+)([PSE])"
orderTypeFckedParse = r"([A-Za-z]*|[^\w\s]*)([PSE])"
fckedTHead = r"\s*ID Owner Asset Transaction\s*"
subHolding = "S O: "

In [4]:
# parse docs into lines
with urlopen(DOCURL) as response:
  pdf_bytes = io.BytesIO(response.read())
  reader = PdfReader(pdf_bytes)

text = ""
for x in range(len(reader.pages)):
  page = reader.pages[x]
  text += page.extract_text()
lines = text.splitlines()
lines = [x.replace('\x00', '') for x in lines]

In [5]:
# isolate txs
for i, line in enumerate(lines):
  # remove everything through the first thead
  if re.search(thead_pattern, line):
    lines = lines[i + 1:]
    break

# in case of any more table headers
headers = ["Type", "Date Notification", "Date", "Amount Cap.", "Gains >", "$200?"]
for header in headers:
  while header in lines:
    lines.remove(header)
  # if header in lines:
  #   lines.remove(header)

tx, txs = "", []
for i, line in enumerate(lines):
  if re.search(endTx_pattern, line):
    tx = tx.replace("ID Owner Asset Transaction", "")
    txs.append(tx)
    tx = ""
    continue
  if not re.search(description_pattern, line):
    if "Apple Inc." in line:
      print("HERE")
    if not filingInTx in line:
      match = re.search(orderTypeFcked, tx)
      if match:
        prefix, suffix = tx[:match.start()], tx[match.end():]
        match_str = match.group()
        match = re.match(orderTypeFckedParse, match_str)
        word, orderType = match.group(1), match.group(2)
        if word == "":
          tx += line
        else:
          tx = prefix + word + " " + line + orderType + suffix
      else:
        if not subHolding in line:
          tx += line
print(txs)

HERE
HERE
['Alphabet Inc. - Class A CommonStock (GOOGL) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Alphabet Inc. - Class A CommonStock (GOOGL) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Alphabet Inc. - Class C Capital Stock(GOOG) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Alphabet Inc. - Class C Capital Stock(GOOG) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Amazon.com, Inc. - Common Stock(AMZN) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Amazon.com, Inc. - Common Stock(AMZN) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Apple Inc. - Common Stock (AAPL)[ST]S (partial) 01/25/202402/01/2024$15,001 -$50,000', 'Apple Inc. - Common Stock (AAPL)S (partial) 01/25/202402/01/2024$15,001 -[ST] $50,000', 'Berkshire Hathaway Inc. NewCommon Stock (BRK.B) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Berkshire Hathaway Inc. NewCommon Stock (BRK.B) [ST]S (partial) 01/25/202402/01/2024$1,001 - $15,000', 'Bitcoin [CT] S 

In [335]:
# parse the txs
transactions = []
for tx in txs:
  asset = re.search(asset_class_pattern, tx)
  if not asset:
    continue

  owner = re.search(owner_pattern, tx)
  if owner and owner.group(0):
    # tx = tx.split(owner.group(0))[1:][0]
    tx = tx.replace(owner.group(0), "")
  
  asset_class = asset.group(1)
  if asset_class != stockCode:
    break
  
  stock = tx.split(f"[{asset_class}]")[0]
  ticker = re.search(ticker_pattern, stock)
  if ticker:
    ticker = ticker.group()[1:-1]
  tx = tx.replace(f"{stock}", "")
  tx = tx.replace(f"[{asset_class}]", "")
  # tx = tx.split(f"[{asset_class}]")[1:][0]

  # sometimes the first char is a space and causes error so remove it
  if tx[0] == " ":
    tx = tx[1:]
  
  orderType = re.search(tx_pattern, tx)
  if orderType:
    orderType = orderType.group(0)
  
  dates = re.findall(date_pattern, tx)
  amount = re.search(amount_pattern, tx).group(0)

  # validate the fields
  tx = {
    "Stock": stock,
    "Ticker": ticker,
    "Class": asset_class,
    "Type": orderType,
    "Bought": dates[0],
    "Filed": dates[1],
    "Amount": amount
  }
  transactions.append(tx)
print(transactions)

[{'Stock': 'BP p.l.c. Common Stock (BP) ', 'Ticker': 'BP', 'Class': 'ST', 'Type': 'S', 'Bought': '01/08/2024', 'Filed': '02/05/2024', 'Amount': '$15,001 -$50,000'}, {'Stock': 'Cardinal Health, Inc. Common Stock(CAH) ', 'Ticker': 'CAH', 'Class': 'ST', 'Type': 'P', 'Bought': '01/12/2024', 'Filed': '02/05/2024', 'Amount': '$50,001 -$100,000'}, {'Stock': 'Intel Corporation - Common Stock(INTC) ', 'Ticker': 'INTC', 'Class': 'ST', 'Type': 'P', 'Bought': '01/08/2024', 'Filed': '02/05/2024', 'Amount': '$15,001 -$50,000'}, {'Stock': 'Lam Research Corporation - CommonStock (LRCX) ', 'Ticker': 'LRCX', 'Class': 'ST', 'Type': 'P', 'Bought': '01/19/2024', 'Filed': '02/05/2024', 'Amount': '$15,001 -$50,000'}, {'Stock': 'Regeneron Pharmaceuticals, Inc. -Common Stock (REGN) ', 'Ticker': 'REGN', 'Class': 'ST', 'Type': 'P', 'Bought': '01/17/2024', 'Filed': '02/05/2024', 'Amount': '$1,001 - $15,000'}, {'Stock': 'Wayfair Inc. Class A Common Stock(W) ', 'Ticker': 'W', 'Class': 'ST', 'Type': 'S', 'Bought': '