In [6]:
import re
import json
from pypdf import PdfReader

In [7]:
with open('asset-type-codes.json', 'r') as f:
  data = json.load(f)
keys = list(data.keys())
print(keys)

['4K', '5C', '5F', '5P', 'AB', 'BA', 'BK', 'CO', 'CS', 'CT', 'DB', 'DO', 'DS', 'EF', 'EQ', 'ET', 'FA', 'FE', 'FN', 'FU', 'GS', 'HE', 'HN', 'IC', 'IH', 'IP', 'IR', 'MA', 'MF', 'MO', 'OI', 'OL', 'OP', 'OT', 'PE', 'PM', 'PS', 'RE', 'RP', 'RS', 'SA', 'ST', 'TR', 'VA', 'VI', 'WU']


In [8]:
reader = PdfReader("20026063.pdf")
num_pages = len(reader.pages)

In [9]:
page = reader.pages[0]
text = page.extract_text()

In [10]:
lines = text.splitlines()
lines = [x.replace('\x00', '') for x in lines]

endTHeadPattern = r'\$\d+(?:\.\d+)*'
for i, line in enumerate(lines):
  if re.search(endTHeadPattern, line):
        lines = lines[i + 1:]
        break

In [18]:
endOfTxPattern = r'^F S: (.*)$'
date_pattern = r"(\d{2}/\d{2}/\d{4})"
description_pattern = r'D:\s*'
tx = ""
txs = []
for i, line in enumerate(lines):
  if re.search(endOfTxPattern, line):
    txs.append(tx)
    tx = ""
    continue
  if not re.search(description_pattern, line):
    tx += line
print(txs)

['SP Apple Inc. - Common Stock (AAPL)[ST]P 10/07/202410/22/2024$15,001 -$50,000', 'SP GE Vernova Inc. Common Stock(GEV) [ST]S 11/08/202411/12/2024 $15,001 -$50,000', 'SP Pfizer, Inc. Common Stock (PFE) [ST]P 10/21/202410/23/2024$15,001 -$50,000', 'SP TuHURA Biosciences, Inc. - CommonStock (HURA) [ST]E 10/18/202411/12/2024 $100,001 -$250,000']


In [25]:
owner_pattern = r'^(DC|JT|SP)\s'
asset_class_pattern = r'\[(.*?)\]'
transaction_pattern = r"^(P|S)"
date_pattern = r"(\d{2}/\d{2}/\d{4})"
amount_pattern = r"(\$\d{1,3}(,\d{3})*(\.\d{2})?( - \$\d{1,3}(,\d{3})*(\.\d{2})?)?)"

transactions = []

for tx in txs:
  asset = re.search(asset_class_pattern, tx)
  if not asset:
    continue
  owner = re.search(owner_pattern, tx)
  if owner.group(0):
    tx = tx.split(owner.group(0))[1:][0]
  asset_class = asset.group(1)
  stock = tx.split(f"[{asset_class}]")[0]
  tx = tx.split(f"[{asset_class}]")[1:][0]
  if tx[0] == " ":
    tx = tx[1:]
  orderType = re.search(transaction_pattern, tx).group(0)
  dates = re.findall(date_pattern, tx)
  amount = re.search(amount_pattern, tx).group(0)
  tx = {
    "Stock": stock,
    "Class": asset_class,
    "Type": orderType,
    "Bought": dates[0],
    "Filed": dates[1],
    "Amount": amount
  }
  transactions.append(tx)

print(transactions)

[{'Stock': 'Baxter International Inc. CommonStock (BAX) ', 'Class': 'ST', 'Type': 'P', 'Bought': '11/18/2024', 'Filed': '11/19/2024', 'Amount': '$1,001 - $15,000'}, {'Stock': 'New Gold Inc. (NGD) ', 'Class': 'ST', 'Type': 'S', 'Bought': '11/18/2024', 'Filed': '11/19/2024', 'Amount': '$1,001 - $15,000'}, {'Stock': 'Patterson-UTI Energy, Inc. - CommonStock (PTEN) ', 'Class': 'ST', 'Type': 'P', 'Bought': '10/23/2024', 'Filed': '10/23/2024', 'Amount': '$1,001 - $15,000'}, {'Stock': 'ViaSat, Inc. - Common Stock (VSAT)', 'Class': 'ST', 'Type': 'P', 'Bought': '10/23/2024', 'Filed': '10/23/2024', 'Amount': '$1,001 - $15,000'}]
