In [9]:
import os
import pandas as pd
# import lxml as lx
from lxml import html
from glob import glob
import re
import sys, os
import json
import subprocess

# Add the scripts/SEC_Documents folder to Python path
sys.path.append(os.path.abspath("scripts/SEC_Documents"))

from Deepseek_Integration import process_csv_with_deepseek

os.getcwd()

'/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/scripts/SEC_Documents'

In [10]:
def call_deepseek(prompt: str, model: str = "deepseek-r1:14b"):
    """
    Calls the DeepSeek model locally via Ollama with the given prompt.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True
        )
        return result.stdout.decode("utf-8").strip()
    except subprocess.CalledProcessError as e:
        print("Error running DeepSeek:", e.stderr.decode("utf-8"))
        return None


In [11]:
# def extract_summary_table(text):
#     pattern = re.compile(
#         r"SUMMARY\s+COMPENSATION\s+TABLE[\s\S]*?</TABLE>",
#          re.DOTALL
#     )
#     match = pattern.search(text)
#     return match.group(0) if match else None


# import re

# def extract_summary_table(text):
#     """
#     Extracts a Summary Compensation Table only if the section contains
#     all three keywords: 'name', 'principal', and 'position'
#     (like the XPath validation logic).
#     """
#     # Step 1: Find candidate tables
#     pattern = re.compile(
#         r"SUMMARY\s+COMPENSATION\s+TABLE[\s\S]*?</TABLE>",
#         re.DOTALL
#     )
#     candidates = pattern.findall(text)

#     # Step 2: Cross-validate each candidate
#     valid_tables = []
#     for table in candidates:
#         # normalize to lowercase for simpler checking
#         t = table.lower()
#         if all(k in t for k in ["name", "principal", "position"]):
#             valid_tables.append(table)

#     # Step 3: Return the first valid one (or None)
#     return valid_tables[0] if valid_tables else None

# def extract_summary_table(text):
#     """
#     Extracts the correct Summary Compensation Table even if multiple
#     <TABLE> blocks appear after the heading.

#     Logic:
#     1. Find all <TABLE> blocks after 'SUMMARY COMPENSATION TABLE'
#     2. Validate each one for 'name', 'principal', and 'position' keywords
#     3. Return the first valid (most likely real) table
#     """
#     # Locate the "SUMMARY COMPENSATION TABLE" heading (strict uppercase)
#     header_match = re.search(r"SUMMARY\s+COMPENSATION\s+TABLE", text)
#     if not header_match:
#         return None

#     # Slice text from that point forward
#     section = text[header_match.end():]

#     # Find all <TABLE>...</TABLE> blocks after the header
#     tables = re.findall(r"<TABLE\b[^>]*>[\s\S]*?</TABLE>", section, re.IGNORECASE)

#     # Cross-validate each table
#     for tbl in tables:
#         print(f'Checking table: {tbl[:100]}...')  # Debug: show start of table
#         lower_tbl = tbl.lower()
#         if all(k in lower_tbl for k in ["name", "principal", "position"]):
#             # ✅ found the real Summary Compensation Table
#             return tbl

#     # If none pass validation, fall back to the first <TABLE>
#     return tables[0] if tables else None

In [12]:

# def extract_summary_table(text):
#     """
#     Extracts a Summary Compensation Table.
#     1. Prefers <TABLE> blocks after the heading.
#     2. Falls back to ASCII-style tables if no HTML tags exist.
#     """

#     # === Step 1: Find all HTML tables after the header ===
#     header = re.search(r"SUMMARY\s+COMPENSATION\s+TABLE", text)
#     if not header:
#         return None

#     section = text[header.end():]

#     html_tables = re.findall(r"<TABLE\b[^>]*>[\s\S]*?</TABLE>", section, re.IGNORECASE)

#     for tbl in html_tables:
#         lower_tbl = tbl.lower()
#         if all(k in lower_tbl for k in ["name", "principal", "position"]):
#             return tbl  # ✅ real table found

#     # === Step 2: ASCII fallback ===
#     # Capture everything that looks like the ASCII table block
#     ascii_pattern = re.compile(
#         r"SUMMARY\s+COMPENSATION\s+TABLE[\s\S]{0,1500}?"
#         r"(?:NAME.*?POSITION[\s\S]*?)(?:\n[-\s]*\n)?([\s\S]{300,5000})",
#         re.IGNORECASE,
#     )

#     ascii_match = ascii_pattern.search(text)
#     if ascii_match:
#         ascii_table = ascii_match.group(1)

#         # Trim the table at the first large blank or "(1)"-type note section
#         ascii_table = re.split(r"\n\s*\(\d+\)|\n\s{0,3}\(\w+\)", ascii_table)[0].strip()

#         # Ensure it contains at least one row with dollar amounts
#         if re.search(r"\$\d", ascii_table):
#             return ascii_table

#     return None

In [13]:
import re

def extract_summary_table(text: str) -> str | None:
    """
    Extract a Summary Compensation Table (SCT) snippet from a DEF 14A .txt payload.
    Strategy:
      1) Find 'summary compensation table' anchors (case-insensitive), take a line window.
      2) If an SGML/HTML table <TABLE>...</TABLE> exists in that window and contains
         'name', 'principal', 'position' (case-insensitive), return it.
      3) Otherwise, try to capture an ASCII/fixed-width block starting near
         'NAME AND PRINCIPAL POSITION' and return that snippet.
    Returns the first plausible snippet or None if not found.
    """
    # ---------- helpers ----------
    def _normalize(s: str) -> str:
        s = s.replace("\r\n", "\n").replace("\r", "\n")
        s = s.replace("\t", "    ")
        return "\n".join(line.rstrip() for line in s.splitlines())

    def _has_sct_keywords(snippet: str) -> bool:
        t = snippet.lower()
        return all(k in t for k in ("name", "principal", "position"))

    def _sgml_table_in(window: str) -> str | None:
        # Case-insensitive SGML/HTML table search within the anchor window
        m = re.search(r"<TABLE[\s\S]*?</TABLE>", window, re.IGNORECASE | re.DOTALL)
        if m:
            candidate = m.group(0)
            if _has_sct_keywords(candidate):
                return candidate
        return None

    def _is_heading(line: str) -> bool:
        # Heuristic: mostly-uppercase headings; avoid falsely catching the SCT header
        s = line.strip()
        if len(s) < 6:
            return False
        upp = sum(ch.isupper() for ch in s if ch.isalpha())
        letters = sum(ch.isalpha() for ch in s)
        ratio = (upp / letters) if letters else 0.0
        return (ratio > 0.7) and (not _has_sct_keywords(line))

    def _ascii_capture(lines: list[str], start_idx: int, window_end: int) -> str | None:
        # Find header line within ~20 lines of anchor (allow wrap to next line)
        hdr_idx = None
        for j in range(start_idx, min(start_idx + 20, window_end)):
            cur = lines[j].lower()
            nxt = (lines[j + 1].lower() if j + 1 < window_end else "")
            both = (cur + " " + nxt)
            if (("name" in cur and "principal" in cur and "position" in cur) or
                ("name" in nxt and "principal" in nxt and "position" in nxt) or
                ("name" in both and "principal" in both and "position" in both)):
                hdr_idx = j
                break
        if hdr_idx is None:
            return None

        # Capture from a couple lines above header to the end of the block
        start_cap = max(hdr_idx - 2, 0)
        out_lines, blanks = [], 0
        for k in range(start_cap, window_end):
            line = lines[k]
            out_lines.append(line)
            if not line.strip():
                blanks += 1
            else:
                blanks = 0
            # Stop on two consecutive blanks after some content
            if blanks >= 2 and len(out_lines) > 6:
                break
            # Or if we see a new heading after enough lines
            if len(out_lines) > 10 and _is_heading(line):
                break

        snippet = "\n".join(out_lines).strip()
        return snippet if _has_sct_keywords(snippet) else None

    # ---------- main ----------
    txt = _normalize(text)
    lines = txt.splitlines()

    # Find anchor lines containing "summary compensation table" (case-insensitive)
    anchor_idxs = [i for i, ln in enumerate(lines)
                   if ("summary" in ln.lower() and "compensation" in ln.lower() and "table" in ln.lower())]
    if not anchor_idxs:
        return None

    # For each anchor: try SGML first, then ASCII
    for ai in anchor_idxs:
        win_start = max(0, ai - 10)
        win_end = min(len(lines), ai + 120)
        window_text = "\n".join(lines[win_start:win_end])

        sgml = _sgml_table_in(window_text)
        if sgml:
            return sgml

        ascii_snippet = _ascii_capture(lines, ai, win_end)
        if ascii_snippet:
            return ascii_snippet

    return None

# Optional helper for quick file testing:
def extract_summary_table_from_file(path: str, encoding: str = "utf-8") -> str | None:
    try:
        with open(path, "r", encoding=encoding, errors="ignore") as f:
            return extract_summary_table(f.read())
    except Exception:
        return None

# Example usage (uncomment to test):
# p = "data/ACU/DEF_14A/2008-03-12_DEF_14A.txt"
# snippet = extract_summary_table_from_file(p)
# print(snippet if snippet else "No SCT found")


In [14]:
folder_path = "/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/*/DEF_14A"

files_name = glob(os.path.join(folder_path, "*.txt"))
print(f"found {len(files_name)} files")
print(f'{files_name}')


found 188 files
['/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2006-04-06_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/1999-05-07_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2005-04-06_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/1995-03-31_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/1998-04-08_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2000-03-31_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2004-04-08_DEF_14A.txt', '/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/da

In [None]:
# for file in files_name:
#     with open(file, 'r', encoding='utf-8') as f:
#         content = f.read()
#         print(f"Processing file: {file}")

#         for match in re.finditer(r"SUMMARY\s+COMPENSATION\s+TABLE", content):
#             if match:
#                 start = match.end()
#                 following_text = content[start:start + 500]  # Get next 500 characters
#                 print(f"Following text: {following_text}")

Processing file: /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2006-04-06_DEF_14A.txt
Following text: 

- ----------
(1)  In accordance with the regulations of the Commission, this table does not
     include perquisites and other personal benefits received by Named Executive
     Officers since the value of perquisites and other benefits for each Named
     Executive Officer did not exceed the lesser of $50,000 or 10% of the total
     annual salary and bonus reported for such Named Executive Officer.

(2)  For Mr. Battat, options granted in 2004 represent both qualified and
     nonqualified s
Processing file: /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/1999-05-07_DEF_14A.txt
Following text: 
   
<TABLE>
<CAPTION>
                                                       Long-Term Compensation
                           ANNUAL COMPENSATION (1)              Awards
           

In [15]:
for file in files_name:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        print(f"Processing file: {file}")
      # Print the first 5000 characters of the file
        sct = extract_summary_table(content)
        print(sct)
        # prompt = f"Convert this SEC Summary Compensation Table into structured JSON:\n\n{sct}"

        # response = call_deepseek(prompt)   # uses your already-defined DeepSeek helper

        # print("\n=== DeepSeek raw response ===\n")
        # print(response)

        # # try to parse & preview
        # try:
        #     data = json.loads(response)
        #     print("\n=== Parsed JSON preview ===\n")
        #     print(json.dumps(data, indent=2)[:1000])
        # except json.JSONDecodeError:
        #     print("\n⚠️ Model output is not valid JSON (printed above).")
# tables = extract_all_tables(content)
# print(tables)

Processing file: /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/2006-04-06_DEF_14A.txt
None
Processing file: /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/ATRI/DEF_14A/1999-05-07_DEF_14A.txt
<TABLE>
<CAPTION>
                                                       Long-Term Compensation
                           ANNUAL COMPENSATION (1)              Awards
                           -----------------------     -----------------------
                                                                      Securities
     Name and                                          Restricted     Underlying       All Other
Principal Position      Year     Salary       Bonus   Stock Awards    Options(2)     Compensation
- --------------------    ----    --------    --------  ------------    -----------    ------------
<S>                     <C>     <C>         <C>       <C>             <C>            <C>
