In [117]:
import os
import pandas as pd
import lxml as lx
# from lxml import html
from glob import glob

import sys, os

# Add the scripts/SEC_Documents folder to Python path
sys.path.append(os.path.abspath("scripts/SEC_Documents"))

from Deepseek_Integration import process_csv_with_deepseek

os.getcwd()

'/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/scripts/SEC_Documents'

In [118]:
folder_path = "/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/MSFT"

files_name = glob(os.path.join(folder_path, "2024-10-24_DEF14A.html"))
print(f"found {len(files_name)} files")
print(files_name)

for file in files_name[:1]:
    with open(file, "r") as f:
        content = f.read()
        try :
            tree = lx.html.fromstring(content)
            xpath_expr = """
        //tr[
            .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'name')]
            and .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'principal')]
            and .//text()[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'position')]
        ]
        """
            tr_node = tree.xpath(xpath_expr)
            print(f"Found {len(tr_node)} matching <tr> nodes in {file}")
            if tr_node:
                table = tr_node[0].getparent()
                # print(f"Table: {lx.html.tostring(table, pretty_print=True).decode('utf-8')}")
                while table is not None and table.tag != 'table':
                    table = table.getparent()
                print(f"Table: {lx.html.tostring(table, pretty_print=True).decode('utf-8')}")

        except Exception as e:
            print(f"Error parsing {file}: {e}")


found 1 files
['/Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/MSFT/2024-10-24_DEF14A.html']
Error parsing /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/MSFT/2024-10-24_DEF14A.html: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.


In [119]:
# Compare types
print("Type of table:", type(table))
print("Type of lxml.html.tostring(table):", type(lx.html.tostring(table)))

# Show what they look like
print("\nPrinting table directly:")
print(table)   # lxml object representation

print("\nPrinting tostring(table):")
print(lx.html.tostring(table)[:300])  # raw HTML string (first 300 chars)

# Try parsing with pandas

Type of table: <class 'lxml.html.HtmlElement'>
Type of lxml.html.tostring(table): <class 'bytes'>

Printing table directly:
<Element table at 0x10f331590>

Printing tostring(table):
b'<table align="center" cellpadding="0" cellspacing="0" width="100%">\n    <tr>\n     <th align="left" nowrap rowspan="2" valign="bottom" width="41%">\n      <font face="\'Times New Roman\', Times" size="1">\n       <b>\n        Name and Principal\n     Position\n       </b>\n      </font>\n      <hr align="left'


In [120]:
df = pd.read_html(lx.html.tostring(table))[0]
print("\nDataFrame head:")
print(df)


DataFrame head:
                 Name and Principal  Position Unnamed: 1_level_0  Year         \
                 Name and Principal  Position Unnamed: 1_level_1  Year Year.1   
0                            William H. Gates                NaN  1999    NaN   
1                      Chairman of the Board;                NaN  1998    NaN   
2           Chief Executive Officer; Director                NaN  1997    NaN   
3                           Steven A. Ballmer                NaN  1999    NaN   
4                                   President                NaN  1998    NaN   
5                                         NaN                NaN  1997    NaN   
6                           Robert J. Herbold                NaN  1999    NaN   
7                   Executive Vice President;                NaN  1998    NaN   
8                     Chief Operating Officer                NaN  1997    NaN   
9                              Paul A. Maritz                NaN  1999    NaN   
10         

In [121]:
df.columns = [
    " ".join([str(x) for x in col if x and "Unnamed" not in str(x)]).strip()
    for col in df.columns.values
]

df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else x)

print("\nCleaned DataFrame head:")
print(df)


Cleaned DataFrame head:
   Name and Principal  Position Name and Principal  Position     Year Year  \
0                                    William H. Gates        NaN      1999   
1                              Chairman of the Board;        NaN      1998   
2                   Chief Executive Officer; Director        NaN      1997   
3                                   Steven A. Ballmer        NaN      1999   
4                                           President        NaN      1998   
5                                                 NaN        NaN      1997   
6                                   Robert J. Herbold        NaN      1999   
7                           Executive Vice President;        NaN      1998   
8                             Chief Operating Officer        NaN      1997   
9                                      Paul A. Maritz        NaN      1999   
10                              Group Vice President,        NaN      1998   
11                                     

  df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else x)


In [122]:
df = df.dropna(axis=1, how='all')  # drop columns where all elements are NaN

print(df)

   Name and Principal  Position Name and Principal  Position Year Year  \
0                                    William H. Gates             1999   
1                              Chairman of the Board;             1998   
2                   Chief Executive Officer; Director             1997   
3                                   Steven A. Ballmer             1999   
4                                           President             1998   
5                                                 NaN             1997   
6                                   Robert J. Herbold             1999   
7                           Executive Vice President;             1998   
8                             Chief Operating Officer             1997   
9                                      Paul A. Maritz             1999   
10                              Group Vice President,             1998   
11                                          Developer             1997   
12                                    

In [123]:
output_dir = os.path.join(folder_path, "extracted_jupiter_data")
os.makedirs(output_dir, exist_ok=True)  # create folder if it doesn't exist

output_path = os.path.join(output_dir, "2024_cleaned.csv")
df.to_csv(output_path, index=False)

print(f"Saved cleaned table to {output_path}")

Saved cleaned table to /Users/ruturaj_vasant/Desktop/PersonalProjects/Political-Economy-Of-Corporate-Fraud/data/MSFT/extracted_jupiter_data/2024_cleaned.csv


In [124]:
# Path to your saved cleaned CSV
output_path = os.path.join(folder_path, "extracted_jupiter_data", "1999_cleaned.csv")

# Call DeepSeek on it
process_csv_with_deepseek(output_path, "Convert this SEC Summary Compensation Table into structured JSON")

KeyboardInterrupt: 