In [8]:
from sec_edgar_downloader import Downloader
import pandas as pd
import os
from bs4 import BeautifulSoup
import re

In [9]:
def download_10k_filings(ticker):
    if not os.path.isdir(f"./sec-edgar-filings/{ticker}"):
        # dl = Downloader(ticker, "ramorstem2022@gmail.com")
        dl = Downloader(ticker, "ramor12@umd.edu")

        # Download all 10-K filings for the given ticker symbol.
        dl.get("10-K", ticker)
        return True
    return False

In [10]:
def extract_text(raw_text):
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_text)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_text)]
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_text)]
    document = {}

    if len(doc_start_is) == 0:
        return None, None

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            document[doc_type] = raw_text[doc_start:doc_end]
    regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    date_match = re.search(r'FILED AS OF DATE:\s+(\d{8})', raw_text)
    filing_date = "-1"

    # Extract the date string
    if date_match:
        filing_date = date_match.group(1)
        filing_date = pd.to_datetime(filing_date)

    # print(filing_date)

    matches = regex.finditer(document['10-K'])

    # Create the dataframe
    filing_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    filing_df.columns = ['item', 'start', 'end']
    filing_df['item'] = filing_df.item.str.lower()

    # Get rid of unnesesary charcters from the dataframe
    filing_df.replace('&#160;',' ',regex=True,inplace=True)
    filing_df.replace('&nbsp;',' ',regex=True,inplace=True)
    filing_df.replace(' ','',regex=True,inplace=True)
    filing_df.replace('\.','',regex=True,inplace=True)
    filing_df.replace('>','',regex=True,inplace=True)

    pos_dat = filing_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
    pos_dat.set_index('item', inplace=True)

    item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_dat['start'].loc['item1b']]
    # item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]
    # item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

    item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')
    risk_factors_text = item_1a_content.get_text()
    
    risk_factors_text = risk_factors_text.replace("\n", "")
    risk_factors_text = risk_factors_text.replace(">Item 1A", "")
    risk_factors_text = risk_factors_text.replace(">Item 1A.", "")
    risk_factors_text = risk_factors_text.strip(" ")
    risk_factors_text = risk_factors_text.strip(".")
    risk_factors_text = risk_factors_text.strip(" ")
    risk_factors_text = risk_factors_text.replace("Risk Factors", "Risk Factors\n")
    return filing_date, risk_factors_text

def process_10k_filings(ticker, company_name, base_directory):
    filings_df = pd.DataFrame(columns=["Ticker", "Company Name", "Fill Date", "Risk Factors Text"])

    # Walk through the directory structure
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            try:
                if file.endswith('.txt'):
                    file_path = os.path.join(root, file)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                        filing_date, risk_section = extract_text(content)
                        if risk_section:                            
                            filings_df = pd.concat([filings_df, pd.DataFrame.from_records([{
                                "Ticker": ticker,
                                "Company Name": company_name,
                                "Fill Date": filing_date,
                                "Risk Factors Text": risk_section
                            }])], ignore_index=True)
            except:
                pass

    return filings_df

# ticker = "AAPL"
# # download_10k_filings("MSFT")
# # Specify the directory where your 10-K filings are stored
# base_directory = f'./sec-edgar-filings/{ticker}/10-K'

# output_directory = f'{ticker}.csv'
# company_name = "Apple Inc."
# output_df = process_10k_filings(ticker, company_name, base_directory)
# output_df = output_df.sort_values(by="Fill Date", ascending=False)
# output_df.to_csv(output_directory)

# output_df

In [11]:
companies = pd.read_csv("companies.csv")
companies

Unnamed: 0,company name,gvkey,NAICS,SIC,ticker,industry sector,weburl,ff5 sector
0,AAR CORP,1004,423860,5080,AIR,Whlsl,aarcorp.com,1
1,AMERICAN AIRLINES GROUP INC,1045,481111,4512,AAL,Trans,aa.com,5
2,CECO ENVIRONMENTAL CORP,1050,333413,3564,CECE,Mach,cecoenviro.com,2
3,AVX CORP,1072,334416,3670,AVX,Chips,avx.com,3
4,PINNACLE WEST CAPITAL CORP,1075,2211,4911,PNW,Util,pinnaclewest.com,2
...,...,...,...,...,...,...,...,...
3796,LYONDELLBASELL INDUSTRIES NV,294524,325220,2820,LYB,Chems,lyondellbasell.com,2
3797,PLATFORM SPECIALTY PRODUCTS,315318,325320,2870,PAH,Chems,platformspecialtyproducts.com,2
3798,ALLEGION PLC,316056,332510,3420,ALLE,BldMt,allegion.com,2
3799,DORIAN LPG LTD,317264,483111,4412,LPG,Trans,dorianlpg.com,5


In [15]:
ff5_dir_map = {
    1: "Consumer",
    2: "Manufacturing",
    3: "HiTec",
    4: "Health and Medical",
    5: "Other including Finance",
}

for index, company_row in companies.iterrows():
    company_name = company_row["company name"]
    ticker = company_row["ticker"]
    sector = company_row["ff5 sector"]

    base_directory = f'./sec-edgar-filings/{ticker}/10-K'
    output_directory = f'{ff5_dir_map[sector]}/{ticker}.csv'
    
    if not os.path.isfile(output_directory) and os.path.isdir(output_directory):
        try:
            check = download_10k_filings(ticker)
            
            if not check:
                output_df = process_10k_filings(ticker, company_name, base_directory)
                output_df = output_df.sort_values(by="Fill Date", ascending=False)
                output_df.to_csv(output_directory)
                display(output_df)
                print(f"Parsed {ticker}")
            else:
                print(index)

        except:
            print(f"Could not fetch { company_name }.")
    else:
        print(f"Skip {index}")

Skip 0
Skip 1
Skip 2
Skip 3
Skip 4
Skip 5
Skip 6
Skip 7
Skip 8
Skip 9
Skip 10
Skip 11
Skip 12
Skip 13
Skip 14
Skip 15
Skip 16
Skip 17
Skip 18
Skip 19
Skip 20
Skip 21
Skip 22
Skip 23
Skip 24
Skip 25
Skip 26
Skip 27
Skip 28
Skip 29
Skip 30
Skip 31
Skip 32
Skip 33
Skip 34
Skip 35
Skip 36
Skip 37
Skip 38
Skip 39
Skip 40
Skip 41
Skip 42
Skip 43
Skip 44
Skip 45
Skip 46
Skip 47
Skip 48
Skip 49
Skip 50
Skip 51
Skip 52
Skip 53
Skip 54
Skip 55
Skip 56
Skip 57
Skip 58
Skip 59
Skip 60
Skip 61
Skip 62
Skip 63
Skip 64
Skip 65
Skip 66
Skip 67
Skip 68
Skip 69
Skip 70
Skip 71
Skip 72
Skip 73
Skip 74
Skip 75
Skip 76
Skip 77
Skip 78
Skip 79
Skip 80
Skip 81
Skip 82
Skip 83
Skip 84
Skip 85
Skip 86
Skip 87
Skip 88
Skip 89
Skip 90
Skip 91
Skip 92
Skip 93
Skip 94
Skip 95
Skip 96
Skip 97
Skip 98
Skip 99
Skip 100
Skip 101
Skip 102
Skip 103
Skip 104
Skip 105
Skip 106
Skip 107
Skip 108
Skip 109
Skip 110
Skip 111
Skip 112
Skip 113
Skip 114
Skip 115
Skip 116
Skip 117
Skip 118
Skip 119
Skip 120
Skip 121
Skip 122
Ski

In [16]:
company_name = "UDR INC"
ticker = "UG"
sector = 4

base_directory = f'./sec-edgar-filings/{ticker}/10-K'
output_directory = f'./Health and Medical/{ticker}.csv'

check = download_10k_filings(ticker)

output_df = process_10k_filings(ticker, company_name, base_directory)
output_df = output_df.sort_values(by="Fill Date", ascending=False)
output_df.to_csv(output_directory)
display(output_df)

: 