In [1]:
from sec_edgar_downloader import Downloader
import pandas as pd
import os
from bs4 import BeautifulSoup
import re

In [2]:
def download_10k_filings(ticker):
    """Fetch every 10-K filing for ``ticker`` unless a local copy already exists.

    The download is skipped when ``./sec-edgar-filings/<ticker>`` is already a
    directory, so re-running the notebook does not fetch the same filings again.

    Args:
        ticker: Ticker symbol whose 10-K filings should be downloaded.

    Returns:
        None.
    """
    # Guard clause: a folder for this ticker means the filings were fetched before.
    if os.path.isdir(f"./sec-edgar-filings/{ticker}"):
        return

    # The downloader is constructed with the ticker and a contact e-mail address.
    downloader = Downloader(ticker, "ramorstem2022@gmail.com")
    downloader.get("10-K", ticker)

In [8]:
def extract_text(raw_text):
    """Extract the filing date and the Item 1A (Risk Factors) text from a raw filing.

    Args:
        raw_text: Full text of one submission file. It is expected to contain
            ``<DOCUMENT>...</DOCUMENT>`` sections, each tagged with a ``<TYPE>``
            line, and a ``FILED AS OF DATE:`` header followed by eight digits.

    Returns:
        A ``(filing_date, risk_factors_text)`` tuple.

        * ``filing_date`` is a pandas ``Timestamp``, or the string ``"-1"`` when
          the ``FILED AS OF DATE:`` header is missing.
        * ``risk_factors_text`` is the tag-stripped text between the Item 1A and
          Item 1B headings.

        ``(None, None)`` is returned when the section cannot be extracted: the
        text has no ``<DOCUMENT>`` section, none of the sections is typed
        ``10-K``, or the Item 1A / Item 1B headings are not both found in that
        order.
    """
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    item_pattern = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    doc_start_is = [m.end() for m in doc_start_pattern.finditer(raw_text)]
    doc_end_is = [m.start() for m in doc_end_pattern.finditer(raw_text)]
    # strip() so a CRLF line ending ("10-K\r") still compares equal to "10-K".
    doc_types = [t[len('<TYPE>'):].strip() for t in type_pattern.findall(raw_text)]

    if len(doc_start_is) == 0:
        return None, None

    # Keep only the 10-K section; exhibits and other document types are ignored.
    ten_k = None
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K':
            ten_k = raw_text[doc_start:doc_end]
    if ten_k is None:
        return None, None

    # Map each normalised heading ("item1a", "item1b", ...) to the offset of its
    # LAST occurrence. Matches arrive in document order, so later ones overwrite
    # earlier ones (presumably so a table-of-contents entry loses to the section
    # heading itself).
    item_starts = {}
    for match in item_pattern.finditer(ten_k):
        # Drop nbsp entities, any whitespace, dots and the leading '>' so every
        # spelling the pattern accepts collapses to the same key.
        label = re.sub(r'&#160;|&nbsp;|\s|\.|>', '', match.group().lower())
        item_starts[label] = match.start()

    section_start = item_starts.get('item1a')
    section_end = item_starts.get('item1b')
    if section_start is None or section_end is None or section_end <= section_start:
        return None, None

    # "-1" is the sentinel callers receive when the header carries no filing date.
    filing_date = "-1"
    date_match = re.search(r'FILED AS OF DATE:\s+(\d{8})', raw_text)
    if date_match:
        filing_date = pd.to_datetime(date_match.group(1), format='%Y%m%d')

    item_1a_raw = ten_k[section_start:section_end]
    risk_factors_text = BeautifulSoup(item_1a_raw, 'lxml').get_text()

    # Remove the ">Item 1A." heading remnant. \s also matches the non-breaking
    # space that &#160; / &nbsp; decode to, and the optional dot goes with it.
    risk_factors_text = re.sub(r'>Item\s1A\.?', '', risk_factors_text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so words
    # separated only by a line break are joined; kept as-is to match existing output.
    risk_factors_text = risk_factors_text.replace("\n", "")
    risk_factors_text = risk_factors_text.strip(" ").strip(".").strip(" ")
    # Every occurrence of "Risk Factors" gets a trailing newline, not just the title.
    risk_factors_text = risk_factors_text.replace("Risk Factors", "Risk Factors\n")
    return filing_date, risk_factors_text

def process_10k_filings(ticker, company_name, base_directory):
    """Build a table of Risk Factors sections from every ``.txt`` filing in a folder tree.

    Each ``.txt`` file found under ``base_directory`` is read as UTF-8 and passed
    to ``extract_text``. Files for which no risk section is returned are left
    out. Files that cannot be read or parsed are reported and skipped, so one
    bad filing does not abort the whole run.

    Args:
        ticker: Ticker symbol written to the ``Ticker`` column of every row.
        company_name: Company name written to the ``Company Name`` column.
        base_directory: Root folder searched recursively for ``.txt`` filings.

    Returns:
        A DataFrame with the columns ``Ticker``, ``Company Name``, ``Fill Date``
        and ``Risk Factors Text``, one row per filing with a non-empty risk
        section. The frame is empty (but keeps these columns) when nothing is found.
    """
    columns = ["Ticker", "Company Name", "Fill Date", "Risk Factors Text"]
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating inside the loop copies the whole frame on every filing.
    records = []

    # Walk through the directory structure
    for root, dirs, files in os.walk(base_directory):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                filing_date, risk_section = extract_text(content)
            except Exception as exc:
                # Best effort: report the problem and move on to the next file.
                # Catching Exception (not a bare except) lets Ctrl-C through.
                print(f"Skipping {file_path}: {exc!r}")
                continue
            if risk_section:
                records.append({
                    "Ticker": ticker,
                    "Company Name": company_name,
                    "Fill Date": filing_date,
                    "Risk Factors Text": risk_section
                })

    return pd.DataFrame(records, columns=columns)



# Single-company run: build the Risk Factors table for one ticker and save it as CSV.
# The filings are read from disk; call download_10k_filings(ticker) beforehand if
# ./sec-edgar-filings/<ticker> has not been populated yet.
ticker = "AAPL"
company_name = "Apple Inc."

# Folder holding this ticker's 10-K filings, and the CSV file the table is written to
base_directory = f'./sec-edgar-filings/{ticker}/10-K'
output_directory = f'{ticker}.csv'

output_df = process_10k_filings(ticker, company_name, base_directory)
# Newest filing first
output_df = output_df.sort_values(by="Fill Date", ascending=False)
output_df.to_csv(output_directory)

output_df


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
14,AAPL,Apple Inc.,2023-11-03,"Risk Factors\nThe Company’s business, repu..."
6,AAPL,Apple Inc.,2022-10-28,"Risk Factors\nThe Company’s business, repu..."
10,AAPL,Apple Inc.,2021-10-29,"Risk Factors\nThe Company’s business, repu..."
12,AAPL,Apple Inc.,2020-10-30,Risk Factors\nThe following discussion of ...
2,AAPL,Apple Inc.,2019-10-31,Risk Factors\nThe following discussion of risk...
11,AAPL,Apple Inc.,2018-11-05,Risk Factors\nThe following discussion of risk...
4,AAPL,Apple Inc.,2017-11-03,Risk Factors\nThe following discussion of risk...
8,AAPL,Apple Inc.,2016-10-26,Risk Factors\nThe following discussion of risk...
7,AAPL,Apple Inc.,2015-10-28,Risk Factors\n The following discussion of ri...
16,AAPL,Apple Inc.,2014-10-27,Risk Factors\n The following discussion of ri...


In [20]:
# Load the company list from companies.csv in the working directory. The per-company
# loop further down reads its "company name", "ticker" and "ff5 sector" columns.
companies = pd.read_csv("companies.csv")
companies

Unnamed: 0,company name,gvkey,NAICS,SIC,ticker,industry sector,weburl,ff5 sector
0,AAR CORP,1004,423860,5080,AIR,Whlsl,aarcorp.com,1
1,AMERICAN AIRLINES GROUP INC,1045,481111,4512,AAL,Trans,aa.com,5
2,CECO ENVIRONMENTAL CORP,1050,333413,3564,CECE,Mach,cecoenviro.com,2
3,AVX CORP,1072,334416,3670,AVX,Chips,avx.com,3
4,PINNACLE WEST CAPITAL CORP,1075,2211,4911,PNW,Util,pinnaclewest.com,2
...,...,...,...,...,...,...,...,...
3796,LYONDELLBASELL INDUSTRIES NV,294524,325220,2820,LYB,Chems,lyondellbasell.com,2
3797,PLATFORM SPECIALTY PRODUCTS,315318,325320,2870,PAH,Chems,platformspecialtyproducts.com,2
3798,ALLEGION PLC,316056,332510,3420,ALLE,BldMt,allegion.com,2
3799,DORIAN LPG LTD,317264,483111,4412,LPG,Trans,dorianlpg.com,5


In [32]:
# Maps the numeric "ff5 sector" code from the companies table to the name of the
# folder that receives that sector's per-ticker CSV files.
ff5_dir_map = {
    1: "Consumer",
    2: "Manufacturing",
    3: "HiTec",
    4: "Health and Medical",
    5: "Other including Finance",
}

# For every company: download its 10-K filings (if not on disk yet), extract the
# Risk Factors sections and write them to <sector folder>/<ticker>.csv.
for index, company_row in companies.iterrows():
    company_name = company_row["company name"]
    ticker = company_row["ticker"]
    sector = company_row["ff5 sector"]
    try:
        download_10k_filings(ticker)
        base_directory = f'./sec-edgar-filings/{ticker}/10-K'

        # Create the sector folder on first use; to_csv does not create it.
        sector_directory = ff5_dir_map[sector]
        os.makedirs(sector_directory, exist_ok=True)
        output_directory = f'{sector_directory}/{ticker}.csv'

        output_df = process_10k_filings(ticker, company_name, base_directory)
        output_df = output_df.sort_values(by="Fill Date", ascending=False)
        output_df.to_csv(output_directory)

        display(output_df)
    except Exception as exc:
        # Keep going with the next company, but say why this one failed.
        # Exception (not a bare except) leaves the long-running loop interruptible.
        print(f"Could not fetch { company_name }: {exc!r}")

Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
8,AIR,AAR CORP,2023-07-18,ITEM 1A.RISK FACTORSThe following is a descrip...
4,AIR,AAR CORP,2022-07-21,ITEM 1A.RISK FACTORSThe following is a descrip...
5,AIR,AAR CORP,2021-07-21,ITEM 1A.RISK FACTORSThe following is a descrip...
9,AIR,AAR CORP,2020-07-21,ITEM 1A.RISK FACTORSThe following is a descrip...
12,AIR,AAR CORP,2019-07-18,ITEM 1A. RISK FACTORS The follo...
3,AIR,AAR CORP,2018-07-11,ITEM 1A. RISK FACTORS The follo...
15,AIR,AAR CORP,2017-07-12,ITEM 1A. RISK FACTORS The follo...
10,AIR,AAR CORP,2016-07-13,ITEM 1A. RISK FACTORS The follo...
13,AIR,AAR CORP,2015-07-15,ITEM 1A. RISK FACTORS The follo...
11,AIR,AAR CORP,2014-07-17,ITEM 1A. RISK FACTORS The follo...


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
12,AAL,AMERICAN AIRLINES GROUP INC,2024-02-21,Risk Factors\n21
3,AAL,AMERICAN AIRLINES GROUP INC,2023-02-22,Risk Factors\n28
7,AAL,AMERICAN AIRLINES GROUP INC,2022-02-22,Risk Factors\n31
6,AAL,AMERICAN AIRLINES GROUP INC,2021-02-17,Risk Factors\n28
0,AAL,AMERICAN AIRLINES GROUP INC,2020-02-19,Risk Factors\n16
10,AAL,AMERICAN AIRLINES GROUP INC,2019-02-25,Risk Factors\n16
4,AAL,AMERICAN AIRLINES GROUP INC,2018-02-21,Risk Factors\n15
11,AAL,AMERICAN AIRLINES GROUP INC,2017-02-22,Risk Factors\n 16
8,AAL,AMERICAN AIRLINES GROUP INC,2016-02-24,Risk Factors\n 22
5,AAL,AMERICAN AIRLINES GROUP INC,2015-02-25,Risk Factors\n 30


Could not fetch CECO ENVIRONMENTAL CORP.
Could not fetch AVX CORP.


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
17,PNW,PINNACLE WEST CAPITAL CORP,2024-02-27,ITEM 1A. RISK FACTORSIn addition to the facto...
11,PNW,PINNACLE WEST CAPITAL CORP,2023-02-27,ITEM 1A. RISK FACTORSIn addition to the facto...
2,PNW,PINNACLE WEST CAPITAL CORP,2022-02-25,ITEM 1A. RISK FACTORS In addition to the fact...
16,PNW,PINNACLE WEST CAPITAL CORP,2021-02-24,ITEM 1A. RISK FACTORS In addition to the fact...
13,PNW,PINNACLE WEST CAPITAL CORP,2020-02-21,ITEM 1A. RISK FACTORS In addition to the fac...
0,PNW,PINNACLE WEST CAPITAL CORP,2019-02-22,ITEM 1A. RISK FACTORS In addition to the fac...
10,PNW,PINNACLE WEST CAPITAL CORP,2018-02-23,ITEM 1A. RISK FACTORS In addition to the fac...
4,PNW,PINNACLE WEST CAPITAL CORP,2017-02-24,ITEM 1A. RISK FACTORS In addition to the fact...
15,PNW,PINNACLE WEST CAPITAL CORP,2016-02-19,ITEM 1A. RISK FACTORS In addition to the fact...
9,PNW,PINNACLE WEST CAPITAL CORP,2015-02-20,ITEM 1A. RISK FACTORS In addition to the fact...


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
3,ABT,ABBOTT LABORATORIES,2024-02-16,ITEM 1A. RISK FACTORS In addition to th...
7,ABT,ABBOTT LABORATORIES,2023-02-17,ITEM 1A. RISK FACTORSIn addition to the other ...
1,ABT,ABBOTT LABORATORIES,2022-02-18,ITEM 1A. RISK FACTORSIn addition to the other ...
15,ABT,ABBOTT LABORATORIES,2021-02-19,ITEM 1A. RISK FACTORSIn addition to the other ...
12,ABT,ABBOTT LABORATORIES,2020-02-21,ITEM 1A. RISK FACTORSIn addition to the other ...
0,ABT,ABBOTT LABORATORIES,2019-02-22,ITEM 1A. RISK FACTORS In additi...
6,ABT,ABBOTT LABORATORIES,2018-02-16,ITEM 1A. RISK FACTORS In additi...
5,ABT,ABBOTT LABORATORIES,2017-02-17,ITEM 1A. RISK FACTORS In additi...
13,ABT,ABBOTT LABORATORIES,2016-02-19,ITEM 1A. RISK FACTORS In additi...
17,ABT,ABBOTT LABORATORIES,2015-02-27,ITEM 1A. RISK FACTORS In additi...


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
1,ACET,ACETO CORP,2024-03-19,Risk Factors\n. Investing in our common stock ...
0,ACET,ACETO CORP,2023-03-15,Risk Factors\n. Investing in our common stock ...
2,ACET,ACETO CORP,2022-03-15,Risk Factors\n. Investing in our common stock ...
4,ACET,ACETO CORP,2021-03-12,Risk Factors\n. Investing in our common stock ...
3,ACET,ACETO CORP,2020-03-12,Risk Factors\n. Investing in our common stock ...
5,ACET,ACETO CORP,2019-03-18,Risk Factors\n.Investing in our common stock i...
6,ACET,ACETO CORP,2018-03-29,Risk Factors\n.Investing in our common stock i...


Unnamed: 0,Ticker,Company Name,Fill Date,Risk Factors Text
0,ACU,ACME UNITED CORP,2024-03-07,Risk Factors\nOwnership of the Company’s secur...
6,ACU,ACME UNITED CORP,2023-03-10,Risk Factors\nOwnership of the Company’s secur...
7,ACU,ACME UNITED CORP,2022-03-30,Risk Factors\nOwnership of the Company’s secur...
1,ACU,ACME UNITED CORP,2021-03-31,Risk Factors\nOwnership of the Company’s secur...
10,ACU,ACME UNITED CORP,2020-03-13,Risk Factors\nThe Company is subject to a numb...
14,ACU,ACME UNITED CORP,2019-03-15,Risk Factors\n The Company is subject to a num...
3,ACU,ACME UNITED CORP,2018-03-15,Risk Factors\n The Company is subject to a num...
9,ACU,ACME UNITED CORP,2017-03-10,Risk Factors\n The Company is subject to a num...
15,ACU,ACME UNITED CORP,2016-03-11,Risk Factors\n The Company is subject to a num...
5,ACU,ACME UNITED CORP,2015-03-06,Risk Factors\n The Company is subject to a num...
