In [1]:
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import matplotlib.pyplot as plt
import pandas as pd
from sec_edgar_downloader import Downloader
# https://sec-edgar-downloader.readthedocs.io/en/latest/

#### Download Files

In [2]:
# Set up a downloader that saves into the relative "data/" folder, then request
# a single (amount=1) 10-K filing for the ticker MSFT.
# NOTE(review): presumably this contacts SEC EDGAR again on every run - confirm,
# and consider skipping the call when the filing is already on disk.
dl = Downloader("data/")
dl.get("10-K", "MSFT", amount=1)

1

#### YouTube Way (Filings after 2009)

##### Functions

In [2]:
'''
Link
https://github.com/areed1192/sigma_coding_youtube/blob/master/python/python-finance/sec-web-scraping/Web%20Scraping%20SEC%20-%20Parsing%20SEC%20Documents%20-%20New%20Filings.ipynb

'''
# Function - Decode/normalize text
# SEC filings are messy, so their text needs to be decoded/normalized
# with the following function (source linked above).
# It finds the qualifying characters, decodes them, and substitutes them back into the string.
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in a Unicode string by the characters
        found at the same code points in Windows-1252, where possible.

        The whole C1 range U+0080-U+009F is covered, so the Windows-1252
        characters at 0x9A-0x9F are restored as well.

        Parameters
        ----------
        restore_string : str
            Text that may contain C1 control characters.

        Returns
        -------
        str
            The text with every C1 character replaced by its Windows-1252
            counterpart, or removed when Windows-1252 defines no character
            at that code point.
    """

    def to_windows_1252(match):
        # Re-interpret the code point as a single Windows-1252 byte.
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''

    # U+0080-U+009F is the complete C1 control block.
    return re.sub(r'[\u0080-\u009f]', to_windows_1252, restore_string)

In [3]:
# load local html file

# Path of the full-submission file, relative to the notebook's working
# directory. It points into the same "data/" folder the Downloader cell writes
# to, so the notebook does not depend on one user's absolute Windows path.
path = "data/sec-edgar-filings/MSFT/10-K/0001564590-21-039151/full-submission.txt"
with open(path) as html_file:
    # parse the whole submission with the lxml parser
    soup = BeautifulSoup(html_file, 'lxml')

##### Variable - Filing

In [4]:
# Build the container that houses all parsed filings.

# The accession number identifies the filing and is used as the top-level key.
accession_number = '0001564590-21-039151'

# Each filing entry holds two things:
#   'sec_header_content' - a dictionary for the SEC header data (empty for now)
#   'filing_documents'   - a placeholder (None) until the documents are parsed
master_filings_dict = {
    accession_number: {
        'sec_header_content': {},
        'filing_documents': None,
    },
}

In [5]:
# Locate the <sec-header> element of the submission.
sec_header_tag = soup.find('sec-header')

# Keep the tag object itself (not just its text) under the filing's header entry.
header_content = master_filings_dict[accession_number]['sec_header_content']
header_content['sec_header_code'] = sec_header_tag

# Leave the tag as the cell's last expression so the notebook renders it.
sec_header_tag

<sec-header>0001564590-21-039151.hdr.sgml : 20210729
<acceptance-datetime>20210729162155
ACCESSION NUMBER:		0001564590-21-039151
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		133
CONFORMED PERIOD OF REPORT:	20210630
FILED AS OF DATE:		20210729
DATE AS OF CHANGE:		20210729

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			MICROSOFT CORP
		CENTRAL INDEX KEY:			0000789019
		STANDARD INDUSTRIAL CLASSIFICATION:	SERVICES-PREPACKAGED SOFTWARE [7372]
		IRS NUMBER:				911144442
		STATE OF INCORPORATION:			WA
		FISCAL YEAR END:			0630

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-37845
		FILM NUMBER:		211127769

	BUSINESS ADDRESS:	
		STREET 1:		ONE MICROSOFT WAY
		CITY:			REDMOND
		STATE:			WA
		ZIP:			98052-6399
		BUSINESS PHONE:		425-882-8080

	MAIL ADDRESS:	
		STREET 1:		ONE MICROSOFT WAY
		CITY:			REDMOND
		STATE:			WA
		ZIP:			98052-6399
</acceptance-datetime></sec-header>

##### Variable - Document

In [6]:
# Parse every <document> of the submission into master_document_dict and attach
# the result to master_filings_dict[accession_number]['filing_documents'].
#
# Caveats worth knowing before re-running or extending this cell:
#   * extract() detaches each <document> (and then its <text>) from `soup`, so
#     the cell is not idempotent - reload `soup` before running it again.
#   * entries are keyed by the document <type>; documents that share a type
#     overwrite each other, so only the last one of each type is kept.
#   * 'document_code' is the document tag itself, so once its <text> has been
#     extracted the stored tag no longer contains that text.


def _direct_text(tag):
    """Return the stripped text sitting directly inside `tag`.

    Returns None when `tag` is None (the document has no such tag) or when the
    tag has no text node of its own, instead of failing with AttributeError.
    """
    if tag is None:
        return None
    # recursive=False restricts the lookup to the tag's direct children, so
    # text that belongs to nested tags is ignored.
    own_text = tag.find(text=True, recursive=False)
    return None if own_text is None else own_text.strip()


# initialize dict for documents filed
master_document_dict = {}

# loop through
for filing_document in soup.find_all('document'):

    # document id: the type doubles as the dictionary key, so it is mandatory
    document_id = _direct_text(filing_document.type)
    if document_id is None:
        raise ValueError('found a <document> without a <type> value')

    # sequence, file name and description; each is None when the tag is missing
    document_sequence = _direct_text(filing_document.sequence)
    document_filename = _direct_text(filing_document.filename)
    document_description = _direct_text(filing_document.description)

    # initialize the document dictionary with the parsed header parts
    master_document_dict[document_id] = {
        'document_sequence': document_sequence,
        'document_filename': document_filename,
        'document_description': document_description,
    }

    # detach the document from the soup and keep the tag itself
    master_document_dict[document_id]['document_code'] = filing_document.extract()

    # pull the <text> portion out of the document (None when there is none)
    text_tag = filing_document.find('text')
    filing_doc_text = text_tag.extract() if text_tag is not None else None

    # page-break markers inside the text; not consumed yet, see the TODO below
    if filing_doc_text is not None:
        all_thematic_breaks = filing_doc_text.find_all('hr',{'style':'page-break-after:always'})
    else:
        all_thematic_breaks = []

    # TODO: split each document into pages and store them under 'pages_code':
    #   1. convert every thematic break to its string form,
    #   2. join the re.escape()d breaks with '|' into one delimiter pattern,
    #   3. re.split() str(filing_doc_text) on that pattern,
    #   4. when there are no breaks, store the whole text as a single page.

# store the documents in the master_filing_dictionary.
master_filings_dict[accession_number]['filing_documents'] = master_document_dict

print('-'*80)
print('All the documents for filing {} were parsed and stored.'.format(accession_number))



--------------------------------------------------------------------------------
All the documents for filing 0001564590-21-039151 were parsed and stored.


In [7]:
# Display the assembled filing dictionary as the cell output.
# NOTE(review): this renders every stored tag, so the output is very large -
# consider inspecting only the keys or a single entry.
master_filings_dict

{'0001564590-21-039151': {'sec_header_content': {'sec_header_code': <sec-header>0001564590-21-039151.hdr.sgml : 20210729
   <acceptance-datetime>20210729162155
   ACCESSION NUMBER:		0001564590-21-039151
   CONFORMED SUBMISSION TYPE:	10-K
   PUBLIC DOCUMENT COUNT:		133
   CONFORMED PERIOD OF REPORT:	20210630
   FILED AS OF DATE:		20210729
   DATE AS OF CHANGE:		20210729
   
   FILER:
   
   	COMPANY DATA:	
   		COMPANY CONFORMED NAME:			MICROSOFT CORP
   		CENTRAL INDEX KEY:			0000789019
   		STANDARD INDUSTRIAL CLASSIFICATION:	SERVICES-PREPACKAGED SOFTWARE [7372]
   		IRS NUMBER:				911144442
   		STATE OF INCORPORATION:			WA
   		FISCAL YEAR END:			0630
   
   	FILING VALUES:
   		FORM TYPE:		10-K
   		SEC ACT:		1934 Act
   		SEC FILE NUMBER:	001-37845
   		FILM NUMBER:		211127769
   
   	BUSINESS ADDRESS:	
   		STREET 1:		ONE MICROSOFT WAY
   		CITY:			REDMOND
   		STATE:			WA
   		ZIP:			98052-6399
   		BUSINESS PHONE:		425-882-8080
   
   	MAIL ADDRESS:	
   		STREET 1:		ONE MICROSO

#### Professor Way

In [3]:
# load local html file

# Path of the full-submission file, relative to the notebook's working
# directory (the same "data/" folder the Downloader cell writes to), instead of
# one user's absolute Windows path.
path = "data/sec-edgar-filings/MSFT/10-K/0001564590-21-039151/full-submission.txt"
with open(path) as html_file:
    # An empty parser name matches no installed tree builder, so the parser is
    # named explicitly; 'lxml' is the one the earlier load cell uses.
    soup = BeautifulSoup(html_file, 'lxml')

In [10]:
# Collect every <table> element of the parsed submission and display the result.
# NOTE(review): the full list is shown as the cell output - consider slicing it
# or showing len(...) first.
soup.find_all("table")

In [10]:
# Look up the first <p> element whose id attribute is 'BALANCE SHEETS'.
# NOTE(review): presumably meant to locate the balance sheet section - confirm
# that the filing really uses this id.
soup.find("p",{'id':'BALANCE SHEETS'})

1

In [None]:
'''
SEC Parsing youtube
https://www.youtube.com/watch?v=TxUmufNnIaA&t=2s

Github
https://github.com/areed1192/sigma_coding_youtube/blob/master/python/python-finance/sec-web-scraping/Web%20Scraping%20SEC%20-%20Parsing%20SEC%20Documents%20-%20New%20Filings.ipynb

sec-edgar-downloader documentation
https://sec-edgar-downloader.readthedocs.io/en/latest/

beautifulsoup
https://www.youtube.com/watch?v=ng2o98k983k




'''