# Import Relevant Libraries

In [1]:
import requests
import bs4 as bs
import pandas as pd
from fake_useragent import UserAgent
import unicodedata
import json
import os
import time
import lxml
import cchardet
import stopit
import itertools

# Environment Specifics

In [2]:
# Report the interpreter and library versions used for this run, one per line
_version_report = (
    ('python version: ', os.sys.version),
    ('pandas version: ', pd.__version__),
    ('requests version: ', requests.__version__),
    ('bs4 version: ', bs.__version__),
    ('lxml version: ', lxml.__version__),
    ('cchardet version: ', cchardet.__version__),
    ('stopit version: ', stopit.__version__),
)
for _label, _version in _version_report:
    print(_label, _version)

python version:  3.10.6 | packaged by conda-forge | (main, Aug 22 2022, 20:38:29) [Clang 13.0.1 ]
pandas version:  1.5.0
requests version:  2.28.1
bs4 version:  4.11.1
lxml version:  4.9.3
cchardet version:  2.1.7
stopit version:  1.1.2


# Helper Functions

Helper functions are defined below. These functions are used to scrape the data from the SEC and Wikipedia. Each function is carefully documented.

In [2]:
# Default request headers, built once when this cell is executed (the same moment
# the original default argument was evaluated), so every call reuses one user agent
_DEFAULT_HEADERS = {'User-Agent': UserAgent().chrome}


def get_soup(url: str,
             headers: dict = None,
             max_timeout: int = 60) -> bs.BeautifulSoup:
    """Get the soup object from a URL

    The page is downloaded once and then parsed with the lxml parser. Each parsing
    attempt is limited to `max_timeout` seconds. If parsing the full document times
    out, parsing is retried on the div tags only, and then on the p tags only.

    Args:
        url (str): URL.
        headers (dict, optional): Headers to be used in the request. Defaults to None,
            in which case a Chrome user agent header is used.
        max_timeout (int, optional): Maximum timeout in seconds, applied to the HTTP
            request and to each parsing attempt. Defaults to 60.

    Returns:
        bs.BeautifulSoup: Soup object, or None if every parsing attempt timed out.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails, including
            when the server does not respond within `max_timeout` seconds.
    """
    # Copy the shared defaults so a caller can never mutate them by accident
    if headers is None:
        headers = dict(_DEFAULT_HEADERS)

    # Get the response; without a timeout a stalled server would block forever
    response = requests.get(url, headers=headers, timeout=max_timeout)
    html = response.text

    # Try the full document first, then progressively narrower subsets of tags
    for strained_tag in (None, 'div', 'p'):
        parse_only = None if strained_tag is None else bs.SoupStrainer(strained_tag)
        with stopit.ThreadingTimeout(max_timeout) as timeout_ctx:
            soup = bs.BeautifulSoup(html, 'lxml', parse_only=parse_only)
        # If the parsing is successful, return the soup object
        if timeout_ctx.state == timeout_ctx.EXECUTED:
            return soup

    # If no parsing attempt is successful, return None
    return None

def find_tags_with_text(soup: bs.BeautifulSoup,
                        text_start: str = '', 
                        text_end: str = '', 
                        types: list = ['div', 'p']) -> list:
    """Find the tags whose text starts with one given text and ends with another

    Matching is case-insensitive and is done on NFKD-normalized text. A tag matches
    when its stripped text starts with `text_start` and either ends with `text_end`,
    ends with `text_end` once its last character is dropped, or is followed by a
    candidate tag whose stripped text starts with `text_end`.

    Args:
        soup (bs.BeautifulSoup): Soup object.
        text_start (str): Starting text of the tag. Defaults to ''.
        text_end (str): Ending text of the tag. Defaults to ''.
        types (list): List of types of tag to be searched for. Defaults to ['div', 'p'].

    Returns:
        list: List of tags that match the criteria.
    """

    def normalized_text(tag):
        # Lower-case the tag text and apply NFKD normalization
        return unicodedata.normalize('NFKD', tag.text.lower())

    # Lower-case the texts searched for
    wanted_start = text_start.lower()
    wanted_end = text_end.lower()

    # Candidates are the tags that contain either text, gathered one tag type at a time
    candidates = [
        tag
        for tag_name in types
        for tag in soup.find_all(tag_name)
        if wanted_end in normalized_text(tag) or wanted_start in normalized_text(tag)
    ]

    # Pair the stripped text of each candidate with that of the candidate after it
    # (the last candidate is paired with an empty text)
    stripped_texts = [normalized_text(tag).strip() for tag in candidates]
    following_texts = stripped_texts[1:] + ['']

    # Keep the candidates that satisfy one of the matching criteria
    return [
        tag
        for tag, own_text, following_text in zip(candidates, stripped_texts, following_texts)
        if own_text.startswith(wanted_start) and (
            own_text.endswith(wanted_end)
            or own_text[:-1].endswith(wanted_end)
            or following_text.startswith(wanted_end)
        )
    ]

def get_text_between_tags(soup: bs.BeautifulSoup,
                          start_tag: bs.element.Tag, 
                          end_tag: bs.element.Tag,
                          types: list = ['div', 'p']) -> str:
    """Get the text between two tags

    The text of every tag of the given types, from the starting tag (included) up to
    the ending tag (excluded), is joined with single spaces.

    Args:
        soup (bs.BeautifulSoup): Soup object.
        start_tag (bs.element.Tag): Starting tag.
        end_tag (bs.element.Tag): Ending tag.
        types (list): List of types of tag to be considered. Defaults to ['div', 'p'].

    Returns:
        str: Text between the two tags. Empty if the starting tag is after the ending tag.

    Raises:
        ValueError: If one of the two tags is not among the tags of the given types.
    """
    # Get all tags
    all_tags = soup.find_all(types)

    def position_of(target):
        # Look for the very same object first: tags compare equal whenever their
        # markup is equal, so an equality search could stop at an earlier duplicate
        # (e.g. a table of contents entry) instead of the tag that was passed in
        for position, tag in enumerate(all_tags):
            if tag is target:
                return position
        # Otherwise fall back to an equality search, which raises ValueError if absent
        return all_tags.index(target)

    # Get the index of the starting and ending tags
    start_tag_index = position_of(start_tag)
    end_tag_index = position_of(end_tag)

    # If the starting tag is after the ending tag, return empty
    if start_tag_index > end_tag_index:
        return ''

    # Extract and join the text between the two tags
    return ' '.join(tag.text for tag in all_tags[start_tag_index:end_tag_index])

def get_item1_from_soup(soup: bs.BeautifulSoup) -> str:
    """Get the item 1 text from a soup object

    Several ways of recognizing the heading that opens the section and the heading
    that closes it are tried in turn. The first one that yields more than 10
    characters of text wins.

    Args:
        soup (bs.BeautifulSoup): Soup object.

    Returns:
        str: Item 1 text. Empty if no combination yields enough text.
    """
    # Texts the opening heading may start and end with
    opening_prefixes = ['item 1', 'part I', 'item']
    opening_suffixes = ['business']
    # Texts the closing heading may start and end with
    closing_prefixes = ['item 1a', 'part I', 'item']
    closing_suffixes = ['risk factors']

    # Go through every combination of the texts above
    every_combination = itertools.product(
        opening_prefixes, opening_suffixes, closing_prefixes, closing_suffixes)
    for opening_prefix, opening_suffix, closing_prefix, closing_suffix in every_combination:
        # Find the tags that look like the opening and the closing heading
        opening_tags = find_tags_with_text(soup, text_start=opening_prefix, text_end=opening_suffix)
        closing_tags = find_tags_with_text(soup, text_start=closing_prefix, text_end=closing_suffix)
        # Both headings are needed to delimit the section
        if not opening_tags or not closing_tags:
            continue
        # Extract the text between the last opening and the last closing heading
        section_text = get_text_between_tags(soup, opening_tags[-1], closing_tags[-1])
        if len(section_text) > 10:
            return section_text

    # If no combination works, return empty
    return ''

# Scrape SEC Data

The cell below scrapes Item 1 of each company's 10-K filing from the SEC website and saves the results as JSON files, one per company (named after the company's CIK), in the `text_data` directory. A detailed description of the process followed can be found in the accompanying report.

In [3]:
# Load mappings (the `cik` and `url` columns are the ones used below)
mappings = pd.read_csv('mappings.csv')

# Only consider companies with a 10-K URL (rows with a missing `url` are dropped)
mappings = mappings.dropna(subset=['url'])

# Get the text from Item 1 of the 10-K documents
# One JSON file per company is written to the `text_data` directory
os.makedirs('text_data', exist_ok=True)
for cik, url in mappings[['cik', 'url']].values:
    # Skip companies that already have an output file, so the cell can be re-run to resume
    if os.path.exists(f"text_data/{cik}.json"):
        continue
    start = time.time()
    print(f"Getting soup for {cik}")
    soup = get_soup(url)
    # get_soup returns None when every parsing attempt timed out; no file is written
    # in that case, so the company is attempted again on the next run
    if soup is None:
        continue
    print(f"Getting item 1 for {cik}")
    text = get_item1_from_soup(soup)
    # Time spent on this company, formatted as HH:MM:SS
    end = time.strftime("%H:%M:%S", time.gmtime(time.time() - start))
    # NOTE(review): the key is 'item_1A' but the value is the text returned by
    # get_item1_from_soup - confirm with the consumers of these files before renaming
    res = {'cik': cik, 'item_1A': text, 'time': end}
    # The file is written even when the extracted text is empty
    with open(f"text_data/{cik}.json", "w") as outfile:
        json.dump(res, outfile)

Getting soup for 8670
Getting soup for 860731
Getting soup for 759944
Getting item 1 for 759944
Getting soup for 1022079
Getting item 1 for 1022079
Getting soup for 920522
Getting item 1 for 920522
Getting soup for 217346
Getting item 1 for 217346
Getting soup for 60086
Getting item 1 for 60086
Getting soup for 62996
Getting item 1 for 62996
Getting soup for 1601712
Getting item 1 for 1601712
Getting soup for 1679273
Getting item 1 for 1679273
Getting soup for 945841
Getting item 1 for 945841
Getting soup for 93556
Getting item 1 for 93556
Getting soup for 91440
Getting item 1 for 91440
Getting soup for 1336920
Getting item 1 for 1336920
Getting soup for 789570
Getting item 1 for 789570
Getting soup for 849399
Getting item 1 for 849399
Getting soup for 75677
Getting item 1 for 75677
Getting soup for 1324404
Getting item 1 for 1324404
Getting soup for 38777
Getting item 1 for 38777
Getting soup for 1145197
Getting item 1 for 1145197
Getting soup for 23217
Getting item 1 for 23217
Gettin