In [9]:
from splinter import Browser
from bs4 import BeautifulSoup as souper
import pandas as pd
import re
import time

### Setup Functions to Help With Scraping
-----------------------------------------

In [82]:
def tableScrape(table, formtype, tabletype):
    '''
    Function to scrape SEC tables for Meta, Inc. Expense Reports

    Parameters: 
            table: html text to scrape (string)
            formtype: specify the form type (string)
            tabletype: table from report being scraped (string)

    Returns:
            ret_dict: dictionary containing relevant table data (dictionary obj)
    '''
    
    # Select index values for rows with a dollar sign vs rows without
    dollar, no_dollar, cell_indexes = specifyReportType(formtype, tabletype)   
    # Get relevant rows from table
    ret_dict = {}
    for tr in trs:
        tds = tr.find_all("td")
        if type(tds) != "bool": # Filters boolean rows which wouldn't have a len property
            ltds = len(tds)
            if (ltds == no_dollar) | (ltds == dollar): # rows with numeric values have specific number of tags
                for i in range(len(tds)):
                    try: # converting values to integers only works on cells with numeric values
                        if i == 0:
                            title = tds[i].text # Row label always in first row in these tables
                            ret_dict[title] = []
                        if i in cell_indexes[ltds]:
                            amount = tds[i].text
                            amount = amount.split("\xa0")[0]
                            if amount[0] == "(":
                                amount = amount[1:-1]
                            amounts = amount.split(",")
                            amount = int("".join(amounts))
                            ret_dict[title] += [amount]
                    except: # skip rows that don't contain relevant numeric data
                        try:
                            del ret_dict[title]
                        except:
                            continue
                        continue
    
    return ret_dict

def getShareRepurchases(table):
    '''
    Function specifically to get share repurchase data

    Parameters:
            table: html text to scrape (string)

    Returns:
            data: dictionary containing share repurchase data (dictionary obj)
    '''
    
    # Get Share Repurchase Data
    trs = table.find_all("tr")
    data = {"Share repurchases": []}
    for tr in trs:
        tds = tr.find_all("td")
        if tds[0].text == "Share repurchases": # isolates only Share repurchase row
            shares = [int(tds[1].text[1:-1])]
            data["Share repurchases"] += shares
    return data

def listIndexes(num_cells, rtype):
    '''
    Generates list of indexes to scrape for each report type

    Arguments:
        num_cells: number of cells in row (int)
        rtype: choice for columns with dollar signs and without (string)
    Returns:
        ret_list: list of indexes for cells with numeric data
    '''
    ret_list = [] 
    counter = num_cells - 2
    if rtype == "dollar":
        while counter > 0:
            ret_list.append(counter)
            counter -= 4
    if rtype == "no_dollar":
        while counter > 0:
            ret_list.append(counter)
            counter -= 3
    return ret_list

def specifyReportType(formtype, tabletype):
    '''
    Function to get values to use for proper report scraping

    Arguments:
        formtype: time interval of the form being scraped (string)
        tabletype: the name of the table being scraped (string)

    Returns:
        dollar: number of cells for rows with dollar signs (int)
        no_dollar: number of cells for rows with no dollar signs (int)
        indexes: dictionary of values to reference for tableScrape (dictionary obj)
    '''
    if tabletype == "CONSOLIDATED BALANCE SHEETS":
        dollar = 8
        no_dollar = 6
    if formtype == "10-Q":
        if tabletype == "CONSOLIDATED STATEMENTS OF INCOME":
            dollar = 16
            no_dollar = 12
    if formtype == "10-K":
        if tabletype == "CONSOLIDATED STATEMENTS OF INCOME":
            dollar = 12
            no_dollar = 9
    d = listIndexes(dollar, "dollar")
    nd = listIndexes(no_dollar, "no_dollar")
    indexes = {dollar: d, no_dollar: nd}
    return dollar, no_dollar, indexes

def pagePrep():
    '''
    Navigates to SEC Report Filings Page for Meta, Inc. and Lists Reports
    '''
    # Navigate to main tab and close all others
    try:
        browser.windows.current = browser.windows[0]
        window = browser.windows[0]
        window.close_others()
    except:
        None
    
    # Navigate to EDGAR landing page for specified company
    base_url = "https://www.sec.gov/edgar/browse/?CIK=1326801&owner=exclude" # SEC reports for Meta, Inc.
    browser.visit(base_url)
    parent_card_div = browser.find_by_id('filingsStart')
    all_card_divs = parent_card_div.find_by_css("div[class='card']")
    third_div = all_card_divs[3] # div card for quarterly and annual reports
    third_div.click() # activate javascript to open buttons
    third_div_child = third_div.find_by_tag("div").first
    third_div_child.find_by_tag("button").first.click() # button action to list reports
    time.sleep(1) # timeout so that webpage can load before further actions taken
    return

def pageVisit(report):
    '''
    Function to get to specified report page
    '''
    pagePrep()
    browser.links.find_by_text(report).first.click()
    browser.windows.current = browser.windows[1] # navigate to newly opened window

    # Change XML Report into basic HTML page
    clicked_url = browser.url
    url_parts = clicked_url.split("/ix?doc=")
    clean_url = "".join(url_parts)
    browser.visit(clean_url)
    return

def findEndBracket(text):
    '''
    Function to find index value of End Bracket in text

    Parameters:
        text: text string to search (string)
    Returns:
        i: index value of text (int)
    '''
    for i, data in enumerate(text):
        if data == "]":
            return i
    return None

def getReportsList(title):
    '''
    Build DataFrame of the list of reports on the first page of the Reports list

    Parameters:
        title: company name being used (string)
    Returns:
        form_frame: dataframe of reprots on first page of list (DataFrame Object)
    '''
    # Build DataFrame
    pagePrep()
    html = browser.html
    souped = souper(html, "html.parser")
    header_div = souped.find("div", class_="dataTables_scrollHead")
    header_table = header_div.find("table")
    headers = [h.text for h in header_table.find_all("th")]
    data_table = souped.find("table", id="filingsTable")
    body = data_table.find("tbody")
    data = [[td.text for td in tr.find_all("td")] for tr in body.find_all("tr")]
    form_frame = pd.DataFrame(data=data, columns=headers)
    
    # Clean Columns and export/return cleaned DataFrame
    extra_text = "View all with same reporting date"
    form_frame["Form description"] = [data[:1+findEndBracket(data)] for data in form_frame["Form description"].values]
    form_frame["Reporting date"] = [data[: -len(extra_text)] for data in form_frame["Reporting date"].values]
    form_frame.to_csv(f'./Resources/EDGAR_landing_page_{title}.csv')
    return form_frame

def getTable(ttext):
    '''
    Function to return all tags from specified table

    Parameters:
        ttext: the text needed to find specific table (string)
    Returns:
        table: all tags/text from specifed table (string)
    '''
    find_string = souped.find("span", string=re.compile(ttext))
    if find_string == None:
        print("Table Text Not Found")
        return
    table_div = find_string.parent
    table = table_div.table
    while table == None:
        table_div = table_div.next_sibling
        table = table_div.table
    return table

## Gather Necessary Data
------------------------

In [17]:
# Setup Scraping Browser
browser = Browser("chrome")

In [53]:
reports_df = getReportsList("meta")
reports_df

Unnamed: 0,Form type,Form description,Filing date,Reporting date
0,10-Q,Quarterly report [Sections 13 or 15(d)],2024-08-01,2024-06-30
1,10-Q,Quarterly report [Sections 13 or 15(d)],2024-04-25,2024-03-31
2,10-K,"Annual report [Section 13 and 15(d), not S-K I...",2024-02-02,2023-12-31
3,10-Q,Quarterly report [Sections 13 or 15(d)],2023-10-26,2023-09-30
4,10-Q,Quarterly report [Sections 13 or 15(d)],2023-07-27,2023-06-30
5,10-Q,Quarterly report [Sections 13 or 15(d)],2023-04-27,2023-03-31
6,10-K,"Annual report [Section 13 and 15(d), not S-K I...",2023-02-02,2022-12-31
7,10-Q,Quarterly report [Sections 13 or 15(d)],2022-10-27,2022-09-30
8,10-Q,Quarterly report [Sections 13 or 15(d)],2022-07-28,2022-06-30
9,10-Q,Quarterly report [Sections 13 or 15(d)],2022-04-28,2022-03-31


In [72]:
num_reports = 3
recent_quarterly = reports_df[reports_df["Form type"] == "10-Q"][:num_reports]
table_texts = ["CONSOLIDATED BALANCE SHEETS",
               "CONSOLIDATED STATEMENTS OF INCOME",
              "CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY"]

In [90]:
df_list = []
for report in recent_quarterly["Form description"]:
    pageVisit(form)
    html = browser.html
    souped = souper(html, "html.parser")
    for i, text in enumerate(table_texts):
        print(i)
        table = getTable(text)
        if i != 2:
            data = tableScrape(table, form, text)
        else:
            data = getShareRepurchases(table)
        df = pd.DataFrame(data)
        df_list.append(df)

0
1


UnboundLocalError: cannot access local variable 'dollar' where it is not associated with a value

In [83]:
form = reports_df["Form description"][0]
pageVisit(form)
html = browser.html
souped = souper(html, "html.parser")
text = table_texts[0]
table = getTable(text)
data = tableScrape(table, form, text)

Unnamed: 0,Cash and cash equivalents,Marketable securities,"Accounts receivable, net",Prepaid expenses and other current assets,Total current assets,Non-marketable equity securities,"Property and equipment, net",Operating lease right-of-use assets,Goodwill,Other assets,...,"Operating lease liabilities, non-current",Long-term debt,Long-term income taxes,Other liabilities,Total liabilities,Additional paid-in capital,Accumulated other comprehensive loss,Retained earnings,Total stockholders' equity,Total liabilities and stockholders' equity
0,32045,26035,14505,3846,76431,6207,102959,14058,20654,9929,...,17685,18389,7897,2500,73475,78270,2695,81188,156763,230238
1,41862,23541,16169,3793,85365,6141,96587,13294,20654,7582,...,17226,18385,7514,1370,76455,73253,2155,82070,153168,229623


In [149]:
# Scrape and Sort Data From Finance Reports
b_sheets = []
si = []
buybacks = []
for j, url in enumerate(urls):
    print(f"pending j: {j}")
    browser.visit(url)
    time.sleep(1) # allow 1 second to load webpages before attempting to scrape tags
    html = browser.html
    souped = souper(html, "html.parser")
    for i, text in enumerate(table_texts):
        print(f"pending i: {i}")
        # Hardcoded String Text Near Relevant Table
        find_string = souped.find("span", string=re.compile(text))
        if i != 2:
            # Navigated from nearby string to relevant table
            table_div = find_string.parent.next_sibling.next_sibling
            if (i == 1) & (j != 0): 
                table_div = find_string.parent.next_sibling.next_sibling.next_sibling
                dol = 12
                nodol = 10
            # Grabbed table information 
            table = table_div.find("table")
            if i == 0:
                dol = 8
                nodol = 6
            else:
                dol = 12
                nodol = 9
            data = table_scrape(table, dol, nodol)
        elif (i == 2) & (j != 0):
            table_div = find_string.parent.next_sibling.next_sibling.next_sibling
            table = table_div.find("table")
            data = get_share_repurchases(table)
        else:
            table_div = find_string.parent.next_sibling
            # Grabbed table information 
            table = table_div.find("table")
            data = get_share_repurchases(table)
        df = pd.DataFrame(data)
        match i:
            case 0:
                b_sheets.append(df)
            case 1:
                si.append(df)
            case 2:
                buybacks.append(df)
        

pending j: 0
pending i: 0
pending i: 1
pending i: 2
pending j: 1
pending i: 0
pending i: 1
pending i: 2
pending j: 2
pending i: 0
pending i: 1
pending i: 2


In [78]:
table = getTable(text)
trs = table.find_all("tr")
cell_indexes = {8: [2,6], 6: [1,4]}
appendable = {}
for tr in trs:
    tds = tr.find_all("td")
    if type(tds) != "bool": # Filters boolean rows which wouldn't have a len property
        ltds = len(tds)
        if (ltds == 6) | (ltds == 8): # rows with numeric values have specific number of tags
            for i in range(len(tds)):
                try: # converting values to integers only works on cells with numeric values
                    if i == 0:
                        title = tds[i].text # Row label always in first row in these tables
                        appendable[title] = []
                    if i in cell_indexes[ltds]:
                        amount = tds[i].text
                        amount = amount.split("\xa0")[0]
                        if amount[0] == "(":
                            amount = amount[1:-1]
                        amounts = amount.split(",")
                        amount = int("".join(amounts))
                        appendable[title] += [amount]
                except: # skip rows that don't contain relevant numeric data
                    try:
                        del appendable[title]
                    except:
                        continue
                    continue

appendable

{'Cash and cash equivalents': [32045, 41862],
 'Marketable securities': [26035, 23541],
 'Accounts receivable, net': [14505, 16169],
 'Prepaid expenses and other current assets': [3846, 3793],
 'Total current assets': [76431, 85365],
 'Non-marketable equity securities': [6207, 6141],
 'Property and equipment, net': [102959, 96587],
 'Operating lease right-of-use assets': [14058, 13294],
 'Goodwill': [20654, 20654],
 'Other assets': [9929, 7582],
 'Total assets': [230238, 229623],
 'Accounts payable': [3173, 4849],
 'Operating lease liabilities, current': [1917, 1623],
 'Accrued expenses and other current liabilities': [21914, 25488],
 'Total current liabilities': [27004, 31960],
 'Operating lease liabilities, non-current': [17685, 17226],
 'Long-term debt': [18389, 18385],
 'Long-term income taxes': [7897, 7514],
 'Other liabilities': [2500, 1370],
 'Total liabilities': [73475, 76455],
 'Additional paid-in capital': [78270, 73253],
 'Accumulated other comprehensive loss': [2695, 2155],

Unnamed: 0,Revenue,Cost of revenue,Research and development,Marketing and sales,General and administrative,Total costs and expenses,Income from operations,"Interest and other income (expense), net",Income before provision for income taxes,Provision for income taxes,Net income,Basic,Diluted
0,134902,25959,38483,12301,11408,88151,46751,677,47428,8330,39098,2574,2629
1,116609,25249,35338,15262,11816,87665,28944,125,28819,5619,23200,2687,2702
2,117929,22649,24655,14043,9829,71176,46753,531,47284,7914,39370,2815,2859


In [155]:
si[1]

In [102]:
# Hardcoded String Text Near Important Table
find_string = souped.find("span", string="CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY ")

# Navigated from nearby string to relevant table
table_div = find_string.parent.next_sibling

# Grabbed table information 
table = table_div.find("table")