In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as souper
import pandas as pd
import re
import time

### Setup Functions to Help With Scraping
-----------------------------------------

In [3]:
def tableScrape(table, dollar, no_dollar):
    '''
    Function to scrape SEC tables for Meta, Inc. Expense Reports

    Only rows whose number of <td> cells equals `dollar` or `no_dollar` are
    considered. The first cell of such a row is used as the row label and the
    cells at the layout-specific positions are parsed as integers. A row is
    dropped entirely as soon as one of its value cells cannot be parsed
    (e.g. header rows holding dates or rows holding decimals).

    NOTE: values wrapped in parentheses have the parentheses stripped and are
    stored as positive integers; no sign is applied.

    Parameters: 
            table: parsed table element exposing find_all("tr"), whose rows
                   expose find_all("td") and whose cells expose .text
                   (e.g. a BeautifulSoup tag)
            dollar: number of cells in rows with dollar signs (int; 8 or 12)
            no_dollar: number of cells in rows with no dollar signs (int; 6, 9 or 10)

    Returns:
            ret_dict: dictionary mapping row label -> list of parsed integers
                      (dictionary obj). A later row with the same label
                      replaces an earlier one.

    Raises:
            ValueError: if `dollar` or `no_dollar` is not a supported cell count
    '''

    # Positions of the value cells, keyed by the total number of cells in the row
    dollar_layouts = {8: [2, 6], 12: [2, 6, 10]}
    no_dollar_layouts = {6: [1, 4], 9: [1, 4, 7], 10: [5, 8]}
    if dollar not in dollar_layouts:
        raise ValueError(
            f"unsupported dollar cell count {dollar!r}; expected one of {sorted(dollar_layouts)}"
        )
    if no_dollar not in no_dollar_layouts:
        raise ValueError(
            f"unsupported no_dollar cell count {no_dollar!r}; expected one of {sorted(no_dollar_layouts)}"
        )
    cell_indexes = {dollar: dollar_layouts[dollar], no_dollar: no_dollar_layouts[no_dollar]}

    ret_dict = {}
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        # rows with numeric values have a specific number of cells
        value_cells = cell_indexes.get(len(tds))
        if value_cells is None:
            continue
        title = tds[0].text  # row label is always the first cell in these tables
        ret_dict[title] = []
        for i in value_cells:
            try:
                amount = tds[i].text.split("\xa0")[0]  # drop anything after a non-breaking space
                if amount[0] == "(":  # IndexError on an empty cell -> row is skipped
                    amount = amount[1:-1]
                ret_dict[title].append(int(amount.replace(",", "")))
            except (ValueError, IndexError):
                # not a purely numeric row: discard it and move on to the next row
                del ret_dict[title]
                break
    return ret_dict

def getShareRepurchases(table):
    '''
    Function specifically to get share repurchase data

    Scans every row of the table and, for each row whose first cell reads
    exactly "Share repurchases", parses the second cell as an integer after
    removing its first and last character (the surrounding parentheses in
    these tables) and any thousands separators.

    Parameters:
            table: parsed table element exposing find_all("tr"), whose rows
                   expose find_all("td") and whose cells expose .text
                   (e.g. a BeautifulSoup tag)

    Returns:
            data: dictionary with the single key "Share repurchases" mapping
                  to the list of parsed values, in row order (dictionary obj)
    '''

    data = {"Share repurchases": []}
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        # rows without a label cell and a value cell cannot hold the figure
        if len(tds) < 2:
            continue
        if tds[0].text == "Share repurchases": # isolates only Share repurchase row
            wrapped = tds[1].text
            # strip the enclosing characters, then drop commas such as in "(9,366)"
            data["Share repurchases"].append(int(wrapped[1:-1].replace(",", "")))
    return data

def specifyReportType(report):
    '''
    Function to get values to use for proper report scraping

    NOTE(review): this is an unfinished placeholder. For report == "10-Q" the
    return statement references `d`, `no_dollar` and `nd`, none of which are
    defined inside this function, so the call raises NameError unless globals
    with those names happen to exist. `dollar` is only a placeholder string.
    For any other report type the function returns None.

    Arguments:
        report: the type of report being scraped (string)

    Returns:
        vals: dictionary of values to reference for tableScrape (dictionary obj),
              or None when report is not "10-Q"
    '''
    if report == "10-Q":
        dollar = "placeholder"
        return {dollar: d, no_dollar: nd}
    return

def pagePrep():
    '''
    Opens the SEC Report Filings Page for Meta, Inc. and expands the list of reports

    Relies on the module-level `browser` session. Returns nothing.
    '''

    # SEC reports for Meta, Inc.
    edgar_url = "https://www.sec.gov/edgar/browse/?CIK=1326801&owner=exclude"
    browser.visit(edgar_url)

    filing_cards = browser.find_by_id('filingsStart').find_by_css("div[class='card']")
    reports_card = filing_cards[3] # div card for quarterly and annual reports
    reports_card.click()

    # the button that lists the reports sits inside the card's first child div
    list_button = reports_card.find_by_tag("div").first.find_by_tag("button").first
    list_button.click()

    # timeout so that list can load before further actions taken
    time.sleep(1)

## Gather Necessary Data
------------------------

In [4]:
# Load the saved EDGAR filings listing (most recent report 10-Q 2024-06-30)
reports_name_list = pd.read_csv("./Resources/EDGAR_landing_page_meta.csv")

# Split the listing by form type
form_type = reports_name_list["Form type"]
quarterly_reports = reports_name_list.loc[form_type == "10-Q"]
annual_reports = reports_name_list.loc[form_type == "10-K"]

# Three quarterly reports with the latest reporting dates, newest first
recent_quarterly = quarterly_reports.sort_values("Reporting date", ascending=False).head(3)
recent_quarterly

Unnamed: 0,Form type,Form description,Filing date,Reporting date
0,10-Q,Quarterly report [Sections 13 or 15(d)],2024-08-01,2024-06-30
1,10-Q,Quarterly report [Sections 13 or 15(d)],2024-04-25,2024-03-31
3,10-Q,Quarterly report [Sections 13 or 15(d)],2023-10-26,2023-09-30


In [5]:
# Launch the Chrome browser session that pagePrep() and the scraping cells drive
# to reach the reports listing page for Meta, Inc.
browser = Browser("chrome")

In [287]:
pagePrep()
# recent_quarterly is sorted newest-first, so take the first row by position;
# a plain [0] would look up the row *labelled* 0, which only exists if the
# CSV's first row happens to survive the 10-Q filter.
latest_description = recent_quarterly['Form description'].iloc[0]
browser.links.find_by_text(latest_description).first.click()
browser.windows.current = browser.windows[1] # navigate to newly opened window

# Change XML Report into basic HTML page by dropping the "/ix?doc=" viewer prefix
clicked_url = browser.url
url_parts = clicked_url.split("/ix?doc=")
clean_url = "".join(url_parts)
browser.visit(clean_url)

'https://www.sec.gov/ix?doc=/Archives/edgar/data/0001326801/000132680124000069/meta-20240630.htm'

In [59]:
# Report pages to scrape. Order matters: the scraping loop treats the first
# entry (index 0) differently from the others when locating tables.
# NOTE(review): the first URL looks like the annual report and the other two
# like quarterly reports, judging by the dates in the file names — confirm.
urls = ["https://www.sec.gov/Archives/edgar/data/1326801/000132680124000012/meta-20231231.htm",
        "https://www.sec.gov/Archives/edgar/data/1326801/000132680124000069/meta-20240630.htm",
        "https://www.sec.gov/Archives/edgar/data/1326801/000132680124000049/meta-20240331.htm"]

# Heading text used (as a regular expression) to find the <span> next to each
# table of interest; the position in this list selects how the table is scraped.
table_texts = ["CONSOLIDATED BALANCE SHEETS",
               "CONSOLIDATED STATEMENTS OF INCOME",
              "CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY"]

In [120]:
# Load the first report page and parse its HTML for the exploratory cells below
browser.visit(urls[0])
html = browser.html
souped = souper(html, "html.parser")


In [127]:
# Exploratory scrape of the balance sheet from the currently parsed page
find_string = souped.find("span", string=re.compile(table_texts[0]))
# the table's container is two siblings after the heading's parent element
table_div = find_string.parent.next_sibling.next_sibling
table = table_div.find("table")
trs = table.find_all("tr")
# positions of the value cells, keyed by the number of cells in the row
cell_indexes = {8: [2,6], 6: [1,4]}
appendable = {}
for tr in trs:
    tds = tr.find_all("td")
    # rows with numeric values have specific number of tags
    value_cells = cell_indexes.get(len(tds))
    if value_cells is None:
        continue
    title = tds[0].text # Row label always in first cell in these tables
    appendable[title] = []
    for i in value_cells:
        try: # converting values to integers only works on cells with numeric values
            amount = tds[i].text.split("\xa0")[0]
            if amount[0] == "(":
                amount = amount[1:-1]
            appendable[title].append(int(amount.replace(",", "")))
        except (ValueError, IndexError): # drop rows that don't contain relevant numeric data
            del appendable[title]
            break

appendable

{'Cash and cash equivalents': [41862, 14681],
 'Marketable securities': [23541, 26057],
 'Accounts receivable, net': [16169, 13466],
 'Prepaid expenses and other current assets': [3793, 5345],
 'Total current assets': [85365, 59549],
 'Non-marketable equity securities': [6141, 6201],
 'Property and equipment, net': [96587, 79518],
 'Operating lease right-of-use assets': [13294, 12673],
 'Intangible assets, net': [788, 897],
 'Goodwill': [20654, 20306],
 'Other assets': [6794, 6583],
 'Total assets': [229623, 185727],
 'Accounts payable': [4849, 4990],
 'Partners payable': [863, 1117],
 'Operating lease liabilities, current': [1623, 1367],
 'Accrued expenses and other current liabilities': [24625, 19552],
 'Total current liabilities': [31960, 27026],
 'Operating lease liabilities, non-current': [17226, 15301],
 'Long-term debt': [18385, 9923],
 'Other liabilities': [8884, 7764],
 'Total liabilities': [76455, 60014],
 'Additional paid-in capital': [73253, 64444],
 'Accumulated other comp

In [149]:
# Scrape and Sort Data From Finance Reports
# One list per table kind; each receives one DataFrame per report URL.
b_sheets = []
si = []
buybacks = []
for j, url in enumerate(urls):
    print(f"pending j: {j}")
    browser.visit(url)
    time.sleep(1) # allow 1 second to load webpages before attempting to scrape tags
    html = browser.html
    souped = souper(html, "html.parser")
    for i, text in enumerate(table_texts):
        print(f"pending i: {i}")
        # Hardcoded String Text Near Relevant Table
        find_string = souped.find("span", string=re.compile(text))
        heading_div = find_string.parent
        if i != 2:
            # Navigate from nearby string to relevant table; for every report
            # but the first, the income statement sits one sibling further away
            if i == 1 and j != 0:
                table_div = heading_div.next_sibling.next_sibling.next_sibling
            else:
                table_div = heading_div.next_sibling.next_sibling
            # Grabbed table information
            table = table_div.find("table")
            # NOTE(review): this cell used to assign nodol = 10 for income
            # statements of the later reports, but that value was always
            # overwritten by 9 before use; the effective values are kept here.
            if i == 0:
                dol, nodol = 8, 6
            else:
                dol, nodol = 12, 9
            data = tableScrape(table, dol, nodol)
        else:
            # Stockholders' equity: the table is three siblings away for every
            # report but the first, and one sibling away for the first
            if j != 0:
                table_div = heading_div.next_sibling.next_sibling.next_sibling
            else:
                table_div = heading_div.next_sibling
            # Grabbed table information
            table = table_div.find("table")
            data = getShareRepurchases(table)
        df = pd.DataFrame(data)
        # table_texts order: balance sheets, income statements, stockholders' equity
        (b_sheets, si, buybacks)[i].append(df)
        

pending j: 0
pending i: 0
pending i: 1
pending i: 2
pending j: 1
pending i: 0
pending i: 1
pending i: 2
pending j: 2
pending i: 0
pending i: 1
pending i: 2


In [153]:
# Stack the per-report frames of each table kind into a single frame
all_dfs = [b_sheets, si, buybacks]
organized = [pd.concat(frames, axis=0) for frames in all_dfs]
organized[1]

Unnamed: 0,Revenue,Cost of revenue,Research and development,Marketing and sales,General and administrative,Total costs and expenses,Income from operations,"Interest and other income (expense), net",Income before provision for income taxes,Provision for income taxes,Net income,Basic,Diluted
0,134902,25959,38483,12301,11408,88151,46751,677,47428,8330,39098,2574,2629
1,116609,25249,35338,15262,11816,87665,28944,125,28819,5619,23200,2687,2702
2,117929,22649,24655,14043,9829,71176,46753,531,47284,7914,39370,2815,2859


In [155]:
# Inspect the second frame collected in `si`
si[1]

In [102]:
# Hardcoded String Text Near Important Table
# (exact-match lookup: the search string includes a trailing space)
find_string = souped.find("span", string="CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY ")

# Navigated from nearby string to relevant table
table_div = find_string.parent.next_sibling

# Grabbed table information 
table = table_div.find("table")

Unnamed: 0,test,test2
0,value,value2
1,row2,row2v


In [101]:
# Display the element currently bound to `table`
table