In [12]:
import os
import re
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

from get_edgar_data import ParseXML


In [2]:
# Download the SEC EDGAR filing text with a downloader, since web crawling is forbidden
# from sec_edgar_downloader import Downloader

### Turn filings into dataframe

In [41]:
class ParseXML:
    """Aggregate the non-derivative transactions of a Form 4 XML filing.

    Every public method takes the path of the XML file, parses it, and reduces
    the ``nonDerivativeTable/nonDerivativeTransaction`` entries to one number.
    A transaction that lacks a field required by a statistic (element missing
    or its text blank) is skipped for that statistic instead of aborting the
    whole filing. Malformed XML and non-numeric values still raise.
    """

    # Element paths, relative to the document root / to one transaction.
    _TRANSACTIONS = "./nonDerivativeTable/nonDerivativeTransaction"
    _ACTION = "./transactionAmounts/transactionAcquiredDisposedCode/value"
    _SHARES = "./transactionAmounts/transactionShares/value"
    _PRICE = "./transactionAmounts/transactionPricePerShare/value"

    @staticmethod
    def _text(node, path):
        """Return the stripped text found at ``path`` below ``node``, or None if absent or blank."""
        found = node.find(path)
        if found is None or found.text is None:
            return None
        return found.text.strip() or None

    @staticmethod
    def _number(node, path):
        """Return the value at ``path`` below ``node`` as a float, or None if absent or blank."""
        text = ParseXML._text(node, path)
        return None if text is None else float(text)

    @staticmethod
    def _transactions(xmlpath):
        """Parse ``xmlpath`` and return one ``(action, shares, price)`` tuple per transaction.

        ``action`` is the acquired/disposed code as a string, ``shares`` and
        ``price`` are floats; each item is None when the filing does not
        provide it.
        """
        # getroot() always yields the root element after a successful parse,
        # so no None check is needed here.
        root = ET.parse(xmlpath).getroot()
        return [
            (
                ParseXML._text(t, ParseXML._ACTION),
                ParseXML._number(t, ParseXML._SHARES),
                ParseXML._number(t, ParseXML._PRICE),
            )
            for t in root.findall(ParseXML._TRANSACTIONS)
        ]

    @staticmethod
    def calc_transactionAmounts(xmlpath):
        """Calculate the signed total transaction amount in $ of a Form 4 XML filing.

        Each transaction contributes ``shares * price``, negated when its
        action code is "D" (disposed) and positive otherwise. A missing price
        counts as 0. Transactions without an action code or a share count are
        skipped.

        Returns the total rounded to 2 decimals (0 when nothing was counted).
        """
        total = 0
        for action, shares, price in ParseXML._transactions(xmlpath):
            if action is None or shares is None:
                continue
            sign = -1 if action == "D" else 1
            total += sign * shares * (0.0 if price is None else price)
        return round(total, 2)

    @staticmethod
    def calc_transactionPricePerShare(xmlpath):
        """Calculate the share-weighted average price per share in $ of a Form 4 XML filing.

        Only transactions that state both a price and a share count take part.
        Returns ``np.nan`` when there is no such transaction or when their
        share counts sum to zero.
        """
        weighted = []
        volumes = []
        for _action, shares, price in ParseXML._transactions(xmlpath):
            if shares is None or price is None:
                continue
            weighted.append(price * shares)
            volumes.append(shares)

        if not volumes:
            return np.nan
        total_shares = np.sum(volumes)
        if total_shares == 0:
            # Avoid a 0-division warning; the average is undefined here.
            return np.nan
        return np.sum(weighted) / total_shares

    @staticmethod
    def calc_transactionShares(xmlpath):
        """Calculate the net number of shares transacted in a Form 4 XML filing.

        Shares of transactions with action code "D" (disposed) are subtracted,
        all others are added. Transactions without an action code or a share
        count are skipped. Returns ``np.nan`` when nothing was counted.
        """
        signed = []
        for action, shares, _price in ParseXML._transactions(xmlpath):
            if action is None or shares is None:
                continue
            sign = -1 if action == "D" else 1
            signed.append(sign * shares)

        return np.sum(signed) if signed else np.nan

    @staticmethod
    def calc_absTransactionShares(xmlpath):
        """Calculate the total number of shares transacted in a Form 4 XML filing, ignoring direction.

        Share counts are summed as stated in the filing, regardless of the
        acquired/disposed code. Returns ``np.nan`` when no transaction states
        a share count.
        """
        volumes = [
            shares
            for _action, shares, _price in ParseXML._transactions(xmlpath)
            if shares is not None
        ]
        return np.sum(volumes) if volumes else np.nan

In [23]:
def get_filed_date(filingDir):
    """Return the filing date recorded in a filing's ``full-submission.txt``.

    Parameters
    ----------
    filingDir : str
        Directory of one filing; it must contain ``full-submission.txt``.

    Returns
    -------
    pd.Timestamp
        The date taken from the ``FILED AS OF DATE:`` header, which is
        written as eight digits (e.g. ``20180109``).

    Raises
    ------
    ValueError
        If the file contains no ``FILED AS OF DATE:`` header.
    """
    submission_path = filingDir + "/full-submission.txt"
    with open(submission_path, "r") as f:
        content = f.read()

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    match = re.search(r"FILED AS OF DATE:\s*([0-9]{8})", content)
    if match is None:
        raise ValueError(f"no 'FILED AS OF DATE:' header found in {submission_path}")
    return pd.Timestamp(match.group(1))

# Smoke test on a single filing. Forward slashes are used throughout so the
# path resolves on both Windows and POSIX systems (backslash separators only
# work on Windows).
filingDir = "./sec-edgar-filings/AAP/4/0000921895-18-000108"
xmlpath = filingDir + "/filing-details.xml"
ParseXML.calc_transactionAmounts(xmlpath=xmlpath)

191.72

### Get EDGAR stats for AAP

In [138]:
edgarDir = "./sec-edgar-filings/"
ticker = "AAP"
# Directory holding the form "4" filings of `ticker`, one sub-directory per filing.
filingsDir = os.path.join(edgarDir, ticker, "4")

# Parallel lists: position i in every list describes the same filing.
dates = []
transactionPricePerShare = []
transactionShares = []
absTransactionShares = []
transactionAmounts = []
filingName = []

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# row order of the resulting table reproducible between runs.
for filingDir in sorted(os.listdir(filingsDir)):
    filingPath = os.path.join(filingsDir, filingDir)
    if not os.path.isdir(filingPath):
        # Stray files next to the filing directories are not filings.
        continue
    xmlpath = os.path.join(filingPath, "filing-details.xml")

    dates.append(get_filed_date(filingPath))
    transactionPricePerShare.append(ParseXML.calc_transactionPricePerShare(xmlpath=xmlpath))
    transactionShares.append(ParseXML.calc_transactionShares(xmlpath=xmlpath))
    absTransactionShares.append(ParseXML.calc_absTransactionShares(xmlpath=xmlpath))
    transactionAmounts.append(ParseXML.calc_transactionAmounts(xmlpath=xmlpath))
    filingName.append(filingDir)

In [139]:
# One row per filing, built from the parallel per-filing lists and indexed by
# the filing date.
df_data = pd.DataFrame(
    {
        "date": dates,
        "transactionPricePerShare": transactionPricePerShare,
        "transactionShares": transactionShares,
        "absTransactionShares": absTransactionShares,
        "transactionAmounts": transactionAmounts,
        "filingName": filingName,
    }
).set_index("date")

In [140]:
# Persist the per-filing table unchanged.
df_data.to_csv(f"./edgar_data/{ticker}.csv")

# Collapse to one row per filing date. numeric_only=True keeps the text column
# `filingName` out of the aggregation; summing it would concatenate the names.
# NOTE(review): transactionPricePerShare is a per-filing average, so its daily
# sum is not an average price - confirm this is what downstream code expects.
df_stats = df_data.groupby(level=0).sum(numeric_only=True)
# Counts the rows (filings) per date that have a transactionAmounts value.
df_stats['numTransactions'] = df_data.groupby("date")['transactionAmounts'].count()
df_stats.to_csv(f"./stock_edgar_stats/{ticker}.csv")

# The stock CSV index is read back as text, so the stats index is rendered as
# 'YYYY-MM-DD' strings before the two frames are aligned on it.
df_stats.index = df_stats.index.strftime('%Y-%m-%d')
df_stock = pd.read_csv(f"./stock_data/{ticker}.csv", index_col=0)
# Outer merge keeps every date present in either frame.
df_stock.merge(df_stats, left_index=True, right_index=True, how='outer').to_csv(f"./bt_stock_data/{ticker}.csv")

In [137]:
# Show the price data aligned with the filing stats on their shared index;
# the outer merge keeps dates that occur in only one of the two frames.
pd.merge(df_stock, df_stats, how="outer", left_index=True, right_index=True)

Unnamed: 0_level_0,ticker,open,high,low,close,volume,adj_factor,transactionPricePerShare,transactionShares,absTransactionShares,transactionAmounts,numTransactions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-02,AAP,100.900002,107.930000,99.709999,106.089996,2624529.0,1.0,,,,,
2018-01-03,AAP,106.419998,107.660004,105.300003,107.050003,2086814.0,1.0,,,,,
2018-01-04,AAP,107.750000,111.528999,107.480003,111.000000,2104476.0,1.0,,,,,
2018-01-05,AAP,111.959999,113.150002,110.300003,112.180000,1996658.0,1.0,,,,,
2018-01-08,AAP,112.050003,112.290001,110.639999,111.389999,1257110.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,AAP,233.369995,236.740005,232.210007,236.500000,347578.0,1.0,,,,,
2021-12-28,AAP,236.029999,238.990005,234.899994,238.130005,601197.0,1.0,,,,,
2021-12-29,AAP,238.940002,242.199997,238.365005,241.029999,510397.0,1.0,,,,,
2021-12-30,AAP,241.139999,241.990005,237.464996,237.520004,569530.0,1.0,,,,,
