In [16]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from get_edgar_data import ParseXML
import os
from tqdm import tqdm

### Turn filings into dataframe

In [36]:
class ParseXML:
    @staticmethod
    def calc_transactionAmounts(xmlpath):
        """Calculate the total transaction amount in $ of a giving form 4 in XML"""
        xml = ET.parse(xmlpath).getroot()
        total = 0

        if xml is None:
            return total

        nonDerivativeTransactions = xml.findall(
            "./nonDerivativeTable/nonDerivativeTransaction"
        )

        for t in nonDerivativeTransactions:
            # D for disposed or A for acquired
            action = t.find(
                "./transactionAmounts/transactionAcquiredDisposedCode/value"
            ).text
            # number of shares disposed/acquired
            shares = t.find("./transactionAmounts/transactionShares/value").text
            # price
            priceRaw = t.find("./transactionAmounts/transactionPricePerShare/value")
            price = 0 if priceRaw is None else priceRaw.text
            # set prefix to -1 if derivatives were disposed. set prefix to 1 if derivates were acquired.
            prefix = -1 if action == "D" else 1
            # calculate transaction amount in $
            amount = prefix * float(shares) * float(price)
            total += amount

        return round(total, 2)

    @staticmethod
    def calc_transactionPricePerShare(xmlpath):
        """Calculate the avg transaction price per share in $ of a giving form 4 in XML"""
        xml = ET.parse(xmlpath).getroot()
        if xml is None:
            return np.nan

        nonDerivativeTransactions = xml.findall(
            "./nonDerivativeTable/nonDerivativeTransaction"
        )

        prices = []
        shares = []
        for t in nonDerivativeTransactions:
            action = t.find(
                "./transactionAmounts/transactionAcquiredDisposedCode/value"
            ).text
            shareRaw = t.find("./transactionAmounts/transactionShares/value").text
            priceRaw = t.find("./transactionAmounts/transactionPricePerShare/value")
            if not priceRaw is None:
                prices.append(float(priceRaw.text) * float(shareRaw))
                shares.append(float(shareRaw))

        if (len(prices) > 0) and (len(shares) >0) and (np.sum(shares)!=0):
            return np.sum(prices) / np.sum(shares)
        else:
            return np.nan

    @staticmethod
    def calc_transactionShares(xmlpath):
        """Calculate the avg transaction price per share in $ of a giving form 4 in XML"""
        xml = ET.parse(xmlpath).getroot()

        if xml is None:
            return np.nan

        nonDerivativeTransactions = xml.findall(
            "./nonDerivativeTable/nonDerivativeTransaction"
        )

        shares = []
        for t in nonDerivativeTransactions:
            action = t.find(
                "./transactionAmounts/transactionAcquiredDisposedCode/value"
            ).text
            shareRaw = t.find("./transactionAmounts/transactionShares/value")
            prefix = -1 if action == "D" else 1
            if not shareRaw is None:
                shares.append(prefix * float(shareRaw.text))

        if len(shares) > 0:
            return np.sum(shares)
        else:
            return np.nan

    @staticmethod
    def calc_absTransactionShares(xmlpath):
        """Calculate the avg transaction price per share in $ of a giving form 4 in XML"""
        xml = ET.parse(xmlpath).getroot()
        if xml is None:
            return np.nan

        nonDerivativeTransactions = xml.findall(
            "./nonDerivativeTable/nonDerivativeTransaction"
        )

        shares = []
        for t in nonDerivativeTransactions:
            shareRaw = t.find("./transactionAmounts/transactionShares/value")
            if not shareRaw is None:
                shares.append(float(shareRaw.text))
        if len(shares) > 0:
            return np.sum(shares)
        else:
            return np.nan

In [37]:
def get_filed_date(filingDir):
    """
    return format: str, 20180109
    """
    import re
    with open(filingDir + "/full-submission.txt", "r") as f:
        content = f.read()
    date = re.search("FILED AS OF DATE:[\s]*[0-9]{8}", content).group()[-8:]
    return pd.Timestamp(date)

filingDir = "./sec-edgar-filings/AAP\\4\\0000921895-18-000108"
xmlpath = filingDir + "/filing-details.xml"
ParseXML.calc_transactionAmounts(xmlpath=xmlpath)

191.72

### Get EDGAR stats for AAPL

In [38]:
edgarDir = "./sec-edgar-filings/"
ticker = "BIO"
filingsDir = edgarDir + "/" + ticker + "/4/"
dates = []
transactionPricePerShare = []
transactionShares = []
absTransactionShares = []
transactionAmounts = []
filingName = []
for filingDir in os.listdir(filingsDir):
    dates.append(get_filed_date(filingsDir + "/" + filingDir))
    xmlpath = filingsDir + "/" + filingDir + "/filing-details.xml" 
    transactionPricePerShare.append(ParseXML.calc_transactionPricePerShare(xmlpath=xmlpath))
    transactionShares.append(ParseXML.calc_transactionShares(xmlpath=xmlpath))
    absTransactionShares.append(ParseXML.calc_absTransactionShares(xmlpath=xmlpath))
    transactionAmounts.append(ParseXML.calc_transactionAmounts(xmlpath=xmlpath))
    filingName.append(filingDir)

In [45]:
df_stock = pd.read_csv(f"./stock_data/{ticker}.csv", index_col=0)
df_stock.index[0]

'2018-01-02'

In [46]:
df_data = pd.DataFrame.from_dict({
    "date": dates,
    "transactionPricePerShare": transactionPricePerShare, 
    "transactionShares": transactionShares, 
    "absTransactionShares": absTransactionShares, 
    "transactionAmounts": transactionAmounts,
    "filingName": filingName
})
df_data = df_data.set_index('date')

In [47]:
# df_data.to_csv(f"./edgar_data/{ticker}.csv")
df_stats = df_data.groupby(level=0).sum()
df_stats['numTransactions'] = df_data.groupby("date")['transactionAmounts'].count()
# df_stats.to_csv(f"./stock_edgar_stats/{ticker}.csv")
df_stats.index = df_stats.index.strftime('%Y-%m-%d')
df_stock = pd.read_csv(f"./stock_data/{ticker}.csv", index_col=0)
# df_stock.merge(df_stats, left_index=True, right_index=True, how='outer').to_csv(f"./bt_stock_data/{ticker}.csv")

In [59]:
df_stock.dropna().loc['2018-01-02']

ticker               BIO
open          238.669998
high          241.979996
low           238.229996
close         241.419998
volume          169473.0
adj_factor           1.0
Name: 2018-01-02, dtype: object

In [51]:
df_stats

Unnamed: 0_level_0,transactionPricePerShare,transactionShares,absTransactionShares,transactionAmounts,numTransactions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-04,0.000000,5000.0,5000.0,0.00,1
2018-01-05,227.094948,-1000.0,15500.0,-1392190.53,2
2018-01-09,824.393472,-13552.0,16164.0,-2570250.15,6
2018-01-12,0.000000,0.0,0.0,0.00,4
2018-02-01,246.398547,156037.0,240561.0,-6700676.66,8
...,...,...,...,...,...
2021-12-20,10.190000,22329.0,22329.0,227532.51,7
2021-12-21,46.330500,47533.0,107033.0,-459444.12,4
2021-12-22,12.220000,4300.0,4300.0,52546.00,1
2021-12-23,32.179800,-94843.0,94843.0,-1221855.16,3


In [54]:
df_stock.merge(df_stats, left_index=True, right_index=True, how='outer')

Unnamed: 0_level_0,ticker,open,high,low,close,volume,adj_factor,transactionPricePerShare,transactionShares,absTransactionShares,transactionAmounts,numTransactions
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-02,BIO,,242.949997,241.300003,-242.125000,0.0,1.0,,,,,
2018-01-02,BIO,238.669998,241.979996,238.229996,241.419998,169473.0,1.0,,,,,
2018-01-03,BIO,,253.500000,249.300003,-251.399994,0.0,1.0,,,,,
2018-01-03,BIO,242.339996,251.850006,242.339996,250.470001,233223.0,1.0,,,,,
2018-01-04,BIO,,251.199997,248.350006,-249.774994,0.0,1.0,0.0,5000.0,5000.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-29,BIO,747.500000,753.650024,741.380005,750.739990,62829.0,1.0,,,,,
2021-12-30,BIO,,761.229980,758.010010,-759.619995,0.0,1.0,,,,,
2021-12-30,BIO,755.239990,761.525024,750.280029,758.650024,57620.0,1.0,,,,,
2021-12-31,BIO,,787.969971,754.719971,-771.344971,0.0,1.0,,,,,


In [10]:
df_stock.describe()

Unnamed: 0,open,high,low,close,volume,adj_factor,transactionPricePerShare,transactionShares,absTransactionShares,transactionAmounts,numTransactions
count,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,111.0,111.0,111.0,111.0,111.0
mean,159.765734,161.727519,157.597685,159.717827,1007191.0,1.0,406.013709,-385632.4,405630.8,-589692.0,3.243243
std,31.579776,31.589325,31.606835,31.587788,520592.2,0.0,557.824709,1711653.0,1707488.0,7750948.0,3.464669
min,74.970001,79.940002,71.328598,75.029999,270092.0,1.0,0.0,-14977890.0,2.214,-81216400.0,1.0
25%,140.985001,142.817501,139.399994,140.672493,686039.0,1.0,0.0,-23526.5,518.485,0.0,1.0
50%,157.335007,159.133698,155.449997,157.315002,900895.0,1.0,155.78,20.156,3941.0,0.0,1.0
75%,170.3825,171.570007,168.162502,170.064999,1205077.0,1.0,485.79,786.23,46296.0,71621.93,4.0
max,241.139999,243.050003,239.5,241.910004,5911983.0,1.0,1899.45,375835.0,14977890.0,3105715.0,12.0


In [60]:
def gen_bt_stock_data(ticker, edgarDir="./sec-edgar-filings/", bt_stock_data_dir="./bt_stock_data/"):
    filingsDir = edgarDir + "/" + ticker + "/4/"
    dates = []
    transactionPricePerShare = []
    transactionShares = []
    absTransactionShares = []
    transactionAmounts = []
    filingName = []
    for filingDir in os.listdir(filingsDir):
        dates.append(get_filed_date(filingsDir + "/" + filingDir))
        xmlpath = filingsDir + "/" + filingDir + "/filing-details.xml" 
        if os.path.exists(xmlpath):
            transactionPricePerShare.append(ParseXML.calc_transactionPricePerShare(xmlpath=xmlpath))
            transactionShares.append(ParseXML.calc_transactionShares(xmlpath=xmlpath))
            absTransactionShares.append(ParseXML.calc_absTransactionShares(xmlpath=xmlpath))
            transactionAmounts.append(ParseXML.calc_transactionAmounts(xmlpath=xmlpath))
            filingName.append(filingDir)

    df_data = pd.DataFrame.from_dict({
        "date": dates,
        "transactionPricePerShare": transactionPricePerShare, 
        "transactionShares": transactionShares, 
        "absTransactionShares": absTransactionShares, 
        "transactionAmounts": transactionAmounts,
        "filingName": filingName
    })
    df_data = df_data.set_index('date')

    df_data.to_csv(f"./edgar_data/{ticker}.csv")
    df_stats = df_data.groupby(level=0).sum()
    df_stats['numTransactions'] = df_data.groupby("date")['transactionAmounts'].count()
    df_stats.to_csv(f"./stock_edgar_stats/{ticker}.csv")
    df_stats.index = df_stats.index.strftime('%Y-%m-%d')

    df_stock = pd.read_csv(f"./stock_data/{ticker}.csv", index_col=0)
    df_stock.dropna(inplace=True)
    df_stock.merge(df_stats, left_index=True, right_index=True, how='outer').to_csv(f"./bt_stock_data/{ticker}.csv")
    df_stock = df_stock.merge(df_stats, left_index=True, right_index=True, how='outer')
    df_stock.to_csv(f"{bt_stock_data_dir}/{ticker}.csv")



In [61]:
for file in tqdm(os.listdir("./stock_data/")):
    ticker = file[:-4]
    try:
        gen_bt_stock_data(ticker)
    except Exception as e:
        print(ticker, e)

  5%|▌         | 26/485 [01:10<19:28,  2.55s/it]

AMAT [Errno 2] No such file or directory: './sec-edgar-filings//AMAT/4//0001127602-21-030292/filing-details.xml'


  6%|▌         | 29/485 [01:13<13:34,  1.79s/it]

AME [Errno 2] No such file or directory: './sec-edgar-filings//AME/4//0001127602-19-022544/filing-details.xml'


 30%|███       | 147/485 [06:01<15:36,  2.77s/it]

EL [Errno 2] No such file or directory: './sec-edgar-filings//EL/4//0001001250-21-000190/filing-details.xml'


 33%|███▎      | 160/485 [06:27<13:29,  2.49s/it]

ETSY [Errno 2] No such file or directory: './sec-edgar-filings//ETSY/4//0001209191-21-040705/filing-details.xml'
EVRG [Errno 2] No such file or directory: './sec-edgar-filings//EVRG/4//0001127602-18-021458/filing-details.xml'


 38%|███▊      | 182/485 [07:12<13:22,  2.65s/it]

FRC [WinError 3] The system cannot find the path specified: './sec-edgar-filings//FRC/4/'


 40%|███▉      | 193/485 [07:37<12:48,  2.63s/it]

GNRC [WinError 3] The system cannot find the path specified: './sec-edgar-filings//GNRC/4/'


 40%|████      | 195/485 [07:42<12:06,  2.50s/it]

GOOGL [Errno 2] No such file or directory: './sec-edgar-filings//GOOGL/4//0001209191-21-048409/filing-details.xml'


 41%|████      | 199/485 [07:50<10:44,  2.25s/it]

GS [Errno 2] No such file or directory: './sec-edgar-filings//GS/4//0000769993-20-000044/filing-details.xml'


 49%|████▉     | 237/485 [09:25<06:49,  1.65s/it]

IT [Errno 2] No such file or directory: './sec-edgar-filings//IT/4//0001127602-18-005545/filing-details.xml'


 61%|██████    | 294/485 [11:55<05:08,  1.62s/it]

MO [Errno 2] No such file or directory: './sec-edgar-filings//MO/4//0001567619-21-018098/filing-details.xml'


 64%|██████▍   | 312/485 [12:38<05:43,  1.99s/it]

NEE [Errno 2] No such file or directory: './sec-edgar-filings//NEE/4//0001062993-21-013379/filing-details.xml'


 71%|███████   | 345/485 [14:20<07:16,  3.12s/it]

PCG [Errno 2] No such file or directory: './sec-edgar-filings//PCG/4//0001127602-18-014046/filing-details.xml'


 74%|███████▍  | 359/485 [15:48<08:19,  3.97s/it]

PM [Errno 2] No such file or directory: './sec-edgar-filings//PM/4//0001567619-21-007444/filing-details.xml'


 80%|████████  | 390/485 [17:04<03:54,  2.47s/it]

SBNY [WinError 3] The system cannot find the path specified: './sec-edgar-filings//SBNY/4/'


 98%|█████████▊| 473/485 [21:05<00:24,  2.01s/it]

WTW [WinError 3] The system cannot find the path specified: './sec-edgar-filings//WTW/4/'


100%|██████████| 485/485 [21:39<00:00,  2.68s/it]


In [78]:
df0 = pd.read_csv(f"./bt_stock_data/AAL.csv", index_col=0)
bad_tickers = []
for file in tqdm(os.listdir("./bt_stock_data/")):
       df = pd.read_csv(f"./bt_stock_data/{file}", index_col=0)
       if len(df.index) != len(df0.index):
              bad_tickers.append(file[:-4])

100%|██████████| 469/469 [00:01<00:00, 373.09it/s]


In [80]:
len(bad_tickers)

163