In [3]:
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from xml.etree import ElementTree as ET
from io import StringIO

# 05-Compare holdings
The purpose of this document is to accept a list of securities by Series ID then return a list of how much they overlap.

Psudeo code:

- Import each filing
- Join on CUSIP (change column name to match security set.
- Add rows
- graph

<hr>
Import functions from previous

In [1]:
# From 02-NPORT function.ipynb
class download_NPORT:
    def __init__(self, link):
        r = requests.get(link)
        assert(r), r
        xml = ET.fromstring(r.text)
        
        # Remove namespace tags
        xml = ET.iterparse(StringIO(r.text))
        for _, el in xml:
            _, _, el.tag = el.tag.rpartition('}') # strip ns
        self.xml = xml.root
    def parseSecurities(self):
        secs = self.xml.find('formData').find('invstOrSecs')
        all_secs = []
        for s in secs:
            t_dict = {}
            for e in s:
                t_dict[e.tag] = e.text 
            all_secs += [t_dict]
        res = pd.DataFrame(all_secs)
                
        # Extra data cleaning to convert strings to numbers
        num_cols = ['valUSD','balance','pctVal']
        
        for c in num_cols:
            res[c] = res[c].astype(float)
        
        return res
    def parseFundInfo(self):
        fund_info = {}
        for e in list(self.xml.find('formData').find('fundInfo')):
            fund_info[e.tag] = e.text
        
        return fund_info
    
# From 03-EDGAR getall
## NOTE: Although not expressly later int he document, the functions call other functions.
## All are necessary.
def get_EDGAR_index(year,quarter):
    assert(int(year)>1933), 'Year before SEC.'
    assert(int(year)<=datetime.now().year), 'Year in future'
    assert(0<int(quarter)<5), 'Invalid quarter.'
    
    
    url = 'https://www.sec.gov/Archives/edgar/full-index/{:}/QTR{:}/master.idx'
    url = url.format(int(year),int(quarter))
    return pd.read_csv(url,sep='|',
                       skip_blank_lines=True,
                       skiprows=[0,1,2,3,4,5,6,7,8,10],
                       parse_dates=['Date Filed'])
def get_filings(CIK,page:int=0,n_records:int=100,type_=''):
    assert(type(page)==int),'Page must be an int.'
    assert(type(n_records==int)), 'n_records must be an int.'
    
    ed_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    r = requests.get(ed_url,params = dict(
        action='getcompany',
        CIK=CIK,
        count=n_records,
        start=page*n_records,
        type=type_
    ))
    assert(r),r
    
    try:
        soup = BeautifulSoup(r.text,'lxml')
        
        # Look if request is valid
        if "Invalid parameter" in [h.text for h in soup.findAll('h1')]:
            return 'Invalid parameter'
    
        # Get table of filings and parse
        filings = list(soup.find(class_="tableFile2").find_all('tr'))
        header = filings.pop(0)# Drop first row which only contains file headers.
        res = []
        
        if len(filings)>0:
            for f in filings:
                f = list(f.findAll('td'))
                t_dict = dict()
                t_dict['Filings'] = f[0].text
                t_dict['Format'] = f[1].find('a').get('href')
                t_dict['Filing Date'] = f[3].text
                t_dict['File/Film Number'] = f[4]
                res += [t_dict]
                
            res = pd.DataFrame(res)
            res['Filing Date'] = pd.to_datetime(res['Filing Date'])
            return res
        else:
            return ('No results.')
    except:
        print('ERROR PARSING!')
        return r
def get_all_pages(CIK,n_records:int=100,type_=''):
    

    valid = True
    page = 0
    
    res = pd.DataFrame()
    while valid:
        tdf = get_filings(CIK=CIK,
                          page=page,
                          n_records=n_records,
                          type_=type_)
        
        if(type(tdf)!=pd.DataFrame):
            valid = False
            break
        else:
            page+=1
            res = pd.concat([res,tdf])
        
    return res.reset_index(drop=True)
def get_most_recent(CIK:str,n_records:int=100,type_:str='NPORT-P')->download_NPORT:
    '''CIK: this should be a CIK or Series number for the fund. For companies with multiple funds (E.g. Schwab)
            using the CIK will result in the NPORT for the fund that was filed most recently. Multiple ticker symbols
            can trade under the same CIK, but will have unique Seires numbers. Different classes of the same fund
            will chase a Series number if they are based on the same investment series.
       n_records: specifies the number of records to pull for each EDGAR query.
            The options as of 6/8 are 40,60,80,and 100. This is included for flexibility 
            in case EDGAR changes allowed options.
        type_: is the string to pass to EDGAR regarding type. It should always be NPORT-P, but I am
            making this a variable in case the name changes in the future.
            
        Based on the series ID, this will return a download_NPORT object. Using this object, you can download
        a list of securities in the form of a pandas datafram using the command `parseSecurities()`. You can 
        get a dictionary of general fund infor using `parseFundInfo()`. You can get the XMLetree by accesing
        `xml`.
        '''
    #pull the given url of the most recent.
    recent = (
        get_all_pages(CIK,n_records=n_records,type_=type_)
            .sort_values('Filing Date',ascending=False)
            .head(1)['Format'].values[0]
    )
    
    # The URL by default is the HTML file. We need to modify it to get the XML data.
    ## Strip HTML part of link
    recent = recent[:recent.rfind('/')]
    
    ## Add information to access XML
    link = "https://www.sec.gov{:}/primary_doc.xml".format(recent)
    
    return download_NPORT(link)

In [32]:
secs = pd.read_excel('Persichitte 2020-06-08.xlsx',sheet_name='Summary')

secs.rename(columns={'Row Labels':'class_ticker','Sum of Mkt Value':'Amount'},inplace=True)
secs = secs.merge(
    pd.read_pickle('Investment Company Series and Class Information.p'),
    on='class_ticker',
    how='inner'
)
secs

Unnamed: 0,class_ticker,Amount,reporting_file_number,cik,entity_name,entity_org_type,series_id,series_name,class_id,class_name,address_1,city,zip_code,state,address_2
0,BLOK,406.0,811-23108,1633061,Amplify ETF Trust,30,S000061158,Amplify Transformational Data Sharing ETF,C000198131,Amplify Transformational Data Sharing ETF,310 S. HALE ST.,WHEATON,60187,IL,[NULL]
1,HLEMX,2111.38,811-07739,1018170,HARDING LOEVNER FUNDS INC,30,S000004201,Harding Loevner Emerging Markets Portfolio,C000011821,Advisor,400 CROSSING BLVD.,BRIDGEWATER,08807,NJ,FOURTH FLOOR
2,JERTX,2961.54,811-01879,277751,JANUS INVESTMENT FUND,30,S000025889,Janus Henderson Global Real Estate Fund,C000077601,Class T,151 DETROIT STREET,DENVER,80206,CO,[NULL]
3,MDYG,887.04,811-08839,1064642,SPDR SERIES TRUST,30,S000006987,SPDR(R) S & P 400 Mid Cap Growth ETF,C000019040,SPDR(R) S & P 400 Mid Cap Growth ETF,ONE LINCOLN STREET,BOSTON,02111,MA,[NULL]
4,MDYV,518.8,811-08839,1064642,SPDR SERIES TRUST,30,S000006988,SPDR(R) S & P 400 Mid Cap Value ETF,C000019041,SPDR(R) S & P 400 Mid Cap Value ETF,ONE LINCOLN STREET,BOSTON,02111,MA,[NULL]
5,MFAEX,1127.51,811-22449,1496998,AMERICAN FUNDS MORTGAGE FUND,30,S000030190,AMERICAN FUNDS MORTGAGE FUND,C000092906,Class F-1,6455 IRVINE CENTER DRIVE,IRVINE,92618,CA,[NULL]
6,PCLAX,2535.53,811-05028,810893,PIMCO FUNDS,30,S000028928,PIMCO CommoditiesPLUS Strategy Fund,C000088650,Class A,650 NEWPORT CENTER DRIVE,NEWPORT BEACH,92660,CA,[NULL]
7,SCHC,138.41,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026638,Schwab International Small-Cap Equity ETF,C000079985,Schwab International Small-Cap Equity ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]
8,SCHF,6253.05,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026637,Schwab International Equity ETF,C000079984,Schwab International Equity ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]
9,SCHG,9959.96,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026633,Schwab U.S. Large-Cap Growth ETF,C000079980,Schwab U.S. Large-Cap Growth ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]


In [38]:
def aggregate(df:pd.DataFrame,
              sid_column:str,
              amount_column:str,
              n_records:int=100,
              type_:str='NPORT-P'):
    tdf = df[[amount_column,sid_column]]
    
    result = pd.DataFrame(columns=['cusip'])
        
    for i in tdf.index:
        sid = tdf.iloc[i][sid_column]
        print('------------------\n',sid)
        
        try:
            NPORT = get_most_recent(sid).parseSecurities()
            
            # Replace 'N/A' values
            mask = (NPORT['cusip']=='N/A')
            NPORT.loc[mask,'cusip'] = NPORT.loc[mask,'cusip']
            
            # Add column for merge with absolute values
            NPORT[sid] = NPORT['pctVal'] * tdf.iloc[i][amount_column]
            result = result.merge(NPORT[['cusip',sid]],how='outer')
        except:
            print('Unable to read',sid)
    return result
df = aggregate(secs,amount_column='Amount',sid_column='series_id')

------------------
 S000061158
Unable to read S000061158
------------------
 S000004201
------------------
 S000025889
------------------
 S000006987
------------------
 S000006988
------------------
 S000030190
------------------
 S000028928
------------------
 S000026638
------------------
 S000026637
Unable to read S000026637
------------------
 S000026633
Unable to read S000026633
------------------
 S000030518
Unable to read S000030518
------------------
 S000026635
Unable to read S000026635
------------------
 S000026634
Unable to read S000026634
------------------
 S000006990
Unable to read S000006990
------------------
 S000006974
Unable to read S000006974
------------------
 S000005911
Unable to read S000005911
------------------
 S000005912
Unable to read S000005912
------------------
 S000029442
Unable to read S000029442
------------------
 S000012902
Unable to read S000012902


In [17]:
foo = get_most_recent('S000049990')
foo = foo.parseSecurities()

In [19]:
foo[foo['cusip']=='N/A']

Unnamed: 0,name,lei,title,cusip,identifiers,balance,units,curCd,valUSD,pctVal,payoffProfile,assetCat,issuerCat,invCountry,isRestrictedSec,fairValLevel,debtSec,securityLending
3,"JPMorgan Prime Money Market Fund, IM Shares",,"JPMorgan Prime Money Market Fund, IM Shares",,\n,181602.27,NS,USD,181693.07,0.678701,Long,STIV,RF,US,N,1,,\n


In [22]:
foo['pctVal'] = foo['pctVal']*50

In [23]:
foo['pctVal'] 

0      12.643159
1      20.268659
2      62.224303
3      33.935033
4      14.160225
         ...    
129    12.166371
130    19.932897
131    12.651321
132    24.683198
133    47.398174
Name: pctVal, Length: 134, dtype: float64

In [39]:
df

Unnamed: 0,cusip,S000004201,S000025889,S000006987,S000006988,S000030190,S000028928,S000026638
0,69343P105,6446.503121,,,,,,
1,000000000,2769.085858,,139.673959,98.48883,,-2.584973,12.414424
2,000000000,2769.085858,,139.673959,98.48883,,-2.584973,27.967682
3,000000000,2769.085858,,139.673959,98.48883,,-2.584973,81.409419
4,000000000,2769.085858,,139.673959,98.48883,,-2.584973,19.129190
...,...,...,...,...,...,...,...,...
157735865,135086106,,,,,,,13.226567
157735866,31890B103,,,,,,,16.201284
157735867,949921126,,,,,,,255.966755
157735868,53227R106,,,,,,,3.541518
