In [1]:
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from xml.etree import ElementTree as ET
from io import StringIO

# 05-Compare holdings
The purpose of this document is to accept a list of securities by Series ID then return a list of how much they overlap.

Psudeo code:

- Import each filing
- Join on CUSIP (change column name to match security set.
- Add rows
- graph

<hr>
Import functions from previous

In [2]:
# From 02-NPORT function.ipynb
class download_NPORT:
    def __init__(self, link):
        r = requests.get(link)
        assert(r), r
        xml = ET.fromstring(r.text)
        
        # Remove namespace tags
        xml = ET.iterparse(StringIO(r.text))
        for _, el in xml:
            _, _, el.tag = el.tag.rpartition('}') # strip ns
        self.xml = xml.root
    def parseSecurities(self):
        secs = self.xml.find('formData').find('invstOrSecs')
        all_secs = []
        for s in secs:
            t_dict = {}
            for e in s:
                t_dict[e.tag] = e.text 
            all_secs += [t_dict]
        res = pd.DataFrame(all_secs)
                
        # Extra data cleaning to convert strings to numbers
        num_cols = ['valUSD','balance','pctVal']
        
        for c in num_cols:
            res[c] = res[c].astype(float)
        
        return res
    def parseFundInfo(self):
        fund_info = {}
        for e in list(self.xml.find('formData').find('fundInfo')):
            fund_info[e.tag] = e.text
        
        return fund_info
    
# From 03-EDGAR getall
## NOTE: Although not expressly later int he document, the functions call other functions.
## All are necessary.
def get_EDGAR_index(year,quarter):
    assert(int(year)>1933), 'Year before SEC.'
    assert(int(year)<=datetime.now().year), 'Year in future'
    assert(0<int(quarter)<5), 'Invalid quarter.'
    
    
    url = 'https://www.sec.gov/Archives/edgar/full-index/{:}/QTR{:}/master.idx'
    url = url.format(int(year),int(quarter))
    return pd.read_csv(url,sep='|',
                       skip_blank_lines=True,
                       skiprows=[0,1,2,3,4,5,6,7,8,10],
                       parse_dates=['Date Filed'])
def get_filings(CIK,page:int=0,n_records:int=100,type_=''):
    assert(type(page)==int),'Page must be an int.'
    assert(type(n_records==int)), 'n_records must be an int.'
    
    ed_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    r = requests.get(ed_url,params = dict(
        action='getcompany',
        CIK=CIK,
        count=n_records,
        start=page*n_records,
        type=type_
    ))
    assert(r),r
    
    try:
        soup = BeautifulSoup(r.text,'lxml')
        
        # Look if request is valid
        if "Invalid parameter" in [h.text for h in soup.findAll('h1')]:
            return 'Invalid parameter'
    
        # Get table of filings and parse
        filings = list(soup.find(class_="tableFile2").find_all('tr'))
        header = filings.pop(0)# Drop first row which only contains file headers.
        res = []
        
        if len(filings)>0:
            for f in filings:
                f = list(f.findAll('td'))
                t_dict = dict()
                t_dict['Filings'] = f[0].text
                t_dict['Format'] = f[1].find('a').get('href')
                t_dict['Filing Date'] = f[3].text
                t_dict['File/Film Number'] = f[4]
                res += [t_dict]
                
            res = pd.DataFrame(res)
            res['Filing Date'] = pd.to_datetime(res['Filing Date'])
            return res
        else:
            return ('No results.')
    except:
        print('ERROR PARSING!')
        return r
def get_all_pages(CIK,n_records:int=100,type_=''):
    

    valid = True
    page = 0
    
    res = pd.DataFrame()
    while valid:
        tdf = get_filings(CIK=CIK,
                          page=page,
                          n_records=n_records,
                          type_=type_)
        
        if(type(tdf)!=pd.DataFrame):
            valid = False
            break
        else:
            page+=1
            res = pd.concat([res,tdf])
        
    return res.reset_index(drop=True)
def get_most_recent(CIK:str,n_records:int=100,type_:str='NPORT-P')->download_NPORT:
    '''CIK: this should be a CIK or Series number for the fund. For companies with multiple funds (E.g. Schwab)
            using the CIK will result in the NPORT for the fund that was filed most recently. Multiple ticker symbols
            can trade under the same CIK, but will have unique Seires numbers. Different classes of the same fund
            will chase a Series number if they are based on the same investment series.
       n_records: specifies the number of records to pull for each EDGAR query.
            The options as of 6/8 are 40,60,80,and 100. This is included for flexibility 
            in case EDGAR changes allowed options.
        type_: is the string to pass to EDGAR regarding type. It should always be NPORT-P, but I am
            making this a variable in case the name changes in the future.
            
        Based on the series ID, this will return a download_NPORT object. Using this object, you can download
        a list of securities in the form of a pandas datafram using the command `parseSecurities()`. You can 
        get a dictionary of general fund infor using `parseFundInfo()`. You can get the XMLetree by accesing
        `xml`.
        '''
    #pull the given url of the most recent.
    recent = (
        get_all_pages(CIK,n_records=n_records,type_=type_)
            .sort_values('Filing Date',ascending=False)
            .head(1)['Format'].values[0]
    )
    
    # The URL by default is the HTML file. We need to modify it to get the XML data.
    ## Strip HTML part of link
    recent = recent[:recent.rfind('/')]
    
    ## Add information to access XML
    link = "https://www.sec.gov{:}/primary_doc.xml".format(recent)
    
    return download_NPORT(link)

In [3]:
secs = pd.read_excel('Persichitte 2020-06-08.xlsx',sheet_name='Summary')

secs.rename(columns={'Row Labels':'class_ticker','Sum of Mkt Value':'Amount'},inplace=True)
secs = secs.merge(
    pd.read_pickle('Investment Company Series and Class Information.p'),
    on='class_ticker',
    how='inner'
)
secs

Unnamed: 0,class_ticker,Amount,reporting_file_number,cik,entity_name,entity_org_type,series_id,series_name,class_id,class_name,address_1,city,zip_code,state,address_2
0,BLOK,406.0,811-23108,1633061,Amplify ETF Trust,30,S000061158,Amplify Transformational Data Sharing ETF,C000198131,Amplify Transformational Data Sharing ETF,310 S. HALE ST.,WHEATON,60187,IL,[NULL]
1,HLEMX,2111.38,811-07739,1018170,HARDING LOEVNER FUNDS INC,30,S000004201,Harding Loevner Emerging Markets Portfolio,C000011821,Advisor,400 CROSSING BLVD.,BRIDGEWATER,08807,NJ,FOURTH FLOOR
2,JERTX,2961.54,811-01879,277751,JANUS INVESTMENT FUND,30,S000025889,Janus Henderson Global Real Estate Fund,C000077601,Class T,151 DETROIT STREET,DENVER,80206,CO,[NULL]
3,MDYG,887.04,811-08839,1064642,SPDR SERIES TRUST,30,S000006987,SPDR(R) S & P 400 Mid Cap Growth ETF,C000019040,SPDR(R) S & P 400 Mid Cap Growth ETF,ONE LINCOLN STREET,BOSTON,02111,MA,[NULL]
4,MDYV,518.8,811-08839,1064642,SPDR SERIES TRUST,30,S000006988,SPDR(R) S & P 400 Mid Cap Value ETF,C000019041,SPDR(R) S & P 400 Mid Cap Value ETF,ONE LINCOLN STREET,BOSTON,02111,MA,[NULL]
5,MFAEX,1127.51,811-22449,1496998,AMERICAN FUNDS MORTGAGE FUND,30,S000030190,AMERICAN FUNDS MORTGAGE FUND,C000092906,Class F-1,6455 IRVINE CENTER DRIVE,IRVINE,92618,CA,[NULL]
6,PCLAX,2535.53,811-05028,810893,PIMCO FUNDS,30,S000028928,PIMCO CommoditiesPLUS Strategy Fund,C000088650,Class A,650 NEWPORT CENTER DRIVE,NEWPORT BEACH,92660,CA,[NULL]
7,SCHC,138.41,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026638,Schwab International Small-Cap Equity ETF,C000079985,Schwab International Small-Cap Equity ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]
8,SCHF,6253.05,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026637,Schwab International Equity ETF,C000079984,Schwab International Equity ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]
9,SCHG,9959.96,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026633,Schwab U.S. Large-Cap Growth ETF,C000079980,Schwab U.S. Large-Cap Growth ETF,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]


In [None]:
class aggregate_SID_holdings:
    def __init__(
        self,
        df:pd.DataFrame,
        sid_column:str,
        amount_column:str,
        n_records:int=100,
        type_:str='NPORT-P'
    ):
        print('Importing holdings by Series IDs')
        tdf = df[[sid_column,amount_column]]
        cusip = pd.DataFrame(columns=['cusip'])
        assetCat_count = pd.DataFrame(columns=['assetCat'])
        assetCat_amount = pd.DataFrame(columns=['assetCat'])
        
        for i in tdf.index:
            sid = tdf.iloc[i][sid_column]
            print('------------------\n',sid)
            
            try:
                NPORT = get_most_recent(sid).parseSecurities()
                
                # Add column for merge with absolute values
                NPORT[sid] = NPORT['pctVal'] * tdf.iloc[i][amount_column]
                cusip = cusip.merge(NPORT[['cusip',sid]],how='outer')
                
                # Replace 'N/A' values
                mask = (NPORT['cusip']=='N/A') | (NPORT['cusip']=='000000000')
                NPORT.loc[mask,'cusip'] = NPORT.loc[mask,'lei']
                
                # Replace where N/A is in 'lei'
                mask = (NPORT['cusip']=='N/A') | (NPORT['cusip']=='000000000')
                NPORT.loc[mask,'cusip'] = NPORT.loc[mask,'title']
                

            except:
                print('Unable to read securites for',sid)
                
            try:
                # Use the amounts calculated above to give summary stats by asset class.
                cats = NPORT.groupby('assetCat',as_index=False)[sid].agg(('count','sum'))
                
                assetCat_count = assetCat_count.merge(cats.rename(columns={'count':sid})[sid],
                                                on='assetCat', how='outer')
                assetCat_amount = assetCat_amount.merge(cats.rename(columns={'sum':sid})[sid],
                                                on='assetCat', how='outer')
            except:
                print('Unable to read asset categories for', sid)
                assert(i==0)
                
        # Store results in the class object
        self.cusip = cusip
        self.assetCat_count = assetCat_count
        self.assetCat_amount = assetCat_amount

        
        print('---------\nComplete\n--------')
holds = aggregate_SID_holdings(secs,amount_column='Amount',sid_column='series_id')

Importing holdings by Series IDs
------------------
 S000061158
Unable to read securites for S000061158
Unable to read asset categories for S000061158
------------------
 S000004201
------------------
 S000025889
------------------
 S000006987
------------------
 S000006988
------------------
 S000030190
------------------
 S000028928
------------------
 S000026638
------------------
 S000026637


In [37]:
foo = get_most_recent('S000004201').parseSecurities()
foo

Unnamed: 0,name,lei,title,cusip,identifiers,balance,units,curCd,valUSD,pctVal,payoffProfile,assetCat,issuerCat,invCountry,isRestrictedSec,fairValLevel,securityLending,currencyConditional
0,Oil Company Lukoil Pjsc,549300LCJ1UJXHYBWI24,Oil Company Lukoil Pjsc SPON ADR EACH REP 1 OR...,69343P105,\n,1.306829e+06,NS,USD,1.334664e+08,3.053218,Long,EC,CORP,RU,N,1,\n,
1,Midea Group Co Ltd,,Midea Group Co Ltd CNY1,000000000,\n,7.863138e+06,NS,,5.733031e+07,1.311505,Long,EC,CORP,CN,N,2,\n,
2,Trip Com Grp Ltd,2549004RMYDSRQL4KW41,Trip Com Grp Ltd SPON ADS EACH REP 0.125 ORD SHS,89677Q107,\n,6.167170e+05,NS,USD,1.981512e+07,0.453296,Long,EC,CORP,KY,N,1,\n,
3,"Baidu, Inc.",254900AL64IANZYI1E02,"Baidu, Inc. SPON ADS EACH REP 0.1 ORD SHS",056752108,\n,3.453580e+05,NS,USD,4.267243e+07,0.976187,Long,EC,CORP,KY,N,1,\n,
4,Autohome Inc,529900NYB6ZGZN3MVT56,Autohome Inc SPON ADS EACH REP 1 ORD SHS CL A,05278C107,\n,4.023640e+05,NS,USD,3.077280e+07,0.703968,Long,EC,CORP,KY,N,1,\n,
5,Wal-Mart De Mexico,,Wal-Mart De Mexico COM NPV,P98180188,\n,1.792030e+07,NS,,5.201918e+07,1.190006,Long,EC,CORP,MX,N,1,\n,
6,Bank of Georgia Group PLC,213800XKDG12NQG8VC53,Bank of Georgia Group PLC ORD GBP0.01,G0R1NA104,\n,5.671510e+05,NS,,1.131784e+07,0.258910,Long,EC,CORP,GB,N,2,\n,
7,Ping An Insurance (Group) Company of China Ltd.,529900M9MC28JLN35U89,Ping An Insurance (Group) Company of China Ltd...,Y69790106,\n,6.635500e+06,NS,,7.506848e+07,1.717289,Long,EC,CORP,CN,N,2,\n,
8,Tata Consultancy S,335800ZJKU9GPQRE2U66,Tata Consultancy S INR1,Y85279100,\n,1.640725e+06,NS,,4.793153e+07,1.096496,Long,EC,CORP,IN,N,2,\n,
9,Amorepacific Corporation,,Amorepacific Corporation KRW500,Y01258105,\n,2.212900e+05,NS,,3.465539e+07,0.792787,Long,EC,CORP,KR,N,2,\n,


In [26]:
abs_count = pd.DataFrame(df).sum(axis=0)
abs_count[abs_count>1]

29414B104               3.0
46187W107               3.0
925652109               3.0
866674104               4.0
82981J109               2.0
                       ... 
TIME DEPOSITS           2.0
213800YWQHEAX7CAVO83    2.0
916896103               2.0
292671708               2.0
384556106               2.0
Length: 1666, dtype: float64

In [19]:
(abs_count>1).sum()

1544

In [21]:
S5912 = get_most_recent('S000005912').parseSecurities()

In [22]:
S5912

Unnamed: 0,name,lei,title,cusip,identifiers,balance,units,curCd,valUSD,pctVal,payoffProfile,assetCat,issuerConditional,invCountry,isRestrictedSec,fairValLevel,securityLending,issuerCat,derivativeInfo
0,Anworth Mortgage Asset Corp,N2KM5OGWVS7HLNUACF84,Anworth Mortgage Asset Corp,037347101,\n,208768.0,NS,USD,747389.44,0.017110,Long,EC,,US,N,1,\n,,
1,Haynes International Inc,549300I9MS5UZLRFDO40,Haynes International Inc,420877201,\n,26521.0,NS,USD,711028.01,0.016278,Long,EC,,US,N,1,\n,CORP,
2,Codorus Valley Bancorp Inc,,Codorus Valley Bancorp Inc,192025104,\n,20002.0,NS,USD,436043.60,0.009983,Long,EC,,US,N,1,\n,CORP,
3,West Bancorporation Inc,549300GKFOINCHIYU666,West Bancorporation Inc,95123P106,\n,34940.0,NS,USD,799776.60,0.018310,Long,EC,,US,N,1,\n,CORP,
4,Heritage Insurance Holdings Inc,9845008CCYDD87F66B45,Heritage Insurance Holdings Inc,42727J102,\n,55550.0,NS,USD,669933.00,0.015337,Long,EC,,US,N,1,\n,CORP,
5,Sonim Technologies Inc,549300LVHTR09ZVBSB66,Sonim Technologies Inc,83548F101,\n,21471.0,NS,USD,68921.91,0.001578,Long,EC,,US,N,1,\n,CORP,
6,Biohaven Pharmaceutical Holding Co Ltd,5299007STZTNN32O9920,Biohaven Pharmaceutical Holding Co Ltd,000000000,\n,84218.0,NS,USD,4083730.82,0.093491,Long,EC,,VG,N,1,\n,CORP,
7,Forterra Inc,549300M8CB93DI1JD480,Forterra Inc,34960W106,\n,38920.0,NS,USD,504014.00,0.011539,Long,EC,,US,N,1,\n,CORP,
8,Spirit Airlines Inc,549300DCAFIXFNFR3304,Spirit Airlines Inc,848577102,\n,147252.0,NS,USD,6047639.64,0.138451,Long,EC,,US,N,1,\n,CORP,
9,Shutterstock Inc,529900G2TDIS56V0LO22,Shutterstock Inc,825690100,\n,41286.0,NS,USD,1788922.38,0.040955,Long,EC,,US,N,1,\n,CORP,


In [27]:
secs['series_id'].value_counts()

S000026634    1
S000026633    1
S000005912    1
S000012902    1
S000006988    1
S000061158    1
S000026637    1
S000030190    1
S000006974    1
S000025889    1
S000006987    1
S000028928    1
S000026635    1
S000004201    1
S000029442    1
S000030518    1
S000026638    1
S000006990    1
S000005911    1
Name: series_id, dtype: int64

In [54]:
holds.assetCat_amount

Unnamed: 0,assetCat,sum_x,sum_y,sum_x.1,sum_y.1,sum_x.2,sum_y.2,sum_x.3,sum_y.3,sum_x.4,sum_y.4,sum_x.5,sum_y.5,sum_x.6,sum_y.6,sum_x.7,sum_y.7,sum_x.8,sum_y.8
0,EC,204384.33648,294952.799376,88573.747194,51734.671156,,,13763.868506,618236.316789,994619.696683,221600.590393,873842.411295,193233.751309,75969.856461,61097.412745,655411.776758,560378.077247,77528.429048,
1,STIV,7002.741605,,1026.461197,910.391388,4448.267144,59275.726072,259.246905,7228.513813,642.507706,758.428897,6220.356984,250.02203,2676.121742,3029.471857,1626.242129,11304.87885,,17880.299197
2,ABS-CBDO,,,,,11882.520771,9938.2464,,,,,,,,,,,,
3,ABS-MBS,,,,,64057.64076,73826.842423,,,,,,,,,,,,105822.4832
4,ABS-O,,,,,537.58819,1029.27533,,,,,,,,,,,,
5,DBT,,,,,49781.719967,136449.625551,,,,,,,,,,,,228545.157805
6,DIR,,,,,-282.48016,-2881.989383,,,,,,,,,,,,42.930728
7,COMM,,,,,,28461.035453,,,,,,,,,,,,
8,DCO,,,,,,-16899.95756,,,,,,,,,,,,
9,DCR,,,,,,-0.48099,,,,,,,,,,,,
