In [6]:
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from xml.etree import ElementTree as ET
from io import StringIO

# 04-Get recent
***R Persichitte<br>07/21/2020***

## Purpose
This workpaper uses functions from the previous three workpapers to accept a ticker symbol and download the most up-to-date NPORT form.

## Methods
The following block of code imports functions from previous workpapers. It is minimized by default because these functions have already been validated.

In [7]:
# From 02-NPORT function.ipynb
class download_NPORT:
    '''Download an NPORT filing as XML and expose its contents.

    link: URL of the filing's XML document.

    After construction, `xml` holds the root element of the document with the
    namespace prefix removed from every tag, so elements can be looked up by
    their bare names (e.g. `formData`).
    '''
    def __init__(self, link):
        r = requests.get(link)
        # The response object is used as its own truth value and as the
        # assertion message, so a failed request shows the response itself.
        assert(r), r
        self.xml = self._strip_namespaces(r.text)

    @staticmethod
    def _strip_namespaces(text):
        '''Parse XML text and return its root with `{namespace}` removed from every tag.'''
        parser = ET.iterparse(StringIO(text))
        for _, el in parser:
            # Tags look like '{namespace-uri}name'; keep only what follows '}'.
            _, _, el.tag = el.tag.rpartition('}')
        # `root` is populated once the iterator has been fully consumed.
        return parser.root

    def parseSecurities(self):
        '''Return one DataFrame row per child of `formData/invstOrSecs`.

        Each column is the tag of a direct child element of a security and
        each value is that element's text. The columns `valUSD`, `balance`
        and `pctVal` are converted to float when they are present; a filing
        with no securities yields an empty DataFrame instead of raising.
        '''
        secs = self.xml.find('formData').find('invstOrSecs')
        records = [{field.tag: field.text for field in sec} for sec in secs]
        res = pd.DataFrame(records)

        # Extra data cleaning to convert strings to numbers
        num_cols = ['valUSD','balance','pctVal']

        for c in num_cols:
            # An empty filing (or an unexpected schema) has no such column;
            # skip it rather than fail with a KeyError.
            if c in res.columns:
                res[c] = res[c].astype(float)

        return res

    def parseFundInfo(self):
        '''Return a dict mapping each child tag of `formData/fundInfo` to its text.'''
        fund_info = self.xml.find('formData').find('fundInfo')
        return {e.tag: e.text for e in fund_info}
    
# From 03-EDGAR getall
## NOTE: Although not all of these are called explicitly later in the document, the functions call one another.
## All are necessary.
def get_EDGAR_index(year,quarter):
    '''Read the EDGAR quarterly master index into a DataFrame.

    year: anything `int()` accepts; must be after 1933 and not in the future.
    quarter: anything `int()` accepts; must be 1, 2, 3 or 4.

    The pipe-delimited `master.idx` file for the requested quarter is read
    straight from sec.gov, and the `Date Filed` column is parsed as dates.
    Raises AssertionError when year or quarter is out of range.
    '''
    yr = int(year)
    assert yr > 1933, 'Year before SEC.'
    assert yr <= datetime.now().year, 'Year in future'

    qtr = int(quarter)
    assert 0 < qtr < 5, 'Invalid quarter.'

    index_url = (
        'https://www.sec.gov/Archives/edgar/full-index/{}/QTR{}/master.idx'
        .format(yr, qtr)
    )

    # Rows 0-8 and row 10 are skipped, leaving row 9 as the header line
    # (presumably the file preamble and the dashed separator -- TODO confirm).
    rows_to_skip = list(range(9)) + [10]

    return pd.read_csv(
        index_url,
        sep='|',
        skip_blank_lines=True,
        skiprows=rows_to_skip,
        parse_dates=['Date Filed'],
    )
def get_filings(CIK,page:int=0,n_records:int=100,type_=''):
    '''Fetch one page of the EDGAR filing list for a company or series.

    CIK: value sent as the `CIK` query parameter of the EDGAR company browser.
    page: zero-based page number; the query starts at `page*n_records`.
    n_records: number of records requested per page.
    type_: filing type filter sent as the `type` query parameter.

    Returns one of:
      * a DataFrame with the columns `Filings`, `Format` (the href of the
        link in the second cell), `Filing Date` (parsed to datetime) and
        `File/Film Number`, when the page has rows;
      * the string 'No results.' when the table has only its header row;
      * the string 'Invalid parameter' when EDGAR reports one in an <h1>;
      * the raw response object when the page could not be parsed.
    Callers can therefore treat any non-DataFrame result as "no more data".

    Raises AssertionError for non-int `page`/`n_records` or a failed request.
    '''
    assert(type(page)==int),'Page must be an int.'
    # The original check was `type(n_records==int)`, which is always truthy
    # and so never validated anything.
    assert(type(n_records)==int), 'n_records must be an int.'
    
    ed_url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    r = requests.get(ed_url,params = dict(
        action='getcompany',
        CIK=CIK,
        count=n_records,
        start=page*n_records,
        type=type_
    ))
    assert(r),r
    
    try:
        soup = BeautifulSoup(r.text,'lxml')
        
        # Look if request is valid
        if "Invalid parameter" in [h.text for h in soup.find_all('h1')]:
            return 'Invalid parameter'
    
        # Get table of filings and parse
        filings = list(soup.find(class_="tableFile2").find_all('tr'))
        filings.pop(0) # Drop first row which only contains file headers.
        
        if len(filings)>0:
            res = []
            for f in filings:
                f = list(f.find_all('td'))
                res.append({
                    'Filings': f[0].text,
                    'Format': f[1].find('a').get('href'),
                    'Filing Date': f[3].text,
                    # Store the cell text like the other columns, not the Tag.
                    'File/Film Number': f[4].text,
                })
                
            res = pd.DataFrame(res)
            res['Filing Date'] = pd.to_datetime(res['Filing Date'])
            return res
        else:
            return ('No results.')
    except Exception:
        # Deliberately best-effort: a page without the expected table is
        # reported and handed back as the raw response. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('ERROR PARSING!')
        return r
def get_all_pages(CIK,n_records:int=100,type_=''):
    '''Collect every page of EDGAR filings for a company or series.

    CIK, n_records, type_: passed through unchanged to `get_filings`.

    Pages are requested starting at page 0 until `get_filings` returns
    something that is not a DataFrame, which is taken as the end of the
    results. Returns all rows in one DataFrame with a fresh 0..n-1 index,
    or an empty DataFrame when the very first page has no results.
    '''
    pages = []
    page = 0

    while True:
        tdf = get_filings(CIK=CIK,
                          page=page,
                          n_records=n_records,
                          type_=type_)

        if not isinstance(tdf, pd.DataFrame):
            break

        pages.append(tdf)
        page += 1

    if not pages:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(pages, ignore_index=True)

## New code and validation

In [8]:
def get_most_recent(CIK:str,n_records:int=100,type_:str='NPORT-P')->'download_NPORT':
    '''Download the most recently filed NPORT for a fund.

       CIK: this should be a CIK or Series number for the fund. For companies with multiple funds (e.g. Schwab)
            using the CIK will result in the NPORT for the fund that was filed most recently. Multiple ticker symbols
            can trade under the same CIK, but will have unique Series numbers. Different classes of the same fund
            will share a Series number if they are based on the same investment series.
       n_records: specifies the number of records to pull for each EDGAR query.
            The options as of 6/8 are 40, 60, 80, and 100. This is included for flexibility
            in case EDGAR changes allowed options.
       type_: is the string to pass to EDGAR regarding type. It should always be NPORT-P, but I am
            making this a variable in case the name changes in the future.

       Based on the series ID, this will return a download_NPORT object. Using this object, you can download
       a list of securities in the form of a pandas DataFrame using the command `parseSecurities()`. You can
       get a dictionary of general fund info using `parseFundInfo()`. You can get the XML etree by accessing
       `xml`.

       Raises ValueError when EDGAR returns no filings of the requested type for `CIK`.
       '''
    filings = get_all_pages(CIK,n_records=n_records,type_=type_)

    # An unknown ID or a fund with no matching filings gives an empty frame;
    # say so instead of failing later with KeyError: 'Filing Date'.
    if filings.empty:
        raise ValueError(
            'No {:} filings found on EDGAR for {!r}.'.format(type_, CIK)
        )

    # Pull the URL of the most recent filing.
    recent = (
        filings
            .sort_values('Filing Date',ascending=False)
            ['Format'].iloc[0]
    )

    # The URL by default is the HTML file. We need to modify it to get the XML data.
    ## Strip HTML part of link
    recent = recent[:recent.rfind('/')]

    ## Add information to access XML
    link = "https://www.sec.gov{:}/primary_doc.xml".format(recent)

    return download_NPORT(link)

    


I am going to manually select one of my holdings to check. The outputs match the values I expected.

In [9]:
# Load the series/class lookup table from a local pickle file.
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# use a pickle produced locally by a trusted run of the earlier workpaper.
comps = pd.read_pickle('Investment Company Series and Class Information.p') # Information from 01
# Keep the row(s) whose class ticker is SCHG and take the series ID of the
# first match; `.values[0]` raises IndexError if the ticker is not present.
s2 = comps[comps['class_ticker']=='SCHG']
sid = s2['series_id'].values[0]
# Last expression of the cell: display the matching row(s).
s2

Unnamed: 0,reporting_file_number,cik,entity_name,entity_org_type,series_id,series_name,class_id,class_name,class_ticker,address_1,city,zip_code,state,address_2
36412,811-22311,1454889,SCHWAB STRATEGIC TRUST,30,S000026633,Schwab U.S. Large-Cap Growth ETF,C000079980,Schwab U.S. Large-Cap Growth ETF,SCHG,211 MAIN STREET,SAN FRANCISCO,94105,CA,[NULL]


In [10]:
# Download the most recently filed NPORT for the series ID held in `sid`.
dNP = get_most_recent(sid)
# Parse the holdings into a DataFrame and preview the first rows for a
# manual check against expected values.
secs = dNP.parseSecurities()
secs.head()

Unnamed: 0,name,lei,title,cusip,identifiers,balance,units,curCd,valUSD,pctVal,payoffProfile,assetCat,issuerCat,invCountry,isRestrictedSec,fairValLevel,securityLending,issuerConditional,derivativeInfo
0,Alexion Pharmaceuticals Inc,M1YXUUZR0EIMU8T0EM75,Alexion Pharmaceuticals Inc,15351109.0,\n,140983.0,NS,USD,13256631.49,0.14772,Long,EC,CORP,US,N,1,\n,,
1,Lennar Corp,529900G61XVRLX5TJX09,Lennar Corp,526057302.0,\n,9622.0,NS,USD,460027.82,0.005126,Long,EC,CORP,US,N,1,\n,,
2,Tyler Technologies Inc,,Tyler Technologies Inc,902252105.0,\n,24835.0,NS,USD,7782047.25,0.086716,Long,EC,CORP,US,N,1,\n,,
3,AmerisourceBergen Corp,AI8GXW8LG5WK7E9UD086,AmerisourceBergen Corp,3.0729999999999996e+108,\n,95702.0,NS,USD,8069592.64,0.08992,Long,EC,CORP,US,N,1,\n,,
4,Chicago Mercantile Exchange,SNZ2OJLFK8MNNCLQOF39,E-MINI RUSS 1000 GMAR20 XCME 20200320,0.0,\n,137.0,NC,USD,-1181620.72,-0.013167,,DE,,US,N,1,\n,,\n
