In [2]:
# Run once per fresh environment to install the required third-party libraries (restart the kernel afterwards if prompted)
%pip install PyPDF2
%pip install bs4

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# console and directory access
import os
import re
import time 
import json
import urllib
import datetime

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting  
import pandas as pd
import numpy as np

# parsing SEC website for data  
import requests
from bs4 import BeautifulSoup

# pdf manipulation
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

## CIK Extraction from RSSD Company Names
Build an EDGAR company-search URL for each company name stored in the RSSD file, then extract the matching CIK from the search results.

In [4]:
def baseURL(company_name:str) -> str:
    """
    Constructs the EDGAR company-search URL for a given company name
    ------------------------------------------------------------------------------------------
    Input:
        :param: company_name (type str)
            The company name to search for (e.g. 'GOLDMAN SACHS & CO')
            
    Return:
        :param: url (type str)
            A URL string that points to the EDGAR company search for that name
            (e.g. https://www.sec.gov/cgi-bin/browse-edgar?company=GOLDMAN+SACHS+%26+CO ...
                       &match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany)
    """
    # imported locally: a bare `import urllib` does not guarantee that urllib.parse is loaded
    from urllib.parse import quote_plus
    
    secFormat = 'https://www.sec.gov/cgi-bin/browse-edgar?company='     # SEC base url
    selection = '&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany'
    
    # spaces become '+' (as before); reserved characters such as '&', '#', '+' and '%' are
    # percent-encoded so they can no longer terminate or corrupt the `company` parameter
    comp_name = quote_plus(company_name)
    
    # build lookup URL for the SEC company search (base url)
    url = secFormat + comp_name + selection
    
    return url

In [5]:
def cleanCompName(company_name:str) -> str:
    """
    Normalises the spacing of a company name so words are separated by exactly one space
    ------------------------------------------------------------------------------------------
    Input:
        :param: company_name (type str)
            The raw company name, possibly with leading, trailing or repeated spaces
            
    Return:
        :param: (type str)
            The company name with leading/trailing spaces removed and every run of 
            spaces collapsed to one (only the ' ' character is treated as a separator)
    """
    
    # splitting on ' ' yields an empty string for every surplus space; keep the real words only
    words = [piece for piece in company_name.split(' ') if piece != '']
    
    return ' '.join(words)


In [6]:
def cikParse(url:str, company_name:str, max_tries:int=100) -> 'int | None':
    """
    Parses the EDGAR company-search page at the provided URL and returns the CIK that
    belongs to the given company name
    ------------------------------------------------------------------------------------------
    Input:
        :param: url (type str) 
            URL is a string representing a SEC company-search URL (see baseURL)
            e.g. https://www.sec.gov/cgi-bin/browse-edgar?company=BEAR+STEARNS+ASSET+BACKED+SECURITIES+I+ ...
                       LLC&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
        :param: company_name (type str) 
            The company name as represented from RSSD terms
        :param: max_tries (type int, default 100)
            Maximum number of HTTP attempts made before giving up on the page
    Return:
        :param: cik (type int or None)
            The CIK of the first table row whose company name equals `company_name`; when the 
            first table has no 'Company'/'CIK' columns, the first numeric 'CIK=' value found in 
            the page's links. None if the page could not be read or no CIK was found.
    """
    
    # read in HTML tables from the url link provided
    # (web-scraping behaves non-constantly, so the request is retried up to max_tries times)
    filings = None
    for _ in range(max_tries):
        try: 
            filings = pd.read_html(url) 
            break
        
        # if no table found from HTTP request, we return None
        except ValueError: 
            return None
        
        # if we flag an HTTPError, or others, simply send another request
        # (Exception rather than a bare except, so a keyboard/kernel interrupt is not swallowed)
        except Exception: 
            continue
    
    # every attempt failed (or nothing was parsed): nothing to search through
    if not filings:
        return None
    
    # we are interested in the first table present within the html read
    filing_table = filings[0]
    
    # without the 'Company'/'CIK' headers the table is not a company listing, so the CIK
    # has to be pulled from the links of the page instead
    if not {'Company', 'CIK'}.issubset(filing_table.columns):
        return _cikFromLinks(url, max_tries)
    
    # a basic format for identifying companies: keep the text in front of the 'SIC:' label
    # (splitting on the bare letters 'SIC' would truncate names such as 'CLASSIC BANK')
    listed_names = filing_table['Company'].map(lambda cell: str(cell).split('SIC:')[0].strip())
    
    # the CIKs that correspond to that company name
    matches = filing_table.loc[listed_names == company_name.strip(), 'CIK']
    
    # if there exists no listing for the given name, we flag it
    if matches.empty:
        print('\tno CIK was present to match this firm')
        return None
    
    return int(matches.iloc[0])


def _cikFromLinks(url:str, max_tries:int) -> 'int | None':
    """
    Requests the page and returns the first numeric 'CIK=' query value found in its links
    (None if the page cannot be retrieved with status 200 or no such link exists)
    """
    
    # NOTE(review): no request headers or pause between attempts are sent here - confirm
    # whether the SEC endpoint needs a declared User-Agent / throttling
    response = None
    for _ in range(max_tries):
        try: 
            response = requests.get(url)
            if response.status_code == 200:
                break
        # requests raises its own exception hierarchy (never urllib.error.HTTPError)
        except requests.exceptions.RequestException: 
            continue
    
    # last check to see if response object is "problematic" e.g. 403, or was never received
    if response is None or response.status_code != 200:
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # search through all the '<a href' links from the site
    for link in soup.find_all('a'):
        href = link.get('href')
        
        # anchors without an href attribute carry no CIK
        if not href:
            continue
        
        # the CIK is the run of digits between 'CIK=' and the next '&' (or the end of the link)
        found = re.search(r'CIK=(\d+)(?:&|$)', href)
        if found:
            return int(found.group(1))
    
    return None

## Main File Execution

In [12]:
if __name__ == "__main__":
    
    # Amazon S3 client and the bucket that receives the final CSV export
    s3 = boto3.client('s3')
    bucket = "ran-s3-systemic-risk"
    
    # RSSD company listing; legal names are cleaned so that words are single-spaced
    ffiec_all_names = pd.read_csv('ALL-RSSD-ID.csv')
    ffiec_all_names['NM_LGL'] = ffiec_all_names['NM_LGL'].apply(cleanCompName)
    
    # --------------------------------------------------
    # resume from a previously stored name -> CIK mapping, if one exists
    if os.path.isfile('NAME-CIK.json'):
        with open('NAME-CIK.json', 'r') as f: 
            name2cik = json.load(f)
    else:
        name2cik = {}
    # --------------------------------------------------
    
    # offset into the list of names at which the iteration starts
    special_n = 0
    
    # iterate through the corresponding company names for RSSD_IDs
    for idx, name in enumerate(ffiec_all_names['NM_LGL'].values[special_n:]):
        
        if name in name2cik:
            # already resolved (by an earlier run or a duplicate legal name): no new request needed
            print('\nCIK for {} is already known: {}'.format(name, name2cik[name]))
        else:
            print('\nChecking for a CIK for {}'.format(name))
            
            # compute the url for search and retrieve the accompanying CIK
            url = baseURL(name)
            print('\tcurrent URL is {}'.format(url))
            cik_num = cikParse(url, name)
            
            if cik_num is not None:
                # extend dictionary if a CIK mapping is present, report that none was found (otherwise)
                print('\tCIK was found: {}'.format(cik_num))
                name2cik[name] = str(cik_num)
            else:
                print('\tno CIK was found')
        
        # we locally store the updated CIK json every 10,000 iterations
        if (idx + 1) % 10000 == 0:
            print('\n\n###################################################')
            print('STORING JSON LOCALLY - {} itterations'.format(idx+1+special_n))
            print('###################################################\n\n')
            with open('NAME-CIK.json', 'w') as file:
                json.dump(name2cik, file)
        
    ####################################
    # EXPORTING ALL RSSD-CIK TO THE s3
    ####################################
    
    # write the complete name -> CIK mapping to a JSON file (the with-block closes the file)
    with open('NAME-CIK.json', 'w') as file:
        json.dump(name2cik, file)
        
    # attach the CIK to every name that has one (names without a mapping get NaN instead of
    # their own name) and keep only the rows with a mapping
    ffiec_all_names['CIK'] = ffiec_all_names['NM_LGL'].map(name2cik)
    rssd2cik = ffiec_all_names[ffiec_all_names.NM_LGL.isin(name2cik.keys())]
    
    # export the RSSD to CIK
    rssd2cik.to_csv('RSSD-CIK.csv', index=False)
    
    # save contents to AWS S3 bucket
    with open('RSSD-CIK.csv', 'rb') as data:
        s3.upload_fileobj(data, bucket, 'Temp/RSSD-CIK.csv')
        
    print('RSSDs have been mapped to CIKs')

In [13]:
# Number of company names that currently have a CIK mapping in name2cik.
# Running totals noted by the author after successive runs (presumably one per input batch - confirm):
# 4589 (post active) -> 5341 (post 70k closed) -> 8769 (post 140k closed) -> 8786 (post new 210k)
len(name2cik)

8786

In [14]:
# display the rows of ffiec_all_names whose NM_LGL has a CIK mapping (the frame written to RSSD-CIK.csv)
rssd2cik

Unnamed: 0,#ID_RSSD,NM_LGL,PRIM_FED_REG,CHTR_TYPE_CD,ORG_TYPE_CD,ENTITY_TYPE,CITY,STATE_ABBR_NM,CNTRY_NM,ID_TAX,ID_LEI,CIK
16,43351,PEOPLES STATE BANK,FDIC,Commercial Bank,Corporation (stock),Non-member Bank,MANHATTAN,KS,UNITED STATES,480308430,0,838010
67,44741,CITIZENS BANK,FDIC,Commercial Bank,Corporation (stock),Non-member Bank,MOORESVILLE,IN,UNITED STATES,350227210,549300CABYW2DE2UTD57,230119
71,58243,JOHNSON BANK,FRS,Commercial Bank,Corporation (stock),State Member Bank,RACINE,WI,UNITED STATES,391141446,IWRZQFYIRJ0IMURZBB68,902143
75,58757,KENSINGTON BANK,FDIC,Commercial Bank,Corporation (stock),Non-member Bank,KENSINGTON,MN,UNITED STATES,410257104,549300Q9WQZE336R0U94,1173376
77,58971,WASHINGTON TRUST BANK,FDIC,Commercial Bank,Corporation (stock),Non-member Bank,SPOKANE,WA,UNITED STATES,910462347,549300P2GEYNMH5OQA71,861787
...,...,...,...,...,...,...,...,...,...,...,...,...
211051,3999673,GOLDMAN SACHS GLOBAL EQUITY OPPORTUNITIES FUND...,,Other Non-Depository Institution,Corporation (stock),International Non-bank Subs of Domestic Entities,DUBLIN,0,IRELAND,0,0,1233961
211062,3998797,"GOLDMAN SACHS DGC INVESTORS OFFSHORE HOLDINGS,...",,Other Non-Depository Institution,Limited Partnership,International Non-bank Subs of Domestic Entities,GEORGE TOWN,0,CAYMAN ISLANDS,0,0,1472038
211070,3999758,JUST OPTIONS LLC,,Other Non-Depository Institution,LLC/C (Limited liability company/corporation),Domestic Entity Other,CHICAGO,IL,UNITED STATES,710991928,0,1427095
211096,3999525,GLOBAL LONG SHORT PARTNERS/HEDGE FUND OPPORTUN...,,Other Non-Depository Institution,Limited Partnership,Domestic Entity Other,NEW YORK,NY,UNITED STATES,261558024,549300TDDHWUQHJYKD92,1424509
