# Preamble

In [1]:
%matplotlib notebook

## Notebook parameters

In [2]:
# Notebook-level settings. The next code cell runs preamble.py with `%run -i`, i.e. inside
# this namespace, so these names are presumably read by that script — TODO confirm which.
NAME = 'a_2_identify_exemption_filers'
PROJECT = 'covid-empirical'
PYTHON_VERSION = '3.9'
USER = 'Ties'
CONDA_ENVIRONMENT = 'covid-empirical'
# NOTE(review): presumably controls where the `pipeline` path used in the Save cell
# points — verify against preamble.py.
USE_EXTERNAL_PIPELINE = True

## Run preamble script

In [3]:
%run -i preamble.py 

----------------------------------------------------------------------------------
The following utility functions are loaded and available through `functions.<..>`:
----------------------------------------------------------------------------------

extract_data_edgar_link, fast_load_json, fast_store_json, flatten_multiindex_column, inDB, recreate_edgar_link

----------------------------------------------------------------
The following modules and functions are imported by preamble.py:
----------------------------------------------------------------

copy, delayed, importlib, json, math, np, orjson, os, pd, plt, pqdm_p, pqdm_t, random, re, requests, sys, time, yaml


## Notebook specific imports

In [4]:
from tqdm.auto import tqdm  # progress bar used by the page-collection loop

-------
# Retrieve 8-K filings from EDGAR that match the search criteria
------

## Parameters

In [5]:
# URL that the search requests below are POSTed to.
search_endpoint_post = 'https://efts.sec.gov/LATEST/search-index'

# Search expression made of two quoted terms: 88465 and covid.
search_query = '"88465""covid"'

# JSON body for the first result page; the pagination loop sends the same fields
# with different "page" / "from" values.
post_payload = {
    "q": search_query,
    "dateRange": "custom",
    "category": "custom",
    "startdt": "2020-01-01",
    "enddt": "2021-08-06",
    "forms": ["8-K"],
    "page": "1",
    "from": 0,
}

In [6]:
# Fetch the first result page; the next cell reads the total hit count from it.
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
res = requests.post(search_endpoint_post, json=post_payload, timeout=30)
# Fail here with the HTTP status instead of later with an opaque JSON error or KeyError.
res.raise_for_status()
page_0_json = res.json()

In [7]:
# Overall number of matches as reported in the first response.
total_hits = page_0_json['hits']['total']['value']
# Ceiling division by 100, the per-page step the pagination loop uses for its "from" offset.
num_pages = -(-total_hits // 100)

## Loop through all pages and collect data

In [179]:
# Request every result page and keep the raw JSON of each successful response.
# The per-page payload is derived from `post_payload` so that the query, date range and
# form filter are defined in exactly one place; only the pagination fields change.
raw_res_list = []
for page in tqdm(range(num_pages)):
    tmp_payload = {**post_payload,
                   "page": "{}".format(page + 1),  # 1-based page label, sent as a string
                   "from": 100 * page,             # 0-based offset of the first hit on this page
                   }
    try:
        # The timeout keeps one stalled connection from blocking the whole run.
        res = requests.post(search_endpoint_post, json=tmp_payload, timeout=30)
    except requests.RequestException as err:
        # Best effort: report the failed page and carry on with the remaining ones.
        print(page, 'experienced an issue', err)
    else:
        if res.status_code == 200:
            # `page_json` is kept as a notebook-level name because the next cell reads it.
            page_json = res.json()
            raw_res_list.append(page_json)
        else:
            print(page, 'experienced an issue', res.status_code)
    # Pause between requests — presumably to avoid hammering the search endpoint.
    time.sleep(1)

  0%|          | 0/9 [00:00<?, ?it/s]

## Process data

In [180]:
# Scratch inspection of the hit list from the last successfully fetched page.
# `page_json` is left over from the collection loop above, and `page_res` is rebound by
# the loop variable of the next cell, so nothing downstream depends on this value.
page_res = page_json['hits']['hits']

In [181]:
# One flat record per hit across all pages: an independent copy of the hit's `_source`
# fields, with the hit's `_id` stored under the key 'id'.
full_res_list = [
    {**copy.deepcopy(hit['_source']), 'id': hit['_id']}
    for page_payload in raw_res_list
    for hit in page_payload['hits']['hits']
]

In [200]:
# One row per search hit; the columns are the keys of the flattened hit records.
full_res_df = pd.DataFrame(full_res_list)

# Process into dataset

## Clean

### Count number of items

In [201]:
# Number of entries in each row's `items` list (must run before `items` is joined into a string).
full_res_df['num_items'] = full_res_df['items'].map(len)

### Remove filings with more than one CIK

In [202]:
# Number of CIKs attached to each filing.
full_res_df['num_cik'] = full_res_df['ciks'].map(len)

In [203]:
# Distribution of CIK counts per filing.
full_res_df['num_cik'].value_counts()

1    840
2      3
3      1
Name: num_cik, dtype: int64

In [204]:
# Keep only filings tied to exactly one CIK. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells neither trigger
# SettingWithCopyWarning nor risk writing into a temporary slice.
full_res_df = full_res_df[full_res_df.num_cik == 1].copy()

In [205]:
# Every remaining row has a one-element `ciks` list; unwrap it into a scalar column.
full_res_df['cik'] = full_res_df['ciks'].map(lambda cik_list: cik_list[0])


### Replace single-element lists with scalar values in the DataFrame

Company name

In [206]:
# Check how many display names each filing carries.
full_res_df['display_names'].map(len).value_counts()

1    840
Name: display_names, dtype: int64

In [207]:
# Take the first display name as the raw company name.
full_res_df['comp_name'] = full_res_df['display_names'].map(lambda names: names[0])

File number

In [208]:
# Check how many file numbers each filing carries.
full_res_df['file_num'].map(len).value_counts()

1    840
Name: file_num, dtype: int64

In [209]:
# Replace the `file_num` list with its first element.
# Not re-runnable: a second run would index into the already-unwrapped string.
full_res_df['file_num'] = full_res_df['file_num'].map(lambda file_nums: file_nums[0])

### Clean name

Remove CIK from name

In [218]:
# Drop the ' (CIK ...)' group from the company name.
# The pattern is a raw string: in a normal string literal '\(' is an invalid escape
# sequence, which Python reports with a DeprecationWarning / SyntaxWarning.
full_res_df['comp_name'] = full_res_df['comp_name'].apply(lambda string: re.sub(r' \(CIK.*?\)', '', string))

Extract ticker

In [224]:
def extract_ticker(string):
    """Return the text inside the first ' (...)' group of `string`.

    Parameters
    ----------
    string : str
        Company name, expected to have had its ' (CIK ...)' group removed already so
        that the first remaining parenthesised group holds the ticker symbol(s).

    Returns
    -------
    str or float
        The contents of the first parenthesised group that is preceded by a space
        (may be an empty string), or ``np.nan`` when there is no such group.
    """
    # Raw string: '\(' in a normal string literal is an invalid escape sequence.
    match = re.search(r' \((.*?)\)', string)
    return match.group(1) if match else np.nan

In [225]:
# Ticker symbol(s) parsed from the company name; NaN where the name has none.
full_res_df['ticker'] = full_res_df['comp_name'].map(extract_ticker)

Remove ticker

In [227]:
# Drop every remaining ' (...)' group (the ticker) from the company name.
# The pattern is a raw string: in a normal string literal '\(' is an invalid escape
# sequence, which Python reports with a DeprecationWarning / SyntaxWarning.
full_res_df['comp_name'] = full_res_df['comp_name'].apply(lambda string: re.sub(r' \(.*?\)', '', string))

### Convert date

In [230]:
# Columns holding dates, converted with pd.to_datetime.
date_cols = ['period_ending', 'file_date']
full_res_df = full_res_df.assign(
    **{date_col: pd.to_datetime(full_res_df[date_col]) for date_col in date_cols}
)

### Clean items

In [259]:
# Collapse each `items` list into a single '--'-separated string.
# Not re-runnable: a second run would join the characters of the already-joined string.
full_res_df['items'] = full_res_df['items'].map('--'.join)

## Select columns to keep

In [262]:
# Columns carried into the exported dataset, in output order.
cols_to_keep = [
    'cik', 'ticker', 'file_date', 'comp_name',
    'period_ending', 'root_form', 'file_num', 'adsh',
    'id', 'num_items', 'items', 'file_description',
]

In [263]:
# Narrow the frame to the export columns. The explicit .copy() makes `exemption_df` an
# independent frame, so adding the `link` column later does not trigger
# SettingWithCopyWarning.
exemption_df = full_res_df[cols_to_keep].copy()
exemption_df.head()

Unnamed: 0,cik,ticker,file_date,comp_name,period_ending,root_form,file_num,adsh,id,num_items,items,file_description
0,1318482,KDOZF,2020-06-01,KIDOZ INC.,2020-03-27,8-K,333-120120-01,0001318482-20-000004,0001318482-20-000004:ki8k0529.htm,1,8.01,"KIDOZ INC. FORM 8K/A MAY 29, 2020"
1,66600,MMMM,2020-05-21,"Quad M Solutions, Inc.",2020-05-15,8-K,001-03319,0001477932-20-002980,0001477932-20-002980:mmmm_8ka.htm,1,8.01,FORM 8-K/A
2,1654595,"MDRR, MDRRP",2020-04-29,"Medalist Diversified REIT, Inc.",2020-04-29,8-K,001-38719,0001104659-20-053371,0001104659-20-053371:tm2018091d1_8k.htm,1,8.01,FORM 8-K
3,66600,MMMM,2020-05-15,"Quad M Solutions, Inc.",2020-05-15,8-K,001-03319,0001477932-20-002700,0001477932-20-002700:mmmm_8k.htm,1,8.01,FORM 8-K
4,1658880,BRMT,2020-05-21,BARE METAL STANDARD INC.,2020-04-30,8-K,000-55795,0001214659-20-004869,0001214659-20-004869:j5202038k.htm,1,8.01,


## Add a clickable link

In [264]:
def gen_clickable_link(row):
    """Build the sec.gov Archives URL for the document described by `row`.

    `row` must support key lookup for 'cik', 'adsh' and 'id'. The URL path is made of
    the CIK as given, the accession number ('adsh') with its dashes removed, and the
    part of 'id' after the last ':' (the document file name).
    """
    cik = row['cik']
    accession_dir = row['adsh'].replace('-', '')
    document_name = row['id'].split(':')[-1]
    return f'https://www.sec.gov/Archives/edgar/data/{cik}/{accession_dir}/{document_name}'

In [265]:
# Build one URL per filing row, then store the result as the `link` column.
link_series = exemption_df.apply(gen_clickable_link, axis=1)
exemption_df['link'] = link_series

In [266]:
# Spot check: print the link of one randomly drawn row (no seed, so it differs per run).
print(exemption_df.sample(1).iloc[0]['link'])

https://www.sec.gov/Archives/edgar/data/0001746563/000147793220001591/potn_8k.htm


## Save

In [270]:
# Write the final dataset to Excel without the index column.
# NOTE(review): `pipeline` is not defined in this notebook — presumably a path object set up
# by preamble.py; confirm, and confirm that its 'out' directory exists before running.
exemption_df.to_excel(pipeline / 'out' / 'exemptions_8k.xlsx', index=False)