# Preamble

In [None]:
%matplotlib notebook

## Notebook parameters

In [1]:
## Notebook-level settings, assigned before preamble.py is executed with `%run -i`
## (presumably read by the preamble -- it is not visible here, confirm there).
NAME = 'a_6b_parse_filings'   
PROJECT = 'covid-empirical'
PYTHON_VERSION = '3.9'
USER = 'linuxBox' ## Note, this notebook is designed to run on Linux.
CONDA_ENVIRONMENT = 'covid-empirical'
USE_EXTERNAL_PIPELINE = True

## Run preamble script

In [None]:
%run -i preamble.py 

## Notebook specific imports

In [None]:
from bs4 import BeautifulSoup
import html
import unidecode
from zipfile import ZipFile

In [None]:
from itertools import islice
from functools import partial

# Logic

## Rough clean and deal with the style

In [18]:
def parse_style(style_string:str) -> str:
    """Reduce an inline CSS style string to the flags used for header detection.

    Only three normalised declarations can be produced, in this order:
    ``font-size:large``, ``font-weight:bold`` and ``font-style:italic``
    (underline is deliberately reported as italic). Everything else is dropped.

    Parameters
    ----------
    style_string : str
        Raw content of a ``style`` attribute, e.g. ``'font-size: 14pt; font-weight: 700'``.

    Returns
    -------
    str
        Declarations formatted as ``'key:value;'`` and separated by a single space,
        or an empty string when nothing relevant was found.
    """
    style_dict = {}
    for style_item in style_string.split(';'):
        ## CSS property names / keywords are case-insensitive and are usually
        ## written with whitespace after the colon, so normalise before comparing.
        style_item = style_item.lower()
        style_v = style_item.split(':')[-1].strip()
        if not style_v:
            continue

        ## Check for font size
        if 'font-size' in style_item:
            if style_v == 'large':
                style_dict['font-size'] = 'large'
            else:
                ## Only the first two characters are parsed (e.g. '14pt' -> 14),
                ## anything that is not a two-digit number is ignored.
                try:
                    if int(style_v[:2]) > 12:
                        style_dict['font-size'] = 'large'
                except ValueError:
                    pass

        ## Check for bold
        if 'font-weight' in style_item:
            if style_v in ['bold', 'bolder']:
                style_dict['font-weight'] = 'bold'
            elif style_v[0].isdigit():
                ## Take the first token so that e.g. '700 !important' still parses
                try:
                    weight = int(style_v.split()[0])
                except ValueError:
                    weight = 0
                if weight > 400:
                    style_dict['font-weight'] = 'bold'

        ## Check for italic
        if 'font-style' in style_item:
            if style_v == 'italic':
                style_dict['font-style'] = 'italic'

        ## Check for underline (but treat as italic for ease of coding below)
        if 'text-decoration' in style_item:
            if style_v == 'underline':
                style_dict['font-style'] = 'italic'

    return ' '.join(f'{k}:{v};' for k, v in style_dict.items())

In [19]:
## Presentational tags and the inline style declaration each group is translated into.
style_conv_list = [
    dict(name='bold', names=['b', 'B', 'strong'], style_to_add='font-weight:bold;'),
    dict(name='italic', names=['i', 'I', 'em', 'u', 'U'], style_to_add='font-style:italic;'),
    dict(name='size', names=['h1', 'h2', 'h3', 'H1', 'H2', 'H3'], style_to_add='font-size:large;'),
]

## Flat list of every tag name listed above, in declaration order.
style_chars = [tag_name for conv_item in style_conv_list for tag_name in conv_item['names']]

In [20]:
def sanitize_html(snippet:str, attrs_to_keep:list = None, verbose:int = 0, remove_tables:bool = True) -> tuple:
    """Roughly clean a filing's HTML and pull out its table(s) of contents.

    Steps, in order: drop <style> blocks, the iXBRL header, inline ix: wrappers and
    HTML comments; convert presentational tags (b, i, h1, ...) into inline styles;
    strip every attribute that is not in ``attrs_to_keep``; turn <br> into newlines;
    remove elements that only contain a 1-3 digit number (page numbers); unescape
    entities and transliterate to ASCII; optionally remove tables.

    Parameters
    ----------
    snippet : str
        Raw HTML.
    attrs_to_keep : list, optional
        Attribute names to keep on elements. ``style`` values are reduced with
        ``parse_style``. Defaults to keeping nothing.
    verbose : int
        0 is silent, >0 prints a summary, >1 also prints one line per removed table.
    remove_tables : bool
        If True, every table is removed unless at most 15% of its word characters
        are digits. Tables with more than 10 ``href`` occurrences that mention
        'factors', 'item' or 'analysis' are treated as a table of contents: they
        are removed from the text and collected in the second return value.

    Returns
    -------
    tuple
        ``(cleaned_html, tables_of_contents)``; the second item is the concatenated
        raw inner HTML of all detected tables of contents ('' if none).
    """
    if attrs_to_keep is None:
        attrs_to_keep = []

    ## Clean style
    snippet = re.sub('(<style>.*?</style>)', ' ', snippet, flags = re.DOTALL)
    snippet = snippet.replace('\n', ' ')

    ## Clean iXBRL at the start
    snippet = re.sub('(<ix:header>.*?</ix:header>)', ' ', snippet, flags = re.DOTALL)

    ## Clean inline XBRL in notes (repeated because ix tags can be nested; max 10 passes)
    for _ in range(10):
        if 'ix:' not in snippet:
            break
        snippet = re.sub('(<ix:.*?>)(.*?)(</ix:.*?>)', r'\2', snippet, flags = re.DOTALL)

    ## Remove HTML comments
    snippet = re.sub('(<!--.*?-->)', ' ', snippet, flags = re.DOTALL)

    ### Fix whitespace
    snippet = snippet.replace('&nbsp;', ' ')

    ## Remove unused (empty) styling elements
    for char in style_chars:
        snippet = snippet.replace(f'<{char}></{char}>', ' ')

    ### Process HTML features such as attributes and styles

    snippet_tree = BeautifulSoup(snippet, "html")

    for element in snippet_tree():
        ## Move HTML styling through names to the style attribute.
        ## Certain elements should never be a style to avoid conflict with the TOC.
        if element.name not in ['a', 'div']:
            for style_conv_item in style_conv_list:
                names = style_conv_item['names']
                match = element.name in names or any(element.find_parents(name=name) for name in names)
                if match:
                    existing_style = element.attrs.get('style', '')
                    ## Make sure the existing declarations are terminated, otherwise
                    ## the last one would be glued to the declaration added here.
                    if existing_style.strip() and not existing_style.rstrip().endswith(';'):
                        existing_style += ';'
                    element.attrs['style'] = existing_style + style_conv_item['style_to_add']

        ## Deal with attributes (iterate over a snapshot because attrs is modified)
        for k, v in list(element.attrs.items()):
            if element.name in ['table', 'TABLE']: ## Never keep attributes for tables
                element.attrs.pop(k, None)
                continue

            ## Deal with styles
            if k == 'style' and 'style' in attrs_to_keep:
                new_style = parse_style(v)
                if new_style:
                    element.attrs[k] = new_style
                else:
                    element.attrs.pop(k, None)

            if element.name in ['a', 'div'] and k not in ['href', 'id', 'name']:
                element.attrs.pop(k, None)

            ## Remove unwanted attributes
            if k not in attrs_to_keep:
                element.attrs.pop(k, None)

    snippet = str(snippet_tree).strip()

    ## Replace break chars with newline
    for break_char in ['<br/>', '<br>', '<BR>', '<BR/>']:
        snippet = snippet.replace(break_char, '\n')

    ## Remove page numbers
    snippet = re.sub(r'<\w+?>\s*?\d{1,3}\s*?</\w+?>', ' ', snippet)

    ### Get rid of weird HTML characters
    snippet = html.unescape(snippet)

    ### Get rid of control characters
    snippet = unidecode.unidecode(snippet)

    ## Deal with tables
    tables_of_contents = ''
    if remove_tables:
        snippet = snippet.replace('<TABLE>', '<table>').replace('</TABLE>', '</table>')
        new_str, num_tables_removed = '', 0
        for ele in snippet.split('<table>'):
            tmp = ele.split('</table>')
            if len(tmp) == 1:
                ## No closing tag in this chunk, so it is not a (complete) table
                new_str += ele
                continue

            ## With nested tables a chunk can hold several closing tags; keep all
            ## the text after the first one instead of only the next piece.
            table_str_raw, rest_str = tmp[0], ''.join(tmp[1:])
            table_str = ' '.join([x for x in re.sub('</*.*?>', ' ', table_str_raw).split(' ') if x])

            include_table, is_toc, perc_digit = False, False, None
            if table_str:
                ## Check whether the table is the table of contents
                if table_str_raw.count('href') > 10:
                    table_str_lower = table_str.lower()
                    if any(word in table_str_lower for word in ['factors', 'item', 'analysis']):
                        tables_of_contents += table_str_raw
                        is_toc = True
                        if verbose > 0:
                            print('Table of content found')

                ## Check if the table has more than 15% numbers
                if not is_toc:
                    num_digit = len(re.findall(r'\d', table_str))
                    num_alphanum = len(re.findall(r'\w', table_str))

                    if num_alphanum:
                        perc_digit = num_digit / num_alphanum
                        include_table = perc_digit <= 0.15

            if include_table:
                new_str += table_str_raw + rest_str
            else:
                new_str += rest_str
                num_tables_removed += 1
                if verbose > 1:
                    if perc_digit is None:
                        print('Table removed (digit share not computed)')
                    else:
                        print(f'Table removed with perc: {perc_digit}%')

        ## Overwrite string with the string without tables
        snippet = new_str

        if verbose > 0:
            print(f'Number of tables removed: {num_tables_removed}')

    ## Remove duplicate whitespaces
    for ws in [' ', '\n']:
        snippet = ws.join([x for x in snippet.split(ws) if x]).strip()

    ## Return
    return snippet, tables_of_contents

In [21]:
def html_to_str(snippet:str, keep_linebreak:bool = False) -> str:
    """Strip all tags from ``snippet`` and collapse repeated whitespace.

    With ``keep_linebreak=False`` every run of whitespace (newlines included)
    becomes a single space. With ``keep_linebreak=True`` repeated spaces and
    repeated newlines are collapsed separately, so line breaks survive.
    """
    ## Get rid of HTML
    without_tags = re.sub('</*.*?>', ' ', snippet)
    text = ' '.join(filter(None, without_tags.split(' ')))

    ### Flatten completely when line breaks are not wanted
    if not keep_linebreak:
        return ' '.join(text.split()).strip()

    ### Otherwise de-duplicate spaces first, then newlines
    for separator in (' ', '\n'):
        pieces = (piece for piece in text.split(separator) if piece)
        text = separator.join(pieces).strip()

    return text

## Extract the sections using the Table of Contents

### Identify the table of contents

In [22]:
def extract_toc_items(toc:str, cleaner_html:str) -> pd.DataFrame:
    """Turn the raw table-of-contents HTML into one row per linked section.

    Parameters
    ----------
    toc : str
        Raw inner HTML of the table(s) of contents, as returned by ``sanitize_html``.
    cleaner_html : str
        The cleaned filing HTML; used to check that a link target actually exists.

    Returns
    -------
    pd.DataFrame
        Columns ``label``, ``number`` (text following 'Item', may be ''), ``href``
        ('#target') and ``end`` (the href of the next row, NaN for the last row).
        Rows are in document order and keep the index of their first occurrence.

    Raises
    ------
    Exception
        If no TOC was supplied, no usable link was found, or fewer than three
        valid entries remain.
    """
    if not toc:
        raise Exception("No table of contents was found...")

    toc_items = []
    for row in re.findall('<tr>(.*?)</tr>', toc, flags=re.DOTALL):
        hrefs_found = re.findall('(<a.*?</a>)', row, flags=re.DOTALL)
        if not hrefs_found:
            continue

        ## The label is all text in the row, minus a trailing page number
        toc_label = ' '.join([x for x in re.findall('>(.*?)<', row, flags=re.DOTALL) if x])
        toc_label = re.sub(r'\s\d+?$', '', toc_label.strip())
        toc_label = ' '.join(toc_label.split())
        if not toc_label:
            continue

        tmp = re.findall(r'[iI]tem\s+(\w+)\W*', row)
        item_number = tmp[0] if tmp else ''

        ## Only the first link of the row is used
        tmp = re.findall(r'''href=['"](.*?)['"]''', hrefs_found[0], flags=re.DOTALL)
        href = tmp[0] if tmp else ''

        if '#' in href:
            ## Keep only the fragment, dropping any document name in front of it
            href = '#' + '#'.join(href.split('#')[1:])
            toc_items.append({
                'label' : toc_label,
                'number' : item_number,
                'href' : href
            })

    if not toc_items:
        raise Exception("No table of contents was found...")

    toc_df = pd.DataFrame(toc_items)

    ## Some companies, especially for 10Qs might split up the A tags for some reason, this joins them.
    ## The first row of every href is kept and receives the labels of all its rows.
    merged_rows = []
    for _href, group in toc_df.groupby('href'):
        group = group.drop_duplicates()
        merged_row = group.iloc[0].copy()
        merged_row['label'] = ' '.join(group['label'].values)
        merged_rows.append(merged_row)

    ## Sorting on the original index is important: it restores document order
    toc_df = pd.DataFrame(merged_rows).sort_index()

    ## This makes sure only valid HREFs are included
    valid_hrefs = [
        href for href in toc_df['href'].to_list()
        if f'''="{href.split('#')[-1]}"''' in cleaner_html
    ]
    toc_df = toc_df[toc_df['href'].isin(valid_hrefs)].copy()

    ## Identify end: a section ends where the next one starts
    toc_df['end'] = toc_df['href'].shift(-1)

    if len(toc_df) < 3:
        raise Exception("No table of contents was found...")

    return toc_df

### Identify how the table of contents links to the sections

In [23]:
def identify_toc_anchors(toc_df:pd.DataFrame, cleaner_html:str) -> str:
    """Work out which element/attribute pair the TOC links point at.

    The first TOC target is located in ``cleaner_html``; the tag name and the
    attribute holding the id (e.g. ``<a id="...">`` or ``<div name="...">``) are
    turned into a regex template with a ``{}`` placeholder for the id. The
    template is only accepted if it also locates the second TOC target.

    Parameters
    ----------
    toc_df : pd.DataFrame
        Needs an ``href`` column with at least two rows.
    cleaner_html : str
        The cleaned filing HTML.

    Returns
    -------
    str
        Regex template; fill it with ``template.format(anchor_id)``.

    Raises
    ------
    Exception
        If no element matching both the first and second target is found.
    """
    first_id = toc_df.iloc[0]['href'].replace('#', '')
    second_id = toc_df.iloc[1]['href'].replace('#', '')

    ## Ids are matched literally: escape them so characters such as '.' or '+'
    ## are not interpreted as regex syntax.
    first_id_re, second_id_re = re.escape(first_id), re.escape(second_id)

    ## Characters allowed inside a tag around the id attribute
    attr_chars = r'''[\w\s:'";=-]'''

    search_regex = '(<' + attr_chars + '+?[a-zA-Z]{1,6}="' + first_id_re + '"' + attr_chars + '*?>)'
    search_hits = re.findall(search_regex, cleaner_html)
    if search_hits:
        search_hit = search_hits[0]
        anchor_name = search_hit.split()[0].replace('<', '')
        id_attr_hits = re.findall(r' (\w+?)="' + first_id_re + '"', search_hit)
        if id_attr_hits:
            anchor_id_attr = id_attr_hits[0]
            anchor_template = '<' + anchor_name + attr_chars + '+?' + anchor_id_attr + '="{}"' + attr_chars + '*?>'

            ## The template must find both the first and the second section
            found_first = re.search(anchor_template.format(first_id_re), cleaner_html)
            found_second = re.search(anchor_template.format(second_id_re), cleaner_html)
            if found_first and found_second:
                return anchor_template

    raise Exception('No valid anchors found...')

### Extract the sections

In [24]:
def extract_sections(toc_df:pd.DataFrame, anchor_template:str, cleaner_html:str) -> list:
    """Cut the filing into one raw HTML chunk per table-of-contents entry.

    Parameters
    ----------
    toc_df : pd.DataFrame
        Needs the columns ``label``, ``number``, ``href`` and ``end`` (href of
        the following row). The index does not have to be contiguous.
    anchor_template : str
        Regex template with a ``{}`` placeholder for the anchor id.
    cleaner_html : str
        The cleaned filing HTML.

    Returns
    -------
    list
        Dicts with ``label``, ``number`` and ``raw_text``. The first entry is
        labelled 'pretext' and holds everything before the first anchor.
    """
    def anchor_regex(href:str) -> str:
        ## The id is matched literally, so escape regex metacharacters
        return anchor_template.format(re.escape(href.replace('#', '')))

    ## Pretext
    section_list = [{
        'label' : 'pretext',
        'number' : '',
        'raw_text' : re.split(anchor_regex(toc_df.iloc[0]['href']), cleaner_html)[0]
    }]

    ## Use the row position, not the index label: the index can have gaps, which
    ## would otherwise send a middle section down the "last section" branch.
    last_position = len(toc_df) - 1
    for position, (_, row) in enumerate(toc_df.iterrows()):
        start = anchor_regex(row['href'])
        if position < last_position:
            ## Everything between this anchor and the next one
            after_start = re.split(start, cleaner_html)[-1]
            raw_section_text = re.split(anchor_regex(row['end']), after_start)[0]
        else:
            ## Last entry: everything after its anchor
            raw_section_text = re.split(start, cleaner_html)[1]

        section_list.append({
            'label' : row['label'],
            'number' : row['number'],
            'raw_text' : raw_section_text
        })

    return section_list

### Deal with the headers and remove HTML

In [25]:
def parse_out_text_and_header(section_list:list) -> list:
    """Split every section into lines and classify each line.

    Each item of ``section_list`` must provide ``raw_text``, ``label`` and
    ``number``. The raw HTML is broken into lines at block-level tags, all
    remaining tags are renamed to the placeholder ``html_ele``, and each line
    is then stripped of markup and given a ``type``:

    * ``header``     - styled bold (without italic) or large font
    * ``sub-header`` - styled italic (not on the first line of a section), or bold and italic
    * ``text``       - unstyled, at least 50 characters, starts with an upper-case
                       letter and ends with one of ``valid_text_endings``
    * ``unknown``    - everything else, and anything starting with '<', '(' or '['

    Lines whose cleaned text has 3 characters or fewer are dropped.

    Returns a flat list of dicts with the keys ``section_label``,
    ``section_number``, ``section_i``, ``type``, ``clean_text`` and ``i``
    (position of the line within its section).
    """
    valid_text_endings = ['.', '?', '!', ':', "'", '"']
    filing_sub_section_list = []
    for section_i, section_item in enumerate(section_list):
        raw_text = section_item['raw_text']

        ## Fix page breaks splitting up paragraphs: a run of tags containing an <hr>
        ## that sits between two word characters is replaced by a single space
        raw_text = re.sub('''([\w,;])( *<[a-zA-Z\\/<>\s-]+?<hr[\w\s:'";=\\/-]*?>[a-zA-Z\\/<>\s]+?> *)([\w])''', r'\1 \3', raw_text)

        ## Replace attribute-less div, tr, p and html tags with line breaks
        for name in ['div', 'tr', 'p', 'html']:
            raw_text = raw_text.replace(f'<{name}>', '\n')
            raw_text = raw_text.replace(f'</{name}>', '\n')

        ## div / p tags that carry attributes also start a new line, but keep their
        ## attributes on a placeholder element so the style is still available below
        for name in ['div', 'p']:
            raw_text = re.sub(f'''<{name}([\w\s:'";=\\/-]*?)>''', r'\n<html_ele\1>', raw_text)


        ## Drop empty lines
        raw_text = '\n'.join([x for x in raw_text.split('\n') if x])

        ## Deal with all other HTML elements: every tag found without an '=' in it
        ## is renamed to the placeholder (opening tags keep their attributes)
        unique_html_elements = [x for x in set(re.findall('</*(.*?)>', raw_text)) if '=' not in x]
        for name in unique_html_elements:
            raw_text = raw_text.replace(f'<{name}', '<html_ele')
            raw_text = raw_text.replace(f'</{name}>', '</html_ele>')

        ## Force inject newlines for headers that are joined with the text
        tmp_list = [x for x in raw_text.split('\n') if x.strip(' ')]
        tmp_list_2 = []
        for item in tmp_list:
            if 'style="' in item:
                ## Unwrap unstyled placeholders first, then split on the styled ones
                item_tmp = re.sub('(<html_ele>)(.*?)(</html_ele>)', r' \2 ', item).strip()
                sub_items = item_tmp.split('<html_ele style="')

                first_item = '<html_ele style="'.join(sub_items[:2])
                ## Number of characters in front of the first styled element
                num_char_pre = len(sub_items[0].strip())
                if num_char_pre < 4:
                    ## The line (almost) starts with a styled element: treat that element
                    ## as a potential header and put a newline right after it closes
                    item = re.sub('(<html_ele style=".*?>)(.*?)(</html_ele>)', r'\1\2\3\n', first_item)

                    ## Sometimes there might be multiple styles later on, which are non-headers by default. But we don't want to lose them. 
                    if len(sub_items) > 2:
                        text_to_add = '<html_ele style="' + '<html_ele style="'.join(sub_items[2:])
                        text_to_add = re.sub(' style=".*?"', '', text_to_add) ## Remove style as not a header
                        item += text_to_add
                else:
                    ## Not a header, so remove style
                    item = re.sub(' style=".*?"', '', item)    

            tmp_list_2.append(item)

        raw_text = '\n'.join(tmp_list_2)
 
        ## Remove the unstyled placeholders; styled opening tags stay for the checks below
        raw_text = raw_text.replace('</html_ele>', ' ')
        raw_text = raw_text.replace('<html_ele>', ' ')
        raw_text = raw_text.replace('<html_ele>', ' ')

        ## Collapse repeated spaces (newlines are kept)
        raw_text = ' '.join([x for x in raw_text.split(' ') if x])

        tmp_list = [x for x in raw_text.split('\n') if x.strip(' ')]

        sub_section_list = []
        for _i, item in enumerate(tmp_list):
            ss_dict = {
                'section_label': section_item['label'],
                'section_number' : section_item['number'],
                'section_i' : section_i,
                'type' : 'text',
                'clean_text' : ' '.join([x for x in re.sub('</*.*?>', ' ', item).split() if x])
            }

            ## Attempt to fix cases where there are special characters at the front
            ## (all leading non-word characters and digits are removed)
            ss_dict['clean_text'] = re.sub('^[\W\d]*', '', ss_dict['clean_text']).strip()

            if len(ss_dict['clean_text']) > 3:

                ## Styled lines are header candidates; later checks override earlier ones
                if 'html_ele style=' in item:
                    if 'font-style:italic' in item:
                        ## Sometimes italic can occur in the text, which is not a sub-header. So I require a preceding \n
                        if _i > 0:
                            ss_dict['type'] = 'sub-header'

                    if 'font-weight:bold' in item:
                        if 'font-style:italic' in item:
                            ss_dict['type'] = 'sub-header' ## If italic and bold --> subheader
                        else:
                            ss_dict['type'] = 'header'
                    if 'font-size:large' in item:
                        ss_dict['type'] = 'header'

                ## (same length condition as the enclosing block)
                if len(ss_dict['clean_text']) > 3:

                    # But text should be longer than 50 otherwise it should be unknown
                    if ss_dict['type'] == 'text' and len(ss_dict['clean_text']) < 50:
                        ss_dict['type'] = 'unknown'

                    # Valid text should start with a capital letter and end with an ending char
                    if ss_dict['type'] == 'text' and not (ss_dict['clean_text'][0].isupper() and ss_dict['clean_text'][-1] in valid_text_endings):
                        ss_dict['type'] = 'unknown'

                    # If starts with a bracket, consider unknown
                    if ss_dict['clean_text'][0] in ['<', '(', '[']:
                        ss_dict['type'] = 'unknown'

                    sub_section_list.append(ss_dict)

        ## This will add an index to enable recreation of sorting

        for i, item in enumerate(sub_section_list):
            item['i'] = i

        filing_sub_section_list += sub_section_list

    return filing_sub_section_list

### Final clean

In [26]:
def clean_final(filing_sub_section_list:list) -> pd.DataFrame:
    """Post-process the classified lines of one filing.

    Three passes are applied to the list of dicts (keys ``section_label``,
    ``section_number``, ``section_i``, ``type``, ``clean_text``, ``i``):

    1. Sections whose label does not contain 'item' are folded into the
       preceding 'item' section, with their label injected as a header row.
    2. Rows whose ``clean_text`` occurs 11 times or more are dropped.
    3. An 'unknown' fragment is joined with a later 'unknown' fragment when
       the pair looks like a sentence that was split in two.

    Returns a DataFrame of the remaining rows; ``i`` is the row position
    within each ``section_label``. The input dicts are modified in place.
    """
    
    ## ---------------------------------------------------
    ### Deal with cases where the TOC contains subsections
    ## ---------------------------------------------------
    
    new_list = []
    prev_label = ''              ## label of the most recent section containing 'item'
    label_change_tracker = ''    ## last sub-section label a header row was injected for
    for item in filing_sub_section_list:
        if 'item' in item['section_label'].lower():
            prev_label =  item['section_label']    
        elif prev_label:
            if label_change_tracker != item['section_label']:
                ## Inject the header into the list
                ## (no 'i' key here; the column is recomputed below)
                new_list.append({
                    'section_label': prev_label,
                    'section_number' : '',
                    'section_i' : 0,
                    'type' : 'header',
                    'clean_text' : item['section_label']
                })
                label_change_tracker = item['section_label']

            item['section_label'] = prev_label

        ## Add item
        new_list.append(item)
        
    filing_sub_section_list = new_list
    full_df = pd.DataFrame(filing_sub_section_list)
    full_df['i'] = full_df.groupby('section_label').cumcount() ## Reset within section index
    
    ## ----------------------------------------------------
    ## Remove identical items that occur more than 10 times
    ## ----------------------------------------------------
    
    ## identical_count = number of rows sharing exactly the same clean_text
    full_df['identical_count'] = full_df.groupby('clean_text')['i'].transform(np.size)
    full_df = full_df[full_df.identical_count < 11].reset_index(drop=True)
    full_df['i'] = full_df.groupby('section_label').cumcount() ## Reset within section index
    ## errors='ignore': a 'test' column is dropped only if it happens to exist
    full_df = full_df.drop(['identical_count', 'test'], axis=1, errors='ignore')

    ## ----------------------------------------------------------------------------------------------------
    ## Quite frequently valid pieces of text get split because of page breaks, this attempts to rejoin them
    ## ----------------------------------------------------------------------------------------------------

    filing_sub_section_list = full_df.to_dict('records') 

    valid_text_endings = ['.', '?', '!', "'", '"']
    new_list = []
    ## skip_tracker holds the key (number + type + text) of the fragment that was merged
    ## into an earlier row and must therefore be left out when the loop reaches it
    prev_item, skip_tracker, num_dropped = '', '', 0
    for i, item in enumerate(filing_sub_section_list):
        if i > 0:
            ## Candidate start of a split sentence: an 'unknown' row that follows a
            ## non-'unknown' row and begins with an upper-case letter
            if item['type'] == 'unknown' and prev_item['type'] != 'unknown':
                if item['clean_text'][0].isupper():
                    ## Look at the next three rows for the matching tail: 'unknown', ending
                    ## in a valid ending char and itself followed by a non-'unknown' row
                    for next_i in range(3):
                        try:
                            next_item = filing_sub_section_list[i + 1 + next_i]
                            next_next_item = filing_sub_section_list[i + 2 + next_i]
                            if not skip_tracker and next_item['type'] == 'unknown' and next_next_item['type'] != 'unknown' and next_item['clean_text'][-1] in valid_text_endings:
                                skip_tracker = next_item['section_number'] + next_item['type'] + next_item['clean_text']
                                item['clean_text'] = item['clean_text'].strip() + ' ' + next_item['clean_text'].strip()
                                ## Promote the joined text only if it now passes the text checks
                                if item['clean_text'][0].isupper() and item['clean_text'][-1] in valid_text_endings and len(item['clean_text']) > 50:
                                    item['type'] = 'text' 
                                break
                        except IndexError:
                            ## Ran past the end of the list
                            pass

        ## Keep the row unless it is the fragment that was merged above
        if item['section_number']+item['type']+item['clean_text'] != skip_tracker:    
            new_list.append(item)
        else:
            num_dropped += 1
            skip_tracker = ''

        prev_item = item 
        
    filing_sub_section_list = new_list

    ## ----------------------
    ## Create final dataframe
    ## ----------------------
    
    full_df = pd.DataFrame(filing_sub_section_list)

    full_df['i'] = full_df.groupby('section_label').cumcount() ## Reset within section index
    
    return full_df

## Combo function

In [27]:
def run_all_for_filing(link:str, raw_html:str, parent_store_loc:str, retrn:bool = True, force_overwrite:bool = False) -> tuple:
    """Parse one filing and store both the plain-text and the sectioned result.

    Two files named ``<uniqueID>.json.gz`` are written below ``parent_store_loc``:
    ``full/<cik_padded>/`` gets the whole filing as plain text, and
    ``split/<cik_padded>/`` gets the list of classified lines per section. The
    full-text file is written first, so it exists even when the section parsing
    fails afterwards.

    Parameters
    ----------
    link : str
        EDGAR link of the filing; passed to ``functions.extract_data_edgar_link``.
    raw_html : str
        Raw HTML of the filing.
    parent_store_loc : str
        Root output folder.
    retrn : bool
        If True the parsed dict is returned, otherwise ``None`` is returned in its place.
    force_overwrite : bool
        If False, a filing whose split file already exists is skipped.

    Returns
    -------
    tuple
        ``(True, 'Already exists')`` when skipped, ``(True, result_or_None)`` on
        success, ``(False, exception)`` when anything in the parsing failed.
    """
    ## Setup
    parent_store_loc = Path(parent_store_loc)
    filing_dict = functions.extract_data_edgar_link(link)

    folder_full = parent_store_loc / 'full' / filing_dict['cik_padded']
    folder_split = parent_store_loc / 'split' / filing_dict['cik_padded']
    filename = filing_dict['uniqueID'] + '.json.gz'

    ## Check if exists:
    if not force_overwrite and (folder_split / filename).exists():
        return True, 'Already exists'

    try:
        ## Rough clean of HTML and get TOC tables
        cleaner_html, toc = sanitize_html(raw_html, attrs_to_keep=['href', 'style', 'id', 'name'], remove_tables=True, verbose = 0)

        ## -------------------------------------------------------------
        ## Parse full filing without splitting into sections and headers
        ## -------------------------------------------------------------

        full_filing_clean = html_to_str(cleaner_html)

        ## exist_ok avoids a failure when a parallel worker creates the same
        ## company folder at the same time; parents creates 'full' on first use
        folder_full.mkdir(parents=True, exist_ok=True)

        functions.fast_store_json({'link' : link, 'filing_text' : full_filing_clean}, folder_full / filename)

        ## ----------------------------------------
        ## Advanced parse with sections and headers
        ## ----------------------------------------

        ## Set up the TOC for section identification
        toc_df = extract_toc_items(toc, cleaner_html)
        anchor_template = identify_toc_anchors(toc_df, cleaner_html)

        ## Extract sections
        section_list = extract_sections(toc_df, anchor_template, cleaner_html)

        ## Parse sections to identify headers, identify valid text, and remove HTML
        parsed_sub_section_list = parse_out_text_and_header(section_list)

        ## Final clean of the extracted data
        final_df = clean_final(parsed_sub_section_list)
        final_list = final_df.to_dict('records')

        ## Save / Return
        ret_dict = {
            'link' : link,
            'filing_details' : filing_dict,
            'filing_list' : final_list
        }

        folder_split.mkdir(parents=True, exist_ok=True)

        functions.fast_store_json(ret_dict, folder_split / filename)

        return True, (ret_dict if retrn else None)

    except Exception as e:
        ## Best effort: report the failure to the caller instead of raising
        return False, e

----------------------
# Run

# Params

In [28]:
## Run settings. Of these, only `parent_store_loc` and `retrn` are read by the cells
## below (in combo_func / the processing loop). run_all_for_filing passes its own
## literal values for attrs_to_keep / remove_tables / verbose to sanitize_html, and
## cols_to_keep is not used in the code visible in this notebook.
attrs_to_keep=['href', 'style', 'id', 'name']
remove_tables=True
verbose = 1
parent_store_loc = ePipeline / 'out'    ## root folder for the 'full' and 'split' output
retrn = False                           ## do not send parsed filings back from the workers
cols_to_keep = ['filingDate', 'reportDate', 'acceptanceDateTime', 'form', 'accessionNumber' , 'primaryDocument', 'size', 'isInlineXBRL']

## Filing meta data

In [29]:
## Pipeline folder of the a_6a_download_raw_filings step, relative to the current working directory
download_pipeline = Path.cwd() / '2_pipeline' / 'a_6a_download_raw_filings'

In [30]:
## Filing meta data; the cells below rely on its `uniqueID` and `link` columns
filing_df = pd.read_hdf(download_pipeline / 'out' / 'filing_df.h5')

## Identify done filings

In [31]:
## Same step name, but below externalPipelineFolder (defined outside this notebook, presumably by the preamble)
download_e_pipeline = externalPipelineFolder / 'a_6a_download_raw_filings'

In [32]:
## Folder holding one '<uniqueID>.json.gz' file per raw filing
raw_store_loc = download_e_pipeline / 'out' / 'raw'

## uniqueIDs for which a raw file is present
done_list = [
    raw_file.name.replace('.json.gz', '')
    for raw_file in raw_store_loc.glob('*.json.gz')
]

In [33]:
## Keep only the filings whose raw file was found above
done_download_df = filing_df[filing_df.uniqueID.isin(done_list)]

In [34]:
## Number of filings available for parsing
len(done_download_df)

81110

## Process filings

In [35]:
import warnings
## Silences every warning for the rest of this kernel session, not only for the parsing code
warnings.filterwarnings("ignore")

In [36]:
def combo_func(link:str, raw_store_loc = raw_store_loc, force_overwrite:bool=False):
    """Load the raw HTML of one filing from disk and parse it (worker for Parallel).

    Parameters
    ----------
    link : str
        EDGAR link of the filing.
    raw_store_loc : Path
        Folder with the raw ``<uniqueID>.json.gz`` files. The default is bound
        to the notebook-level ``raw_store_loc`` when this cell is executed.
    force_overwrite : bool
        Passed on to ``run_all_for_filing``.

    Returns
    -------
    tuple
        ``(status, result)`` as returned by ``run_all_for_filing``, with an
        exception converted to its message; ``(False, message)`` if loading or
        parsing raised. Output location and ``retrn`` come from the
        notebook-level ``parent_store_loc`` and ``retrn``.
    """
    with warnings.catch_warnings():
        try:
            warnings.simplefilter("ignore")
            filing_details = functions.extract_data_edgar_link(link)
            file_loc = raw_store_loc / f'''{filing_details['uniqueID']}.json.gz'''
            raw_html = functions.fast_load_json(file_loc)['raw']
            status, res = run_all_for_filing(link, raw_html, parent_store_loc.as_posix(), retrn = retrn, force_overwrite=force_overwrite)
        except Exception as e:
            return False, str(e)

    ## Return the outcome (previously dropped, so every successful call gave None).
    ## Exceptions are sent back as text, in line with the except branch above.
    if isinstance(res, Exception):
        res = str(res)
    return status, res

In [39]:
%%time
## Repeat until (almost) everything is parsed. A filing counts as done once its
## full-text file exists below parent_store_loc / 'full'.
previous_todo_count = None
while True:
    all_list = done_download_df.uniqueID.to_list()
    done_list = [x.name.replace('.json.gz', '') for x in (parent_store_loc / 'full').glob('**/*.json.gz')]
    todo_list = list(set(all_list).difference(set(done_list)))
    todo_df = done_download_df[done_download_df.uniqueID.isin(todo_list)]
    todo_df = todo_df.sample(len(todo_df.index))  ## shuffle the processing order
    todo_link_list = todo_df['link'].to_list()
    perc_left = len(todo_list) / len(all_list) * 100 if all_list else 0
    print(f'Number done: {len(done_list):,}, Number todo: {len(todo_list):,}, which is {perc_left:.0f}% left to complete.')

    ## Stop when the previous pass did not shrink the todo list: the remaining
    ## filings fail every time and another pass would loop forever.
    if previous_todo_count is not None and len(todo_link_list) >= previous_todo_count:
        print('No progress since the previous pass, stopping.')
        break
    previous_todo_count = len(todo_link_list)

    res_list = Parallel(n_jobs=20)(delayed(combo_func)(link) for link in todo_link_list)
    
    if len(todo_link_list) < 10:
        break

Number done: 81,109, Number todo: 1, which is 0% left to complete.
CPU times: user 1.28 s, sys: 524 ms, total: 1.81 s
Wall time: 2.86 s
