In [23]:
import pandas as pd
import glob

In [24]:
# Names of the per-target score columns; each one is read as a DataFrame
# column and, when it wins, written verbatim into the 'label' column.
targets = [
    'contract_num',
    'advertiser',
    'flight_from',
    'flight_to',
    'gross_amount',
]

In [35]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)

    The bar is written to stdout, preceded by a carriage return so that
    successive calls overwrite the same terminal line. A trailing newline is
    emitted once the job is complete (iteration == total).

    An empty job (total == 0) is drawn as a complete bar instead of raising
    ZeroDivisionError.
    """
    if total == 0:
        # Nothing to do counts as done; this also avoids dividing by zero.
        fraction = 1.0
        filledLength = length
    else:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total or total == 0:
        print()

In [74]:
def label_tokens(df: pd.DataFrame, target_labels):
    """
    Add a 'label' column naming the target each row scores highest for.

    Every row starts with the label '0'. For each name in `target_labels`
    (in order), the rows whose value in that column equals the column maximum
    are labelled with the column name, provided the maximum is greater than 0.
    A row that holds the maximum of several target columns ends up with the
    last such name in `target_labels`, because later targets overwrite
    earlier ones.

    @params:
        df            - Required  : frame with one numeric column per target (pd.DataFrame)
        target_labels - Required  : column names to consider (iterable of Str)
    @returns:
        The same `df` object, modified in place with the added 'label' column.
    """
    df['label'] = '0'
    for target_label in target_labels:
        label_max = df[target_label].max()
        # A NaN or non-positive maximum means no row matched this target.
        if label_max > 0:
            # .loc is the accessor for boolean-mask assignment; .at only
            # accepts a single scalar row/column pair.
            df.loc[df[target_label] == label_max, 'label'] = target_label
    return df

def collect_analytics(df: pd.DataFrame, analytics_df, target_labels, slug):
    """
    Append one row of per-target summary statistics for `df` to `analytics_df`.

    The row holds the document name (`slug`), the number of rows in `df`
    ('token_count') and, for every target column, its maximum plus the number
    of rows that are > 0, equal to a positive maximum, == 1.0, >= 0.75 and
    >= 0.5. Each count is reported twice: over all rows ('<target>_<stat>_count')
    and over the rows left after dropping duplicate ('token', 'label') pairs
    ('<target>_<stat>_distinct_count').

    @params:
        df            - Required  : frame with 'token', 'label' and one numeric column per target (pd.DataFrame)
        analytics_df  - Required  : rows collected so far; may be an empty frame (pd.DataFrame)
        target_labels - Required  : target column names to summarise (iterable of Str)
        slug          - Required  : value stored in the 'name' column
    @returns:
        A new DataFrame: `analytics_df` with the new row appended and the
        index renumbered. Neither input frame is modified.
    """
    analytics_dict = {'name': [slug], 'token_count': [df.shape[0]]}

    # The de-duplicated view does not depend on the target, so build it once
    # instead of once per target. drop_duplicates already returns a new frame.
    uniques_df = df.drop_duplicates(subset=['token', 'label'])

    for target_label in target_labels:
        maxval = df[target_label].max()
        analytics_dict[target_label + '_maxval'] = [maxval]
        # All-row counts first, then the distinct counts, in the same stat order.
        for frame, infix in ((df, ''), (uniques_df, '_distinct')):
            scores = frame[target_label]
            counts = (
                ('total', scores > 0),
                # Rows at the maximum only count when that maximum is positive.
                ('maxval', (scores == maxval) & (maxval > 0)),
                ('100perc', scores == 1.0),
                ('75perc', scores >= 0.75),
                ('50perc', scores >= 0.5),
            )
            for stat, mask in counts:
                analytics_dict[target_label + '_' + stat + infix + '_count'] = [int(mask.sum())]

    analytics_df = pd.concat([analytics_df, pd.DataFrame(analytics_dict)], ignore_index = True)
    return analytics_df

def process_files(src, dest, targets, analytics_path='../../data/data_analytics.tsv'):
    """
    Label every parquet file under `src` and write the results as TSV.

    For each file matching `src + '*.parquet'` the frame is read, labelled with
    label_tokens, summarised with collect_analytics and written to
    `dest + <slug> + '.tsv'`, where <slug> is the file name up to its first
    dot. The per-file summaries are written to `analytics_path` as one
    tab-separated table. A progress bar and a final count are printed.

    @params:
        src            - Required  : path prefix of the input files; string-concatenated, so include the trailing separator (Str)
        dest           - Required  : path prefix of the output files; string-concatenated, so include the trailing separator (Str)
        targets        - Required  : target column names passed on to the helpers (list of Str)
        analytics_path - Optional  : where the summary table is written (Str)
    """
    import os  # local import: only needed for portable file-name handling

    # Resolve the file list once, sorted, so the count matches the iteration
    # and the analytics rows come out in a reproducible order.
    filenames = sorted(glob.glob(src + '*.parquet'))
    total = len(filenames)

    analytics_rows = []
    processed = 0
    for filename in filenames:
        # basename handles the platform's path separator; the slug keeps the
        # original "everything before the first dot" rule.
        slug = os.path.basename(filename).split('.')[0]
        df = pd.read_parquet(filename)
        df = label_tokens(df, targets)
        # Collect one-row frames and concatenate once after the loop rather
        # than re-copying the growing table on every file.
        analytics_rows.append(collect_analytics(df, pd.DataFrame(), targets, slug))
        # `sep` must be passed by keyword: positional use is deprecated in pandas.
        df.to_csv(dest + slug + '.tsv', sep='\t')
        processed = processed + 1
        # Report after the work is done so the bar reaches 100% on the last
        # file and printProgressBar emits its closing newline.
        printProgressBar(processed, total)

    if analytics_rows:
        analytics_df = pd.concat(analytics_rows, ignore_index=True)
    else:
        analytics_df = pd.DataFrame()
    analytics_df.to_csv(analytics_path, sep='\t')
    print('Processed ' + str(processed) + ' out of ' + str(total) + ' files')

In [75]:
# Label every training parquet file, write one TSV per file into the
# training output folder, and write the analytics summary table.
process_files(
    src="../../thirparty/deepform/data/training/",
    dest="../../data/training/",
    targets=targets,
)

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 100.0% 
Processed 8990 out of 8990 files
