In [3]:
import glob
import os

import pandas as pd
from tqdm import tqdm

In [4]:
# Names of the per-token score columns (one per entity) handed to process_files.
targets = [
    'contract_num',
    'advertiser',
    'flight_from',
    'flight_to',
    'gross_amount',
]

In [16]:
def label_tokens(df: pd.DataFrame, target_labels):
    """Label each token row with the target it scores highest for.

    A string ``label`` column is added to (or overwritten on) ``df``. Every
    row starts as ``'0'``. Then, for each target in the given order, the rows
    whose value in that target's column equals the column maximum are
    labelled with the target name, but only if that maximum is greater than
    zero. If a row holds the maximum for several targets, the target listed
    last wins because later assignments overwrite earlier ones.

    Args:
        df: Token table containing one numeric score column per entry in
            ``target_labels``. It is modified in place.
        target_labels: Iterable of score-column names to evaluate.

    Returns:
        The same ``df`` object, now carrying the ``label`` column.

    Raises:
        KeyError: If a name in ``target_labels`` is not a column of ``df``.
    """
    df['label'] = '0'
    for target_label in target_labels:
        scores = df[target_label]
        label_max = scores.max()
        # A NaN maximum (empty or all-missing column) compares False here,
        # so such a target never produces a label.
        if label_max > 0:
            # Use .loc for the boolean mask: .at only addresses a single
            # scalar cell and does not accept a mask as the row indexer.
            df.loc[scores == label_max, 'label'] = target_label
    return df

def collect_analytics(df: pd.DataFrame, analytics_df, target_labels, slug):
    """Append one row of per-target label statistics for a document.

    Each token is labelled with a "probability" of belonging to a certain
    entity (one score column per target). The statistics give an overview of
    how the data is labelled. For every target ``t`` these columns are
    produced, once over all rows (``*_count``) and once over the rows left
    after dropping duplicate ``(token, label)`` pairs (``*_distinct_count``):

    * ``t_maxval``: highest score found in column ``t`` (all rows).
    * ``t_total_*``: rows with a score > 0.
    * ``t_maxval_*``: rows whose score equals ``t_maxval`` (0 when the
      maximum is not positive).
    * ``t_100perc_*``: rows with a score of exactly 1.0.
    * ``t_75perc_*`` / ``t_50perc_*``: rows with a score >= 0.75 / >= 0.5.

    The thresholds are inclusive for both the plain and the distinct counts,
    so a distinct count can never exceed its plain counterpart.

    Args:
        df: Token table with ``token`` and ``label`` columns plus one numeric
            score column per entry in ``target_labels``. Not modified.
        analytics_df: Accumulated statistics from earlier calls. ``None`` or
            a completely empty frame starts a new table.
        target_labels: Iterable of score-column names to summarise.
        slug: Identifier stored in the ``name`` column of the new row.

    Returns:
        A new DataFrame: ``analytics_df`` followed by the row for ``slug``,
        with a fresh integer index.
    """
    stats = {'name': [slug], 'token_count': [df.shape[0]]}
    # Independent of the target, so build it once instead of once per target.
    uniques_df = df.drop_duplicates(subset=['token', 'label'])
    for target_label in target_labels:
        maxval = df[target_label].max()
        stats[target_label + '_maxval'] = [maxval]
        # Plain counts first, then distinct counts, to keep the column order.
        for frame, suffix in ((df, '_count'), (uniques_df, '_distinct_count')):
            scores = frame[target_label]
            masks = (
                ('_total', scores > 0),
                # The scalar `maxval > 0` blanks the mask when nothing scored.
                ('_maxval', (scores == maxval) & (maxval > 0)),
                ('_100perc', scores == 1.0),
                ('_75perc', scores >= 0.75),
                ('_50perc', scores >= 0.5),
            )
            for stat, mask in masks:
                stats[target_label + stat + suffix] = [int(mask.sum())]
    row = pd.DataFrame(stats)
    # Nothing to append to yet: return the row as-is rather than
    # concatenating with an empty, column-less frame.
    if analytics_df is None or (analytics_df.empty and analytics_df.columns.empty):
        return row
    return pd.concat([analytics_df, row], ignore_index=True)

def process_files(src, dest, targets, analytics_path='../../data/ProPublica/data_analytics.tsv'):
    """Label every parquet file in ``src`` and export the results as TSV.

    For each ``*.parquet`` file found directly in ``src`` the token table is
    read, labelled with ``label_tokens``, summarised with
    ``collect_analytics`` and written to ``dest`` as ``<slug>.tsv``. The slug
    is the file's base name up to its first dot. After all files are handled,
    the accumulated analytics table is written to ``analytics_path`` and a
    short summary is printed.

    Args:
        src: Directory containing the input parquet files.
        dest: Existing directory that receives one TSV per input file.
        targets: Score-column names passed on to ``label_tokens`` and
            ``collect_analytics``.
        analytics_path: Destination of the analytics TSV. Defaults to the
            location that was previously hard-coded.

    Returns:
        None.
    """
    # Glob once and sort so the processing order (and thus the row order of
    # the analytics table) is deterministic.
    filenames = sorted(glob.glob(os.path.join(src, '*.parquet')))
    total = len(filenames)
    analytics_df = pd.DataFrame()
    processed = 0
    for filename in tqdm(filenames):
        # basename() understands the platform's path separators; splitting on
        # a literal backslash only worked for Windows-style paths.
        slug = os.path.basename(filename).split('.')[0]
        df = pd.read_parquet(filename)
        df = label_tokens(df, targets)
        analytics_df = collect_analytics(df, analytics_df, targets, slug)
        # `sep` is passed by keyword; positional use is deprecated in pandas.
        df.to_csv(os.path.join(dest, slug + '.tsv'), sep='\t')
        processed += 1
    analytics_df.to_csv(analytics_path, sep='\t')
    print('\r')
    print('Processed ' + str(processed) + ' out of ' + str(total) + ' files')

In [17]:
# Label all training parquet files and write the per-file TSVs.
process_files(
    "../../src/pro_publica_analysis/thirparty/deepform/data/training/",
    "../../data/ProPublica/training/",
    targets,
)

 |███████████████████████████████████████████████████████████████████████████████████████████████████-| 100.0% 
Processed 8990 out of 8990 files


In [6]:
# Display the first training parquet file that glob returns.
pd.read_parquet(
    glob.glob(
        "../../src/pro_publica_analysis/thirparty/deepform/data/training/*.parquet"
    )[0]
)

Unnamed: 0,page,x0,y0,x1,y1,token,contract_num,advertiser,flight_from,flight_to,gross_amount,tok_id,length,digitness,is_dollar,log_amount,label
0,0,423.000000,25.174000,439.447998,25.216999,Print,0.0,0.25,0.0,0.0,0.00,37,5,0.000000,0.0,0.000000,4
1,0,441.671997,25.174000,458.567993,25.216999,Date,0.0,0.25,0.0,0.0,0.00,8,4,0.000000,0.0,0.000000,4
2,0,474.001007,25.174000,505.136993,25.216999,08/09/12,0.0,0.19,0.0,0.0,0.00,0,8,0.750000,0.0,0.000000,4
3,0,533.250977,25.174000,551.931030,25.216999,Page,0.0,0.21,0.0,0.0,0.00,7,4,0.000000,0.0,0.000000,4
4,0,122.250000,39.945999,153.360001,40.015999,WCPO,0.0,0.44,0.0,0.0,0.00,786,4,0.000000,0.0,0.000000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3338,8,150.001007,652.109009,183.768997,652.145996,-----S-,0.0,0.05,0.0,0.0,0.00,49,7,0.000000,0.0,0.000000,4
3339,8,258.839996,652.943970,294.424011,652.987000,"$5,500.00",0.0,0.12,0.0,0.0,0.62,0,9,0.666667,1.0,8.612685,4
3340,8,319.678986,652.943970,335.247009,652.987000,0.0,0.0,0.16,0.0,0.0,0.62,0,4,0.750000,1.0,0.000000,4
3341,8,473.250000,668.867004,500.480011,668.921021,Totals,0.0,0.19,0.0,0.0,0.00,30,6,0.000000,0.0,0.000000,4
