In [1]:
import os
import re
import configparser

In [2]:
# Move the working directory up one level; the relative paths used in later cells
# ('data.ini', the results directories, the output directory) resolve from there.
# NOTE(review): this is not idempotent - re-running the cell climbs one more level each time.
os.chdir('../')

In [3]:
# Prefix of the output location; a later cell appends the dataset name to it by string concatenation.
BASE_OUT_PATH = 'cherry_picked/'

In [4]:
# Model identifiers.
# NOTE(review): no cell visible in this notebook reads this list - confirm it is still needed.
models_names = ['adapter_16', 'adapter_2', 'bert_ft_2']

In [5]:
# Load settings from data.ini (resolved against the current working directory).
# ConfigParser.read returns the list of files it managed to parse and silently skips
# missing ones, so the echoed ['data.ini'] confirms that the file was found.
parser = configparser.ConfigParser()
parser.read('data.ini')

['data.ini']

In [6]:
# NOTE(review): N is not read by any cell visible in this notebook - confirm its purpose.
N = 10

# Directory that is scanned for prediction files by the "labels performance" cells.
base_results_path = 'official_results/predictions/'
# NOTE(review): not read by any cell visible in this notebook.
onoe_base_results_path = 'results/avgs_stds/'

In [7]:
def extract_files(suffixes, base_path):
    """Return the paths of the entries in ``base_path`` whose name matches any pattern.

    Args:
        suffixes: iterable of regular-expression patterns. Each one is applied to
            the bare entry name with ``re.search``, so it may match anywhere in
            the name, not only at the end.
        base_path: directory to list.

    Returns:
        A list with ``base_path`` joined to every matching entry name, in the
        order produced by ``os.listdir``. An entry is listed at most once, even
        when several patterns match it.
    """
    # Materialise once: the patterns are re-scanned for every directory entry,
    # which would exhaust a generator after the first entry.
    patterns = list(suffixes)
    return [
        # os.path.join keeps the result valid whether or not base_path ends with a separator.
        os.path.join(base_path, name)
        for name in os.listdir(base_path)
        # any() stops at the first hit, so a name matched by several patterns is not repeated.
        if any(re.search(pattern, name) for pattern in patterns)
    ]

In [8]:
from collections import defaultdict
import numpy as np
def trimmed_stats(x, sampled = True):
    """Mean and standard deviation of ``x`` after dropping its smallest and largest value.

    Args:
        x: array-like of numbers.
        sampled: when true the standard deviation is computed with ``ddof=1``,
            otherwise with ``ddof=0``.

    Returns:
        Tuple ``(mean, std)`` of the values that remain once the first and the
        last element of the sorted data have been removed.
    """
    ordered = np.sort(x)
    # Slice off one value at each end of the sorted data.
    inner = ordered[1:-1]
    ddof = 1 if sampled else 0
    return inner.mean(), inner.std(ddof = ddof)

def extract_f1s_from_onoe(f):
    """Read per-type F1 scores from a tab-separated results file.

    The first line is skipped as a header. On every other non-blank line the
    first column is the type name (surrounding whitespace stripped) and the
    second column is parsed as its F1 score. When a type occurs more than
    once, only its first occurrence is kept.

    Args:
        f: path of the file to read.

    Returns:
        Dict mapping type name to F1 score (float).

    Raises:
        IndexError: if a non-blank line has no second column.
        ValueError: if the second column is not a number.
    """
    f1s = {}
    with open(f, 'r') as inp:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(inp, None)
        for line in inp:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the file) carry no score.
                continue
            elems = line.split('\t')
            typ = elems[0].strip()
            # The result dict itself records which types were already seen, so no
            # separate bookkeeping list is required.
            if typ not in f1s:
                f1s[typ] = float(elems[1])
    return f1s

def extract_f1s_from_file(f):
    """Collect one score per type and section from a predictions file and average them.

    The first line of the file is taken as the section delimiter; every later
    line identical to it advances the section counter (which starts at 0).
    On all other lines the first tab-separated column, with every
    ``_<digits>`` run removed, is the type name and the fourth column is the
    score.

    The first row ever seen for a type only registers the type (at section 0);
    its score is not stored. A later row stores its score when the current
    section differs from the section last stored for that type, so at most one
    score per type and section is kept.

    NOTE(review): as a consequence, rows that appear before the second
    delimiter line (section 0) never contribute a score - confirm this is the
    intended handling of the first section.

    Args:
        f: path of the file to read.

    Returns:
        Dict mapping each type that has at least one stored score to the first
        element returned by ``trimmed_stats`` for its scores (the trimmed mean).

    Raises:
        IndexError: if the file is empty, or a row whose score is needed has
            fewer than four columns.
    """
    suffix_pattern = r'_[0-9]+'
    scores_by_type = defaultdict(list)
    last_section = {}

    with open(f, 'r') as handle:
        rows = handle.readlines()

    separator = rows[0]
    section = 0
    for row in rows[1:]:
        if row == separator:
            section += 1
            continue

        fields = row.split('\t')
        type_name = re.sub(suffix_pattern, '', fields[0])
        if type_name not in last_section:
            # First sighting: register the type only, the score is not kept.
            last_section[type_name] = 0
        elif last_section[type_name] != section:
            scores_by_type[type_name].append(float(fields[3]))
            last_section[type_name] = section

    return {
        type_name: trimmed_stats(scores)[0]
        for type_name, scores in scores_by_type.items()
    }

In [17]:
def extract_f1_per_label(f1s, label):
    """Pick the score of one label out of every entry that has it.

    Args:
        f1s: mapping from a key to a per-label score mapping.
        label: label to look up.

    Returns:
        Dict from each key of ``f1s`` whose value contains ``label`` to the
        score stored under that label; keys without the label are omitted.
    """
    per_key = {}
    for key, scores in f1s.items():
        if label in scores:
            per_key[key] = scores[label]
    return per_key

# Analysis

In [89]:
# `pick` and `write` come from the project-local cherry_picker module.
from cherry_picker import pick, write
dataset = 'OntoNotes'

# Build the picker for the chosen dataset from the settings loaded out of data.ini;
# the returned object is called with a label in a later cell.
# NOTE(review): n presumably caps the number of picked examples - confirm in cherry_picker.
picker = pick(dataset_name = dataset, n = 20, data=parser)

read_dataset: 100%|██████████| 251039/251039 [00:04<00:00, 55418.72it/s]
reading dataset and generate sentences...: 100%|██████████| 251039/251039 [00:00<00:00, 456783.48it/s]
get classes...: 100%|██████████| 251039/251039 [00:00<00:00, 595082.65it/s]
get_class_sentences...: 100%|██████████| 89/89 [00:00<00:00, 763.99it/s]
read_dataset: 100%|██████████| 8963/8963 [00:00<00:00, 35752.30it/s]
reading dataset and generate sentences...: 100%|██████████| 8963/8963 [00:00<00:00, 448187.83it/s]
get classes...: 100%|██████████| 8963/8963 [00:00<00:00, 673924.79it/s]
get_class_sentences...: 100%|██████████| 67/67 [00:00<00:00, 15057.51it/s]
read_dataset: 100%|██████████| 528/528 [00:00<00:00, 54226.07it/s]
reading dataset and generate sentences...: 100%|██████████| 528/528 [00:00<00:00, 639501.16it/s]
get classes...: 100%|██████████| 528/528 [00:00<00:00, 687334.73it/s]
get_class_sentences...: 100%|██████████| 35/35 [00:00<00:00, 26546.23it/s]
read_dataset: 100%|██████████| 11483/11483 [00:00<0

In [105]:
# Label inspected by this cell and by the per-label lookup at the end of the notebook.
label = '/other/product'

picked = picker(label)
# Output location is the base output prefix followed directly by the dataset name.
out_directory = BASE_OUT_PATH + dataset

# NOTE(review): `write` is project-local; presumably it stores `picked` for `label` under out_directory - confirm.
write(out_directory, label, picked)

### Per-label performance

In [106]:
# Regular expressions selecting the prediction files by the dataset they were tested on;
# extract_files applies each of them to the file names with re.search.
# NOTE(review): the '.' before 'txt' is unescaped, so it matches any character, and the
# patterns are not anchored at the end of the name.
# The commented-out alternatives below switch the selection to another test dataset.
suffixes = [r'tested_on_onto_filtered_with_[a-z]+_test.txt',
            r'tested_on_onto_test.txt']

#suffixes = [r'tested_on_bbn_filtered_with_[a-z]+_test.txt',
#            r'tested_on_bbn_test.txt']

#suffixes = [r'tested_on_choi_filtered_with_[a-z]+_test.txt',
#            r'tested_on_choi_test.txt']

#suffixes = [r'tested_on_figer_filtered_with_[a-z]+_test.txt',
#            r'tested_on_figer_test.txt']

#onoes_suffixes = ['OntoNotes_preds.txt']

In [107]:
# Paths of all prediction files in base_results_path whose name matches one of the patterns.
files = extract_files(suffixes, base_results_path)

In [108]:
# Per-type scores of every selected file, keyed by '<model>_trained_on_<train dataset>'.
f1s = {}

for f in files:
    # Training dataset: the token between the first two underscores after 'trained_on'.
    train_dataset = f.split('trained_on')[1].split('_')[1]
    # Model name: the part of the file name (directories stripped) before '_trained_on'.
    model_name = f.split('_trained_on')[0].split('/')[-1]
    # NOTE(review): files sharing model and training dataset map to the same key, so the
    # one processed last overwrites the others - confirm the selection cannot collide.
    f1s[model_name + '_trained_on_' + train_dataset] = extract_f1s_from_file(f)

In [109]:
# Score of `label` for every model/training-set key whose results contain it;
# keys without that label are left out of the displayed dict.
extract_f1_per_label(f1s, label)

{'adapter_16_trained_on_bbn': 0.215525,
 'adapter_16_trained_on_choi': 0.618675,
 'adapter_16_trained_on_figer': 0.4262,
 'adapter_16_trained_on_onto': 0.0,
 'adapter_2_trained_on_bbn': 0.21702500000000002,
 'adapter_2_trained_on_choi': 0.5823,
 'adapter_2_trained_on_figer': 0.36430000000000007,
 'adapter_2_trained_on_onto': 0.0,
 'bert_ft_0_trained_on_bbn': 0.0874,
 'bert_ft_0_trained_on_choi': 0.432875,
 'bert_ft_0_trained_on_figer': 0.15425,
 'bert_ft_0_trained_on_onto': 0.10505,
 'bert_ft_2_trained_on_bbn': 0.144025,
 'bert_ft_2_trained_on_choi': 0.5556500000000001,
 'bert_ft_2_trained_on_figer': 0.29822499999999996,
 'bert_ft_2_trained_on_onto': 0.0}