Load the .tsv files and check the characteristics of multi-labeled spans


In [1]:
import os
import pandas as pd
import dotenv
from tqdm import tqdm

# Raise pandas' display limits so wide/long frames are printed without truncation.
for _option, _value in {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}.items():
    pd.set_option(_option, _value)

# Load variables from the local .env file; kept as the last expression so the
# cell displays whether the file was loaded.
dotenv.load_dotenv('.env')

True

In [32]:
# Root directory used for all relative data paths below; read from the environment.
BASE_DIR = os.getenv('BASE_DIR')
if not BASE_DIR:
    # os.getenv returns None for a missing variable; fail here with a clear
    # message instead of with an opaque TypeError when the path is first used.
    raise RuntimeError("BASE_DIR is not set; define it in the environment or in the .env file")
SUG_STR = '1_validated_without_sugs' # 0_brat_originals, 1_validated_without_sugs, 2_validated_w_sugs
LANG = 'nl'

In [33]:
# Make BASE_DIR the working directory so the relative paths used later resolve against it.
os.chdir(BASE_DIR)

In [34]:
# Read the per-class annotation tables of both batches (b1 and b2) and stack them.
# df_dict maps the class abbreviation to its combined frame; df_total holds all classes.
df_dict = {}
for class_name in ['dis', 'med', 'proc', 'symp']:
    batch_frames = [
        pd.read_csv(f'b1/{SUG_STR}/{LANG}/{class_name}/tsv/dt4h_cardioccc_annotation_transfer_{LANG}_{class_name}.tsv', sep='\t'),
        pd.read_csv(f'b2/{SUG_STR}/{LANG}/{class_name}/tsv/dt4h_cardioccc_annotation_transfer_b2_{LANG}_{class_name}.tsv', sep='\t'),
    ]
    # ignore_index=True gives every row a unique label; without it each file's
    # own 0..n-1 labels are repeated, which makes label-based access ambiguous.
    df_dict[class_name] = pd.concat(batch_frames, ignore_index=True)
df_total = pd.concat(df_dict.values(), ignore_index=True)

In [35]:
# Number of annotation rows per tag.
df_total['tag'].value_counts()

tag
SYMPTOM       18253
DISEASE       17580
PROCEDURE     16157
MEDICATION     4317
Name: count, dtype: int64

In [36]:
# For every document, collect each ordered pair of annotation spans that share
# at least one character offset. Result: {document name: [{'span_1': (start, end),
# 'span_2': (start, end)}, ...]}; documents without any overlap are left out.
# Every overlapping pair appears twice, once per ordering, so counts derived
# from these lists include both directions.
intersecting_spans = {}
for name in tqdm(df_total['name'].unique()):
    # Boolean mask instead of a string-built query, so a document name that
    # contains quote characters cannot break the lookup.
    _df = df_total[df_total['name'] == name]

    span_tuples = list(zip(_df['start_span'].values, _df['end_span'].values))
    intersected_spans = []
    for k1, (start1, end1) in enumerate(span_tuples):
        for k2, (start2, end2) in enumerate(span_tuples):
            if k1 == k2:
                # never compare an annotation row with itself
                continue
            # Two half-open [start, end) ranges share an offset exactly when the
            # later start lies before the earlier end; this avoids materialising
            # a set of offsets for every comparison.
            if max(start1, start2) < min(end1, end2):
                intersected_spans.append({'span_1': (start1, end1), 'span_2': (start2, end2)})
    if intersected_spans:
        intersecting_spans[name] = intersected_spans

100%|██████████| 508/508 [00:21<00:00, 23.63it/s]


In [37]:
# intersecting_spans only has entries for documents with at least one overlap.
n_docs_with_overlap = len(intersecting_spans)
print(f"There are {n_docs_with_overlap} documents with overlapping spans")

There are 487 documents with overlapping spans


In [38]:
# Total number of recorded overlap entries summed over all documents.
tot = sum(map(len, intersecting_spans.values()))
print(tot)

6404


In [39]:
# One boolean flag column per class, all initialised to False.
for _flag_col in ('overlap_with_dis', 'overlap_with_med', 'overlap_with_proc', 'overlap_with_symp'):
    df_total[_flag_col] = False


In [40]:
# Maps each annotation tag to the boolean column recording an overlap with that tag.
TAG_TO_OVERLAP_COL = {
    'DISEASE': 'overlap_with_dis',
    'MEDICATION': 'overlap_with_med',
    'PROCEDURE': 'overlap_with_proc',
    'SYMPTOM': 'overlap_with_symp',
}


def _flag_overlaps(df, target_mask, other_mask, same_coords):
    """Flag the rows in ``target_mask`` as overlapping with the tags found in ``other_mask``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``tag`` column and the ``overlap_with_*`` columns; modified in place.
    target_mask : pd.Series of bool
        Selects the rows whose flags are set.
    other_mask : pd.Series of bool
        Selects the rows that the target rows overlap with.
    same_coords : bool
        True when both masks were built from the same (start, end) span, i.e.
        they select the same rows.
    """
    other_tag_counts = df.loc[other_mask, 'tag'].value_counts()
    for tag, flag_col in TAG_TO_OVERLAP_COL.items():
        n_other = other_tag_counts.get(tag, 0)
        if n_other == 0:
            continue
        rows = target_mask
        if same_coords and n_other == 1:
            # The only row carrying this tag at these coordinates is itself one
            # of the targets; it must not be flagged as overlapping with its own
            # tag. The other rows at the same coordinates are still flagged.
            rows = target_mask & (df['tag'] != tag)
        df.loc[rows, flag_col] = True


# For every recorded pair of overlapping spans, mark the rows at each span with
# the tags present at the other span.
for _name, spanlist in tqdm(intersecting_spans.items()):
    # The document mask is the same for every pair of this document: build it once.
    in_doc = df_total['name'] == _name
    for _spans in spanlist:
        (start_1, end_1), (start_2, end_2) = _spans['span_1'], _spans['span_2']
        conds1 = in_doc & (df_total['start_span'] == start_1) & (df_total['end_span'] == end_1)
        conds2 = in_doc & (df_total['start_span'] == start_2) & (df_total['end_span'] == end_2)
        same_coords = (start_1, end_1) == (start_2, end_2)

        # Apply both directions so the result does not depend on whether the
        # pair list also contains the mirrored (span_2, span_1) entry.
        _flag_overlaps(df_total, conds1, conds2, same_coords)
        _flag_overlaps(df_total, conds2, conds1, same_coords)


100%|██████████| 487/487 [00:59<00:00,  8.13it/s]


In [41]:
# Number of rows flagged / not flagged as overlapping with a DISEASE annotation.
df_total['overlap_with_dis'].value_counts()

overlap_with_dis
False    54783
True      1524
Name: count, dtype: int64

In [42]:
# Number of rows flagged / not flagged as overlapping with a MEDICATION annotation.
df_total['overlap_with_med'].value_counts()

overlap_with_med
False    56020
True       287
Name: count, dtype: int64

In [43]:
# Number of rows flagged / not flagged as overlapping with a PROCEDURE annotation.
df_total['overlap_with_proc'].value_counts()

overlap_with_proc
False    54136
True      2171
Name: count, dtype: int64

In [44]:
# Number of rows flagged / not flagged as overlapping with a SYMPTOM annotation.
df_total['overlap_with_symp'].value_counts()

overlap_with_symp
False    53798
True      2509
Name: count, dtype: int64

In [64]:
# Keep only the rows with at least one overlap flag set, ordered by document
# name and span start. DataFrame.any(axis=1) is the vectorised equivalent of
# calling Python's any() on each row through apply.
_overlap_flag_cols = ['overlap_with_dis', 'overlap_with_med', 'overlap_with_proc', 'overlap_with_symp']
df_with_overlap = (
    df_total[df_total[_overlap_flag_cols].any(axis=1)]
    .sort_values(by=['name', 'start_span'])
)

In [71]:
# Flag columns in the order of the table's "With ..." columns.
OVERLAP_COLS = ['overlap_with_dis', 'overlap_with_med', 'overlap_with_proc', 'overlap_with_symp']


def _print_overlap_table(df):
    """Print one tab-separated line per tag in ``df`` with overlap percentages.

    For each tag, the "Total" column is the percentage of that tag's rows with
    any overlap flag set (rounded to an integer); the remaining columns are the
    percentages of rows with the corresponding individual flag set (rounded to
    two decimals).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``tag`` column and the four boolean ``overlap_with_*`` columns.
    """
    print("Classification\tTotal\tWith Disease\tWith Medication\tWith Procedure\tWith Symptoms")
    print("-"*120)
    for _tag in df['tag'].unique():
        tag_rows = df[df['tag'] == _tag]
        tot = tag_rows[OVERLAP_COLS].any(axis=1).mean()*100
        dis_overlap, med_overlap, proc_overlap, symp_overlap = (
            tag_rows[col].mean()*100 for col in OVERLAP_COLS
        )
        print(f"{_tag}:\t{round(tot)}\t{round(dis_overlap,2)}\t\t{round(med_overlap,2)}\t\t{round(proc_overlap,2)}\t\t{round(symp_overlap,2)}")


print("OVERALL")
# The mean is taken over the rows of df_total (annotated spans), so this is the
# share of annotations with an overlap, not the share of documents.
overall_pct = df_total[OVERLAP_COLS].any(axis=1).mean()*100
print(f"Overlap in {round(overall_pct, 2)}% of the annotations")
_print_overlap_table(df_total)

print("\n\n")
# df_with_overlap holds only rows with at least one flag set, so "Total" is 100 here.
print("For annotations with overlap")
_print_overlap_table(df_with_overlap)

OVERALL
Overlap in 10.44% of the documents
Classification	Total	With Disease	With Medication	With Procedure	With Symptoms
------------------------------------------------------------------------------------------------------------------------
DISEASE:	7	2.71		0.46		2.2		3.32
MEDICATION:	7	2.22		0.88		4.31		0.23
PROCEDURE:	14	2.31		0.99		1.63		9.26
SYMPTOM:	11	3.17		0.05		7.32		2.3



For documents with overlap
Classification	Total	With Disease	With Medication	With Procedure	With Symptoms
------------------------------------------------------------------------------------------------------------------------
DISEASE:	100	36.59		6.15		29.67		44.81
MEDICATION:	100	31.48		12.46		60.98		3.28
PROCEDURE:	100	17.12		7.33		12.04		68.5
SYMPTOM:	100	27.68		0.43		63.98		20.11
