Load in .tsv and check characteristics of multi-labeled spans


In [125]:
import os
import pandas as pd
import dotenv
from tqdm import tqdm

# maximum rows and colums for pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

dotenv.load_dotenv('.env')

False

In [91]:
BASE_DIR = os.getenv('BASE_DIR')
SUG_STR = '2_validated_w_sugs' # 0_brat_originals, 2_validated_w_sugs
LANG = 'nl'

In [92]:
os.chdir(r''+BASE_DIR)

In [93]:
df_dict = {}
for CLASS in ['dis', 'med', 'proc', 'symp']:
    df1 = pd.read_csv(f'b1/{SUG_STR}/{LANG}/{CLASS}/tsv/dt4h_cardioccc_annotation_transfer_{LANG}_{CLASS}.tsv', sep='\t')
    df2 = pd.read_csv(f'b2/{SUG_STR}/{LANG}/{CLASS}/tsv/dt4h_cardioccc_annotation_transfer_b2_{LANG}_{CLASS}.tsv', sep='\t')
    df = pd.concat([df1, df2])
    df_dict[CLASS] = df
df_total = pd.concat(df_dict.values())

In [117]:
df_total.tag.value_counts()

tag
SYMPTOM       18253
DISEASE       17580
PROCEDURE     16157
MEDICATION     4317
Name: count, dtype: int64

In [96]:
intersecting_spans = {}
for name in tqdm(df_total.name.unique()):
    _df = df_total.query(f'name=="{name}"')

    span_tuples = zip(_df['start_span'].values, _df['end_span'].values)

    procd = set()
    intersected_spans = []
    for k1, (start1, end1) in enumerate(span_tuples):
        s1 = set(range(start1,end1))
        for k2, (start2, end2) in enumerate(span_tuples):
            if (k1 != k2): # & ((k1,k2) not in procd) & ((k2,k1) not in procd):
                s2 = set(range(start2,end2))

                if len(s1.intersection(s2))>0:
                    intersected_spans.append({'span_1': (start1, end1), 'span_2': (start2, end2)})

                procd.add((k1,k2))
                procd.add((k2,k1))
    if len(intersected_spans)>0:
        intersecting_spans[name] = intersected_spans

100%|██████████| 508/508 [00:01<00:00, 272.63it/s]


In [97]:
print(f"There are {len(intersecting_spans)} patients with overlapping spans" )

There are 31 patients with overlapping spans


In [98]:
tot = 0
for v in intersecting_spans.values():
    tot+=len(v)
print(tot)

42


In [107]:
df_total['overlap_with_dis'] = False
df_total['overlap_with_med'] = False
df_total['overlap_with_proc'] = False
df_total['overlap_with_symp'] = False


In [118]:
for _name, spanlist in tqdm(intersecting_spans.items()):
    for _spans in spanlist:
        conds1 = (df_total.name==_name) & (df_total.start_span==_spans['span_1'][0]) & (df_total.end_span==_spans['span_1'][1])
        conds2 = (df_total.name==_name) & (df_total.start_span==_spans['span_2'][0]) & (df_total.end_span==_spans['span_2'][1])

        class_1 = df_total.loc[conds1, 'tag'].tolist()
        class_2 = df_total.loc[conds2, 'tag'].tolist()

        if 'DISEASE' in class_2:
            df_total.loc[conds1, 'overlap_with_dis'] = True
        if 'MEDICATION' in class_2:
            df_total.loc[conds1, 'overlap_with_med'] = True
        if 'PROCEDURE' in class_2:
            df_total.loc[conds1, 'overlap_with_proc'] = True
        if 'SYMPTOM' in class_2:
            df_total.loc[conds1, 'overlap_with_symp'] = True 

        if 'DISEASE' in class_1:
            df_total.loc[conds2, 'overlap_with_dis'] = True
        if 'MEDICATION' in class_1:
            df_total.loc[conds2, 'overlap_with_med'] = True
        if 'PROCEDURE' in class_1:
            df_total.loc[conds2, 'overlap_with_proc'] = True
        if 'SYMPTOM' in class_1:
            df_total.loc[conds2, 'overlap_with_symp'] = True 


100%|██████████| 31/31 [00:00<00:00, 76.26it/s]


In [119]:
df_total.overlap_with_dis.value_counts()

overlap_with_dis
False    56265
True        42
Name: count, dtype: int64

In [120]:
df_total.overlap_with_med.value_counts()

overlap_with_med
False    56284
True        23
Name: count, dtype: int64

In [121]:
df_total.overlap_with_proc.value_counts()

overlap_with_proc
False    56300
True         7
Name: count, dtype: int64

In [122]:
df_total.overlap_with_symp.value_counts()

overlap_with_symp
False    56304
True         3
Name: count, dtype: int64

In [126]:
df_total[df_total[['overlap_with_dis', 
                   'overlap_with_med',
                   'overlap_with_proc',
                   'overlap_with_symp']].apply(lambda x: any(x), axis=1)].sort_values(by='name')

Unnamed: 0,name,tag,start_span,end_span,text,note,overlap_with_dis,overlap_with_med,overlap_with_proc,overlap_with_symp
1725,casos_clinicos_cardiologia1,MEDICATION,46,55,metamizol,,True,False,False,False
7430,casos_clinicos_cardiologia1,DISEASE,30,55,allergisch voor metamizol,,False,True,False,False
481,casos_clinicos_cardiologia112,MEDICATION,196,199,ASA,,True,False,False,False
2043,casos_clinicos_cardiologia112,DISEASE,180,199,Allergisch voor ASA,,False,True,False,False
776,casos_clinicos_cardiologia114,DISEASE,228,283,"zijn rechterschouder, waar hij leed aan chroni...",,False,False,True,False
696,casos_clinicos_cardiologia114,PROCEDURE,215,248,operatie aan zijn rechterschouder,,True,False,False,False
7020,casos_clinicos_cardiologia149,DISEASE,44,96,Allergisch voor amoxicilline-clavulaanzuur en ...,,False,True,False,False
1625,casos_clinicos_cardiologia149,MEDICATION,60,86,amoxicilline-clavulaanzuur,,True,False,False,False
5982,casos_clinicos_cardiologia163,DISEASE,84,109,allergie voor penicilline,,False,True,False,False
1350,casos_clinicos_cardiologia163,MEDICATION,98,109,penicilline,,True,False,False,False
