In [126]:
import pandas as pd
import numpy as np
import os, re

from nltk.tokenize import RegexpTokenizer
from flair.data import Sentence, Token

import pickle
from tqdm.auto import tqdm

In [127]:
text_dir = 'datasets\CADEC\CADEC.v2\cadec/text'
original_dir = 'datasets\CADEC\CADEC.v2\cadec/original'
sct_dir = 'datasets\CADEC\CADEC.v2\cadec/sct'


suf = '_v1'

data_save_dir = 'data'+suf


# assert not os.path.isdir(data_save_dir)
# os.mkdir(data_save_dir)


In [128]:
def add_headers(df, name):
    n = name + '_header'
    df[n] = ''
    
    for file in pd.unique(df.file):
        groups = df.loc[(df.file==file) & (~df.item.isna()),:].groupby('item')
        
        for _, group in groups:
            indices = group.index.tolist()
            # print(group, indices)
            if len(indices) == 1:
                df.at[indices[0], n] = 'B'  #singleton
            else:
                df.at[indices[0], n] = 'B' # start
                df.at[indices[-1], n] = 'E' # end
                for i in indices[1:-1]:
                    df.at[i, n] = 'I' # between 

def reorder_items(df, df_loc, change, basedon):
    sequence_dict = {}
    # df = df.copy()
    if change not in df.columns:
        df[change] = np.nan
    for i in df_loc.index:
        row = df.loc[i]
        item = row[basedon]
        if item in sequence_dict:
            sequence_id = sequence_dict[item]
        else:
            sequence_id = 'T{}'.format(len(sequence_dict) + 1)
            sequence_dict[item] = sequence_id
        df.loc[i, change] = sequence_id

def add_overlaps(df):
    n = len(df)
    # loop through rows and check for overlaps
    for i in range(1, n):
        # check if current start position is less than previous end position (equal is not overlapping)
        if df.at[i, "start"] < df.loc[i-1, "end"]:
            if df.at[i-1, "overlap"] == 0:
                df.loc[i-1, "overlap"] = 1
            df.loc[i, "overlap"] = df.loc[i-1, "overlap"] + 1
            
            if (df.at[i, "start"] != df.at[i-1, "start"]) or (df.at[i, "end"] != df.at[i-1, "end"]):
                df.loc[i-1, "shifted"] = True
                df.loc[i, "shifted"] = True
                
        elif df.at[i, "start"] > df.loc[i-1, "end"]:
            # add non-tagged sections
            df = pd.concat([df, pd.DataFrame([{'start':df.at[i-1, "end"],'end':df.at[i, "start"]}],index=[0])], ignore_index=True)
    return df.sort_values(by=["start","end"]).reset_index(drop=True)

In [129]:
# pattern = r"\b\w+\b|[^\w\s]{1}"
pattern = r"\b\w+\b|[^\w \t]{1}"

def my_word_tokenize(text):
    tokenizer = RegexpTokenizer(pattern)
    tokens = tokenizer.tokenize(text)
    return tokens

def get_word_indices(row):
    matches = list(re.finditer(pattern, row.text))
    indices = [(m.start()+row.start, m.end()+row.start) for m in matches]
    starts, ends = zip(*indices) if indices else ([], [])
    return pd.Series({'starts': starts, 'ends': ends})


# contractions_pattern = re.compile(r"\b(can't|won't|wouldn't|shouldn't|doesn't|didn't|haven't|hasn't|isn't|aren't|i'm|you're|he's|she's|it's|we're|they're)\b")

# def process_text(text):
            
#     text = text.lower()
    
#     text = contractions_pattern.sub(lambda match: match.group(0).replace("'", ""), text)


#     return text

    
    


In [130]:
text_concat = ''

# get a list of filenames in the 'text' directory
filenames = [a.strip('.txt') for a in os.listdir(text_dir)]


if not os.path.isfile(data_save_dir+'/cadec.pkl'):
    df_all = pd.DataFrame()
    for filename in tqdm(filenames, total=len(filenames)):
        if os.stat(os.path.join(text_dir, filename + '.txt')).st_size == 0 or os.stat(os.path.join(original_dir, filename + '.ann')).st_size == 0 or \
            os.stat(os.path.join(sct_dir, filename + '.ann')).st_size == 0:
            continue
        
        # print(filename)
        df_text = open(os.path.join(text_dir, filename + '.txt'), encoding='latin').read().lower()
        
        text_concat += df_text
        
        df_original = pd.read_csv(os.path.join(original_dir, filename + '.ann'), sep='\t', header=None, encoding='latin', names=['item', 'temp', 'annotated_phrase'], on_bad_lines='skip')
        df_original = df_original[~df_original['item'].str.contains('#')]
        
        for i in df_original.itertuples():
            if ';' in i.temp:
                startends = df_original.loc[df_original.item == i.item, 'temp'].item().split(' ', 1)[1].split(';')
                k = 0
                for j in startends:
                    t = df_original.loc[df_original.item == i.item].head(1)
                    t.temp = df_original.loc[df_original.item == i.item, 'temp'].head(1).item().split(' ', 1)[0] + ' ' + j
                    df_original = pd.concat([df_original, t]).reset_index(drop=True)
                    k += 1
        
        df_original = df_original[~df_original.temp.str.contains(';')]
        df_original[['ner', 'start', 'end']] = df_original.temp.str.split(' ', expand=True)
        df_original.start = df_original.start.astype(int)
        df_original.end = df_original.end.astype(int)
        df_original.drop('temp',axis=1,inplace=True)
        df_original = df_original.sort_values(by=["start","end"]).reset_index(drop=True)
        
        df_original["overlap"] = 0
        df_original["shifted"] = False
        
        df_original = add_overlaps(df_original)
        
        if df_original.at[0, 'start'] != 0:
            df_original = pd.concat([df_original,pd.DataFrame([{'start':0,'end':df_original.at[0, 'start']}],index=[0])], ignore_index=True).sort_values(by=["start","end"]).reset_index(drop=True)
        
        n = len(df_text)
        i = df_original.index[-1]
        
        if df_original.at[i, 'end'] < n:
            df_original = pd.concat([df_original,pd.DataFrame([{'start':df_original.at[i, 'end'],'end':n}],index=[0])], ignore_index=True).sort_values(by=["start","end"]).reset_index(drop=True)

        # print(df_original.sort_values(['start','end']))
        df_original['text'] = [df_text[int(start):int(end)] for start,end in zip(df_original.start, df_original.end)]
        df_original['text_tok'] = df_original.text.apply(lambda x: my_word_tokenize(x))
        df_original = pd.concat([df_original, df_original.apply(get_word_indices, axis=1)], axis=1)

        df_sct = pd.read_csv(os.path.join(sct_dir, filename + '.ann'), sep='\t', header=None, encoding='latin', names=['item', 'meddra', 'annotated_phrase'])
        df_sct['meddra'].fillna('',inplace=True)
        df_sct.loc[df_sct.meddra.str.contains('|'),'meddra'] = df_sct.loc[df_sct.meddra.str.contains('|'),'meddra'].str.split('|').str[1]
        df_sct.meddra.fillna('CONCEPT_LESS',inplace=True)
        
        df_sct['item'] = df_sct['item'].str[1:]
        df_sct.drop('annotated_phrase', axis=1, inplace=True)

        df_merged = pd.merge(df_original, df_sct, on='item', how='left')
        df_merged['file'] = filename
        
        reorder_items(df_merged, df_merged.loc[~df_merged.item.isna(),:], 'item', 'item')
        
        # find original file errors
        df_merged_tok = []
        for i in df_merged.text_tok:
            df_merged_tok += i
        
        df_text_tok = set(my_word_tokenize(df_text))
        df_merged_tok = set(df_merged_tok)
        
        text_merged = df_text_tok - df_merged_tok
        merged_text = df_merged_tok - df_text_tok
        
        if text_merged or merged_text:
            print('\t',filename)
            print([a for a in df_text_tok if a in text_merged])
            print([a for a in df_merged_tok if a in merged_text])
            

        
        df_all = pd.concat([df_all, df_merged], ignore_index=True)
        
    df_all = df_all.applymap(lambda x: x.strip() if isinstance(x, str) else x) # remove trailing spaces
    df_all.loc[:,['ner','meddra']] = df_all.loc[:,['ner','meddra']].fillna('O')
    df_all.overlap = df_all.overlap.fillna(0) 
    df_all.meddra = df_all.meddra.str.replace(' ','_')
    df_all.ner = df_all.ner.str.replace(' ','_')
    df_all.to_csv(data_save_dir+'/cadec.csv', index=False)
    df_all.to_pickle(data_save_dir+'/cadec.pkl')
else:
    df_all = pd.read_pickle(data_save_dir+'/cadec.pkl')

# make sure no punctuation is missing
print(''.join(sorted(set(''.join(sorted(pd.unique(df_all.loc[df_all.text_tok.str.len() == 0, 'text'])))))))

df_all




Unnamed: 0,item,annotated_phrase,ner,start,end,overlap,shifted,text,text_tok,starts,ends,meddra,file
0,,,O,0,9,0.0,,i feel a,"[i, feel, a]","(0, 2, 7)","(1, 6, 8)",O,ARTHROTEC.1
1,T1,bit drowsy,ADR,9,19,0.0,False,bit drowsy,"[bit, drowsy]","(9, 13)","(12, 19)",Drowsy,ARTHROTEC.1
2,,,O,19,29,0.0,,& have a,"[&, have, a]","(20, 22, 27)","(21, 26, 28)",O,ARTHROTEC.1
3,T2,little blurred vision,ADR,29,50,0.0,False,little blurred vision,"[little, blurred, vision]","(29, 36, 44)","(35, 43, 50)",Blurred_vision_-_hazy,ARTHROTEC.1
4,,,O,50,62,0.0,,", so far no","[,, so, far, no]","(50, 52, 55, 59)","(51, 54, 58, 61)",O,ARTHROTEC.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20532,,,O,408,528,0.0,,"there seems to be the occasional flare up, but...","[there, seems, to, be, the, occasional, flare,...","(409, 415, 421, 424, 427, 431, 442, 448, 450, ...","(414, 420, 423, 426, 430, 441, 447, 450, 451, ...",O,ZIPSOR.5
20533,T11,Zipsor,Drug,528,534,0.0,False,zipsor,[zipsor],"(528,)","(534,)",diclofenac,ZIPSOR.5
20534,,,O,534,540,0.0,,", the","[,, the]","(534, 536)","(535, 539)",O,ZIPSOR.5
20535,T12,pain,Symptom,540,544,0.0,False,pain,[pain],"(540,)","(544,)",Pain,ZIPSOR.5


### made corrections to original files
original files in folder: `datasets\CADEC\CADEC.v2\cadec\original - unedited version of edited files`

afterwards, 2 cells below should return empty set

In [131]:
res = []
for i in df_all.text_tok:
    res += i
# res

# 37
set(res) - set(my_word_tokenize(text_concat))

{'they',
 'fourth',
 'lifestyle',
 'worrying',
 'vitorin',
 '600',
 'weather',
 'brathing',
 '70s',
 '6months',
 'although',
 'contacting',
 '4',
 'stepped',
 'osophragus',
 'spazms',
 'hooked',
 'thib',
 'nomal',
 'motrin',
 'bags',
 'mestrual',
 'neuropothy',
 'blured',
 'curled',
 'opposite',
 'happends',
 'pending',
 'life',
 'hormonally',
 '40',
 'fraterity',
 'require',
 'leaking',
 'jack',
 'ne',
 'nine',
 'disk',
 'pardon',
 'orthotics',
 '1996',
 'larynx',
 'stating',
 'artritic',
 'ibu',
 '96',
 'mom',
 'vulnerable',
 'crabby',
 'subsequently',
 'working',
 'distracting',
 'affect',
 'coast',
 'tells',
 'possibly',
 'printout',
 'pills',
 'gift',
 'baltimore',
 'who',
 'despite',
 'nasonex',
 'flatulence',
 'abrupt',
 'cupboard',
 'menopause',
 'walking',
 '230',
 'investigating',
 'australianstatinvictims',
 'plunged',
 'damaging',
 'alternating',
 'air',
 'mos',
 'dying',
 'supply',
 'instaed',
 'confussion',
 'dec',
 'march',
 'hyperactive',
 'bypass',
 '51',
 'terribe',
 

In [132]:
# 121
set(my_word_tokenize(text_concat)) - set(res)

set()

### shifted overlaps (start and/or end dont match)

In [133]:
df_all.loc[(df_all.shifted == True) & (df_all.overlap > 0),:]
# -- df_all.loc[10121,'end'] = 219 # edit data directly instead / ignore

Unnamed: 0,item,annotated_phrase,ner,start,end,overlap,shifted,text,text_tok,starts,ends,meddra,file
2385,T8,renal and respiratory failure,ADR,421,440,1.0,True,respiratory failure,"[respiratory, failure]","(421, 433)","(432, 440)",Respiratory_failure,DICLOFENAC-SODIUM.7
2386,T7,renal failure,ADR,432,440,2.0,True,failure,[failure],"(433,)","(440,)",Renal_failure,DICLOFENAC-SODIUM.7
20219,T3,pain in stomach,ADR,215,222,1.0,True,pain in,"[pain, in]","(215, 220)","(219, 222)",Stomach_ache,VOLTAREN.34
20220,T4,pain in my kidney area,ADR,215,237,2.0,True,pain in my kidney area,"[pain, in, my, kidney, area]","(215, 220, 223, 226, 233)","(219, 222, 225, 232, 237)",Renal_pain,VOLTAREN.34


In [134]:
# number of files with overlaps
len(pd.unique(df_all.loc[df_all.overlap > 0,'file']))

312

## 8 is the max tags. need to test cutoffs
### max number of tags --> determines how many conll columns

In [135]:
# same results for 'ner' as well
df_all.loc[df_all.overlap > 0,:].groupby(['file','start','end'])['meddra'].count().sort_values(ascending=False).head(60)

file           start  end
LIPITOR.997    0      7      8
LIPITOR.620    24     26     6
LIPITOR.794    16     18     6
LIPITOR.207    29     46     6
               283    300    6
LIPITOR.787    885    891    6
LIPITOR.662    89     91     6
               76     80     6
LIPITOR.573    17     25     6
LIPITOR.851    224    226    6
LIPITOR.668    100    104    5
LIPITOR.92     17     28     5
LIPITOR.592    102    106    5
LIPITOR.574    94     100    5
LIPITOR.989    30     46     5
ARTHROTEC.105  60     68     5
LIPITOR.574    59     70     5
LIPITOR.669    100    104    5
LIPITOR.339    111    120    4
LIPITOR.204    221    235    4
LIPITOR.470    123    127    4
LIPITOR.246    226    233    4
LIPITOR.556    182    198    4
LIPITOR.840    77     82     4
LIPITOR.273    66     77     4
LIPITOR.531    30     32     4
LIPITOR.274    91     100    4
LIPITOR.281    27     32     4
LIPITOR.588    0      7      4
LIPITOR.748    0      13     4
LIPITOR.575    44     46     4
LIPITOR.567  

## Create exploded dataframe for conll format

In [136]:
if not os.path.isfile(data_save_dir+'/cadec_exploded.pkl'):
        
    df_exploded = df_all.drop(['annotated_phrase','text'],axis=1)
    df_exploded = df_exploded.explode('text_tok')[['file', 'item', 'start', 'end', 'text_tok', 'ner', 'meddra', 'overlap', 'shifted']].reset_index(drop=True)
    # df_exploded = df_exploded.explode('text_tok')[['file', 'item', 'start', 'end', 'text_tok', 'ner', 'ner_header', 'meddra', 'meddra_header', 'overlap']]

    starts_exploded = df_all['starts'].explode().reset_index(drop=True)
    ends_exploded = df_all['ends'].explode().reset_index(drop=True)

    # add headers
    add_headers(df_exploded,'ner')
    add_headers(df_exploded,'meddra')

    assert df_exploded.ner_header.equals(df_exploded.meddra_header)
    df_exploded = df_exploded.drop('meddra_header',axis=1).rename(columns={'ner_header':'header'})

    # print(df_exploded)
    df_exploded = pd.concat([df_exploded, starts_exploded, ends_exploded], axis=1)
    df_exploded = df_exploded.loc[~df_exploded.text_tok.isna(),:].reset_index(drop=True)
    df_exploded.loc[df_exploded.text_tok =='\n',['text_tok','ner','meddra']] = '##' # represents sentence boundaries
    df_exploded.starts = df_exploded.starts.astype(int)
    df_exploded.ends = df_exploded.ends.astype(int)

    # # check
    # df_exploded['tok'] = None
    # for filename in tqdm(filenames, total=len(filenames)):
    #     df_text = open(os.path.join(text_dir, filename + '.txt'), encoding='latin').read().lower()
    #     i = df_exploded.file == filename
    #     df_exploded.loc[i, 'tok'] = [df_text[int(start):int(end)] for start,end in zip(df_exploded.loc[i, 'starts'], df_exploded.loc[i, 'ends'])]
    
    
    df_exploded.to_csv(data_save_dir+'/cadec_exploded.csv', index=False)
    df_exploded.to_pickle(data_save_dir+'/cadec_exploded.pkl')
else:
    df_exploded = pd.read_pickle(data_save_dir+'/cadec_exploded.pkl')
    df_exploded.starts = df_exploded.starts.astype(int)
    df_exploded.ends = df_exploded.ends.astype(int)
    
    
df_exploded


Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,O,O,0.0,,,0,1
1,ARTHROTEC.1,,0,9,feel,O,O,0.0,,,2,6
2,ARTHROTEC.1,,0,9,a,O,O,0.0,,,7,8
3,ARTHROTEC.1,T1,9,19,bit,ADR,Drowsy,0.0,False,B,9,12
4,ARTHROTEC.1,T1,9,19,drowsy,ADR,Drowsy,0.0,False,E,13,19
...,...,...,...,...,...,...,...,...,...,...,...,...
129940,ZIPSOR.5,,544,601,recognizable,O,O,0.0,,,574,586
129941,ZIPSOR.5,,544,601,side,O,O,0.0,,,587,591
129942,ZIPSOR.5,,544,601,effects,O,O,0.0,,,592,599
129943,ZIPSOR.5,,544,601,.,O,O,0.0,,,599,600


### made corrections to original files
original files in folders
- `datasets\CADEC\CADEC.v2\cadec\sct - unedited version of edited files`
- `datasets\CADEC\CADEC.v2\cadec\meddra - unedited version of edited files` (not needed)

afterwards, cell below should be empty

In [137]:
df_exploded.loc[(df_exploded.meddra == 'O') & ((df_exploded.ner != 'O')),:]

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends


In [138]:
# fixed another 3 files
df_exploded.loc[(df_exploded.meddra == 'O'),:]

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,O,O,0.0,,,0,1
1,ARTHROTEC.1,,0,9,feel,O,O,0.0,,,2,6
2,ARTHROTEC.1,,0,9,a,O,O,0.0,,,7,8
5,ARTHROTEC.1,,19,29,&,O,O,0.0,,,20,21
6,ARTHROTEC.1,,19,29,have,O,O,0.0,,,22,26
...,...,...,...,...,...,...,...,...,...,...,...,...
129939,ZIPSOR.5,,544,601,no,O,O,0.0,,,571,573
129940,ZIPSOR.5,,544,601,recognizable,O,O,0.0,,,574,586
129941,ZIPSOR.5,,544,601,side,O,O,0.0,,,587,591
129942,ZIPSOR.5,,544,601,effects,O,O,0.0,,,592,599


In [139]:
df_exploded.loc[(df_exploded.shifted == True) & (df_exploded.overlap > 0),:]

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
15223,DICLOFENAC-SODIUM.7,T8,421,440,respiratory,ADR,Respiratory_failure,1.0,True,B,421,432
15224,DICLOFENAC-SODIUM.7,T8,421,440,failure,ADR,Respiratory_failure,1.0,True,E,433,440
15225,DICLOFENAC-SODIUM.7,T7,432,440,failure,ADR,Renal_failure,2.0,True,E,433,440
127926,VOLTAREN.34,T3,215,222,pain,ADR,Stomach_ache,1.0,True,B,215,219
127927,VOLTAREN.34,T3,215,222,in,ADR,Stomach_ache,1.0,True,I,220,222
127928,VOLTAREN.34,T4,215,237,pain,ADR,Renal_pain,2.0,True,B,215,219
127929,VOLTAREN.34,T4,215,237,in,ADR,Renal_pain,2.0,True,I,220,222
127930,VOLTAREN.34,T4,215,237,my,ADR,Renal_pain,2.0,True,I,223,225
127931,VOLTAREN.34,T4,215,237,kidney,ADR,Renal_pain,2.0,True,I,226,232
127932,VOLTAREN.34,T4,215,237,area,ADR,Renal_pain,2.0,True,E,233,237


## average number of unqiue meddras per document

In [140]:
df_exploded.groupby(['file'])['meddra'].nunique().mean()

8.313659359190556

In [141]:
for i in df_exploded.columns:
    print(i, len(pd.unique(df_exploded[i])))

file 1186
item 54
start 1372
end 1435
text_tok 6627
ner 7
meddra 1043
overlap 9
shifted 3
header 4
starts 2583
ends 2583


## parameter to test in preprocessing step
## `dont filter out '##' sentence boundaries`
### most likely will filter out by counts_exploded

In [142]:
df_exploded[['ner','meddra']].value_counts().reset_index()

Unnamed: 0,ner,meddra,count
0,O,O,101260
1,##,##,7489
2,Drug,Lipitor,1081
3,ADR,CONCEPT_LESS,1016
4,ADR,Severe_pain,722
...,...,...,...
1296,Finding,Senility,1
1297,Drug,Seroquel,1
1298,Drug,Solaraze,1
1299,Drug,Tagamet,1


In [143]:
meddra_counts = df_exploded[['ner','meddra']].value_counts().reset_index().rename(columns={'count':'counts_exploded'}). \
    merge(df_all[['ner','meddra']].value_counts().reset_index().rename(columns={'count':'counts'}), on=['ner', 'meddra'])
meddra_counts = meddra_counts[meddra_counts.ner != 'O']
meddra_counts['counts_exploded_rank'] = meddra_counts.counts_exploded.rank(ascending=False).astype(int)
meddra_counts.to_csv(data_save_dir+'/meddra_counts.csv',index=False)
meddra_counts

Unnamed: 0,ner,meddra,counts_exploded,counts,counts_exploded_rank
1,Drug,Lipitor,1081,1073,1
2,ADR,CONCEPT_LESS,1016,266,2
3,ADR,Severe_pain,722,247,3
4,ADR,Myalgia,619,305,4
5,ADR,Pain,292,260,5
...,...,...,...,...,...
1295,Finding,Senility,1,1,1233
1296,Drug,Seroquel,1,1,1233
1297,Drug,Solaraze,1,1,1233
1298,Drug,Tagamet,1,1,1233


In [144]:
pd.unique(meddra_counts.counts_exploded_rank)

array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
         12,   13,   14,   16,   17,   18,   20,   22,   23,   24,   25,
         26,   27,   28,   29,   30,   31,   32,   33,   34,   35,   36,
         37,   39,   40,   41,   43,   44,   46,   48,   49,   50,   51,
         53,   54,   56,   57,   59,   60,   62,   63,   64,   65,   66,
         68,   69,   70,   72,   73,   76,   78,   79,   81,   83,   85,
         87,   90,   93,   96,  100,  103,  108,  114,  117,  122,  127,
        133,  140,  146,  151,  155,  160,  167,  175,  185,  195,  208,
        219,  229,  240,  249,  257,  269,  286,  305,  329,  355,  387,
        425,  464,  517,  588,  687,  829, 1039, 1233])

In [145]:
meddra_counts.loc[meddra_counts.ner == 'ADR',:]

Unnamed: 0,ner,meddra,counts_exploded,counts,counts_exploded_rank
2,ADR,CONCEPT_LESS,1016,266,2
3,ADR,Severe_pain,722,247,3
4,ADR,Myalgia,619,305,4
5,ADR,Pain,292,260,5
6,ADR,Arthralgia,280,163,6
...,...,...,...,...,...
1269,ADR,Labyrinthitis,1,1,1233
1270,ADR,Dysphagia,1,1,1233
1276,ADR,Lupus_erythematosus,1,1,1233
1279,ADR,Cataract,1,1,1233


In [146]:
meddra_counts.loc[meddra_counts.meddra == 'CONCEPT_LESS',:]

Unnamed: 0,ner,meddra,counts_exploded,counts,counts_exploded_rank
2,ADR,CONCEPT_LESS,1016,266,2
13,Drug,CONCEPT_LESS,211,144,13
40,Finding,CONCEPT_LESS,89,34,40
91,Symptom,CONCEPT_LESS,42,15,90
121,Disease,CONCEPT_LESS,32,17,122


In [147]:
df_exploded

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,O,O,0.0,,,0,1
1,ARTHROTEC.1,,0,9,feel,O,O,0.0,,,2,6
2,ARTHROTEC.1,,0,9,a,O,O,0.0,,,7,8
3,ARTHROTEC.1,T1,9,19,bit,ADR,Drowsy,0.0,False,B,9,12
4,ARTHROTEC.1,T1,9,19,drowsy,ADR,Drowsy,0.0,False,E,13,19
...,...,...,...,...,...,...,...,...,...,...,...,...
129940,ZIPSOR.5,,544,601,recognizable,O,O,0.0,,,574,586
129941,ZIPSOR.5,,544,601,side,O,O,0.0,,,587,591
129942,ZIPSOR.5,,544,601,effects,O,O,0.0,,,592,599
129943,ZIPSOR.5,,544,601,.,O,O,0.0,,,599,600


In [148]:
meddra_counts.drop('ner',axis=1)

Unnamed: 0,meddra,counts_exploded,counts,counts_exploded_rank
1,Lipitor,1081,1073,1
2,CONCEPT_LESS,1016,266,2
3,Severe_pain,722,247,3
4,Myalgia,619,305,4
5,Pain,292,260,5
...,...,...,...,...
1295,Senility,1,1,1233
1296,Seroquel,1,1,1233
1297,Solaraze,1,1,1233
1298,Tagamet,1,1,1233


In [149]:
df_exploded.merge(meddra_counts.drop('ner',axis=1), on=['meddra'], how='left')

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends,counts_exploded,counts,counts_exploded_rank
0,ARTHROTEC.1,,0,9,i,O,O,0.0,,,0,1,,,
1,ARTHROTEC.1,,0,9,feel,O,O,0.0,,,2,6,,,
2,ARTHROTEC.1,,0,9,a,O,O,0.0,,,7,8,,,
3,ARTHROTEC.1,T1,9,19,bit,ADR,Drowsy,0.0,False,B,9,12,19.0,14.0,219.0
4,ARTHROTEC.1,T1,9,19,drowsy,ADR,Drowsy,0.0,False,E,13,19,19.0,14.0,219.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151835,ZIPSOR.5,,544,601,recognizable,O,O,0.0,,,574,586,,,
151836,ZIPSOR.5,,544,601,side,O,O,0.0,,,587,591,,,
151837,ZIPSOR.5,,544,601,effects,O,O,0.0,,,592,599,,,
151838,ZIPSOR.5,,544,601,.,O,O,0.0,,,599,600,,,


In [150]:
# # # didnt change anything yet
# df_exploded = df_exploded.drop([a for a in df_exploded.columns if a.startswith('counts')],axis=1).reset_index(drop=True) # for running the cell multiple times
# df_exploded = df_exploded.merge(meddra_counts.drop('ner',axis=1), on='meddra', how='left')
# df_exploded

In [151]:
df_exploded

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,O,O,0.0,,,0,1
1,ARTHROTEC.1,,0,9,feel,O,O,0.0,,,2,6
2,ARTHROTEC.1,,0,9,a,O,O,0.0,,,7,8
3,ARTHROTEC.1,T1,9,19,bit,ADR,Drowsy,0.0,False,B,9,12
4,ARTHROTEC.1,T1,9,19,drowsy,ADR,Drowsy,0.0,False,E,13,19
...,...,...,...,...,...,...,...,...,...,...,...,...
129940,ZIPSOR.5,,544,601,recognizable,O,O,0.0,,,574,586
129941,ZIPSOR.5,,544,601,side,O,O,0.0,,,587,591
129942,ZIPSOR.5,,544,601,effects,O,O,0.0,,,592,599
129943,ZIPSOR.5,,544,601,.,O,O,0.0,,,599,600


## Aggregate overlaps into lists

In [152]:
df_exploded2 = df_exploded.copy()

if not os.path.isfile(data_save_dir+'/cadec_exploded_2.pkl'):

    for filename in tqdm(filenames, total=len(filenames)):
        temp = df_exploded2.loc[df_exploded2.file==filename,['text_tok','starts','ends','ner','meddra','header']]
        temp = temp.loc[temp.duplicated(subset=['starts','ends'],keep=False),:]
        # temp.loc[temp.text_tok =='##',:]
        if len(temp) != 0:
            # print('='*20)
            # print(temp)
            i = set(temp.index)
            replace_indices = []
            for group in temp.groupby(['starts','ends']):
                replace_indices.append(group[1].index[0])
            drop_indices = sorted(i-set(replace_indices))
            temp2 = temp.groupby(['starts','ends']).agg(lambda x: x).reset_index(drop=True).drop('text_tok',axis=1)
            temp2.index = replace_indices
            temp.update(temp2)
            temp.drop_duplicates(['starts','ends'], inplace=True)
            df_exploded2.drop(drop_indices, inplace=True)
            df_exploded2.update(temp)
            # print('-'*20)
            # print(temp)
            # print('='*20)
            
    df_exploded2.reset_index(drop=True, inplace=True)
    
    df_exploded2[['header','ner','meddra']] = df_exploded2[['header','ner','meddra']].applymap(lambda x: [x] if isinstance(x, str) else x)
    df_exploded2.starts = df_exploded2.starts.astype(int)
    df_exploded2.ends = df_exploded2.ends.astype(int)
    
    df_exploded2.to_csv(data_save_dir+'/cadec_exploded_2.csv', index=False)
    df_exploded2.to_pickle(data_save_dir+'/cadec_exploded_2.pkl')
else:
    df_exploded2 = pd.read_pickle(data_save_dir+'/cadec_exploded_2.pkl')
    df_exploded2.starts = df_exploded2.starts.astype(int)
    df_exploded2.ends = df_exploded2.ends.astype(int)
    
df_exploded2

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,[O],[O],0.0,,[],0,1
1,ARTHROTEC.1,,0,9,feel,[O],[O],0.0,,[],2,6
2,ARTHROTEC.1,,0,9,a,[O],[O],0.0,,[],7,8
3,ARTHROTEC.1,T1,9,19,bit,[ADR],[Drowsy],0.0,False,[B],9,12
4,ARTHROTEC.1,T1,9,19,drowsy,[ADR],[Drowsy],0.0,False,[E],13,19
...,...,...,...,...,...,...,...,...,...,...,...,...
128626,ZIPSOR.5,,544,601,recognizable,[O],[O],0.0,,[],574,586
128627,ZIPSOR.5,,544,601,side,[O],[O],0.0,,[],587,591
128628,ZIPSOR.5,,544,601,effects,[O],[O],0.0,,[],592,599
128629,ZIPSOR.5,,544,601,.,[O],[O],0.0,,[],599,600


In [153]:
list(pd.unique(df_exploded.ner))

['O', 'ADR', '##', 'Drug', 'Disease', 'Symptom', 'Finding']

In [154]:
for i in list(pd.unique(df_exploded.ner)):
    print(i, len(pd.unique(df_exploded.loc[df_exploded.ner==i,'meddra'])))

O 1
ADR 800
## 1
Drug 124
Disease 109
Symptom 83
Finding 183


In [155]:
df_exploded2

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts,ends
0,ARTHROTEC.1,,0,9,i,[O],[O],0.0,,[],0,1
1,ARTHROTEC.1,,0,9,feel,[O],[O],0.0,,[],2,6
2,ARTHROTEC.1,,0,9,a,[O],[O],0.0,,[],7,8
3,ARTHROTEC.1,T1,9,19,bit,[ADR],[Drowsy],0.0,False,[B],9,12
4,ARTHROTEC.1,T1,9,19,drowsy,[ADR],[Drowsy],0.0,False,[E],13,19
...,...,...,...,...,...,...,...,...,...,...,...,...
128626,ZIPSOR.5,,544,601,recognizable,[O],[O],0.0,,[],574,586
128627,ZIPSOR.5,,544,601,side,[O],[O],0.0,,[],587,591
128628,ZIPSOR.5,,544,601,effects,[O],[O],0.0,,[],592,599
128629,ZIPSOR.5,,544,601,.,[O],[O],0.0,,[],599,600


In [156]:
# ##           7489
# .            7119
# !             205
# ?              67
# )              17
# etc            14
# ,               4
# html            3
# pain            2
# /               2
# 325nguyen       1
# ]               1
# 4               1
# 67              1
# taken           1
# lipitor         1
# it              1
# dr              1
# ago             1
# med             1
# 2               1
# pills           1
# comment         1
# drug            1
# minutes         1
# arthrotec       1
# stiffness       1
# them            1
# same            1
# weeks           1
# 58              1
# Name: text_tok, dtype: int64

In [157]:
idx = df_exploded2.index.get_indexer_for(df_exploded2[df_exploded2.text_tok	=='##'].index)
n = 1
test = df_exploded2.iloc[np.unique(np.concatenate([np.arange(max(i-n,0), min(i+n, len(df_exploded2)))
                                            for i in idx]))]
# a = ['.','!','?']
test.loc[~test.text_tok.isin(['##','.']) & test.text_tok.isin([a for a in """\),/]"""]),['file','text_tok']]
# test.loc[~test.text_tok.str.contains("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ]"""),:]
# test.loc[~test.text_tok.str.contains("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ]"""),:].text_tok.value_counts()
# test
# test.text_tok.value_counts()



Unnamed: 0,file,text_tok
3577,ARTHROTEC.137,)
3818,ARTHROTEC.137,)
3910,ARTHROTEC.137,)
6737,ARTHROTEC.30,","
9723,ARTHROTEC.59,/
10443,ARTHROTEC.63,/
15582,LIPITOR.10,","
27880,LIPITOR.187,)
27951,LIPITOR.187,)
38150,LIPITOR.273,)


In [158]:
def process_token(df):
    prev_tokens = list(df.text_tok)
    # print(df.index)
    # print(prev_tokens, token)
    # If '.' or '?' or '!' before '##', drop the ##
    if prev_tokens[-2] in '.!?':
        return [df.index[-1]]
    
    # If ')',']', or an alphanumeric token before '##', do nothing (## already replaced by .)
    if prev_tokens[-2].isalnum() or prev_tokens[-2] in '])':
        return []

    # If ',','.' are the 1-2 rows before '##', remove the row with ','
    if len(prev_tokens) >= 3 and prev_tokens[-3] == ',' and prev_tokens[-2] == '.':
        return [df.index[-3]]
    
    # If any other punctuation before '##', remove it
    if re.match(r'[^\w\s]', prev_tokens[-2]):
        return [df.index[-2]]
    
    # # Otherwise, do nothing
    # return []

def compute_starts(df):
    df['starts'] = 0
    start_pos = 0
    for _, row in enumerate(df.itertuples(), 0):
        idx = row.Index - df.index[0]
        if idx > 0:
            start_pos += 1
        df.at[row.Index, 'starts'] = start_pos
        start_pos += len(row.text_tok)

In [159]:
if not os.path.isfile(data_save_dir+'/cadec_exploded_3.pkl'):
    df_exploded3 = df_exploded2.copy()
    # df_exploded3.drop([a for a in df_exploded3.columns if a.startswith('counts')],axis=1,inplace=True)
    count_columns = [a for a in df_exploded3.columns if a.startswith('counts')]
    
    rows_to_drop = []
    for filename, doc in tqdm(df_exploded3.groupby('file'), total=len(filenames)):

        # Find the indices of '##' tokens in the current document
        indices = doc.index[doc['text_tok'] == '##']

        # Replace each '##' token with '.'
        for idx in indices:
            doc.loc[idx, 'text_tok'] = '.'
            doc.loc[idx, 'ner'] = ['O']
            doc.loc[idx, 'meddra'] = ['O']
            doc.loc[idx, count_columns] = np.NaN
        
        indices_to_drop = []
        # Apply the processing function to the 1-2 rows before each '.'
        for idx in indices:
            prev_indices = list(range(max(0, idx-2), idx)) + [idx]
            res = process_token(doc.loc[prev_indices,:])
            rows_to_drop.extend(res)
            indices_to_drop.extend(res)
        
        doc.drop(indices_to_drop, inplace=True)
        compute_starts(doc)
        df_exploded3.update(doc)

    df_exploded3.drop(rows_to_drop, inplace=True)
    df_exploded3.drop('ends', axis=1, inplace=True)

    df_exploded3.reset_index(drop=True,inplace=True)
    df_exploded3.starts = df_exploded3.starts.astype(int)
    df_exploded3.to_csv(data_save_dir+'/cadec_exploded_3.csv', index=False)
    df_exploded3.to_pickle(data_save_dir+'/cadec_exploded_3.pkl')
else:
    df_exploded3 = pd.read_pickle(data_save_dir+'/cadec_exploded_3.pkl')
    df_exploded3.starts = df_exploded3.starts.astype(int)
df_exploded3

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts
0,ARTHROTEC.1,,0,9,i,[O],[O],0.0,,[],0
1,ARTHROTEC.1,,0,9,feel,[O],[O],0.0,,[],2
2,ARTHROTEC.1,,0,9,a,[O],[O],0.0,,[],7
3,ARTHROTEC.1,T1,9,19,bit,[ADR],[Drowsy],0.0,False,[B],9
4,ARTHROTEC.1,T1,9,19,drowsy,[ADR],[Drowsy],0.0,False,[E],13
...,...,...,...,...,...,...,...,...,...,...,...
121194,ZIPSOR.5,,544,601,no,[O],[O],0.0,,[],583
121195,ZIPSOR.5,,544,601,recognizable,[O],[O],0.0,,[],586
121196,ZIPSOR.5,,544,601,side,[O],[O],0.0,,[],599
121197,ZIPSOR.5,,544,601,effects,[O],[O],0.0,,[],604


In [160]:
# # idx = df_exploded3.index.get_indexer_for(df_exploded3[df_exploded3.text_tok	=='##'].index)
# idx = df_exploded3.index.get_indexer_for(df_exploded3[df_exploded3.counts_exploded.isna()].index)
# n = 1
# test = df_exploded3.iloc[np.unique(np.concatenate([np.arange(max(i-n,0), min(i+n, len(df_exploded3)))
#                                             for i in idx]))]
# # a = ['.','!','?']
# # test.loc[~test.text_tok.isin(['##','.']) & test.text_tok.isin([a for a in """\),/]"""]),['file','text_tok']]
# # test.loc[~test.text_tok.str.contains("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ]"""),:]
# # test.loc[~test.text_tok.str.contains("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ]"""),:].text_tok.value_counts()
# # test
# test.text_tok.value_counts()



## remove contractions

In [161]:
contractions = {"ain't", "aren't", "can't", "couldn't", "didn't", "doesn't", "don't", "hadn't", "hasn't", "haven't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", 
                "i'll", "i'm", "i've", "isn't", "it'd", "it'll", "it's", "let's", "might've", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", 
                "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "we've", "weren't", "what'd", "what's", "when'd", "when'll", 
                "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"}

def fix_contractions(df):
    # Loop through the tokens and start indices in the DataFrame
    rows_to_remove = []
    i = df.index[0] + 1
    while i < df.index[-1]:
        # Check if the current token is the first token in a contraction
        current_token = df.loc[i,"text_tok"]
        if current_token == "'":
            # print(i)
            # print(df.loc[i-1])
            # Join the previous, current, and next tokens into a string
            prev_token = df.loc[i-1,"text_tok"]
            next_token = df.loc[i+1,"text_tok"]
            contraction = (prev_token + current_token + next_token).lower()
            
            # Check if the contraction exists in the list of contractions
            if contraction in contractions:
                # Combine the consecutive tokens containing the contraction into a single token
                df.loc[i-1,"text_tok"] = prev_token + next_token
                rows_to_remove.append(i+1)
                rows_to_remove.append(i)
                
                i += 1
                
        i += 1
    
    # Drop the rows containing the separate tokens of contractions
    df = df.drop(rows_to_remove)
    
    return df, rows_to_remove


In [162]:
if not os.path.isfile(data_save_dir+'/cadec_exploded_4.pkl'):
    df_exploded4 = df_exploded3.copy()
    # df_exploded4.drop([a for a in df_exploded4.columns if a.startswith('counts')],axis=1,inplace=True)
    count_columns = [a for a in df_exploded4.columns if a.startswith('counts')]
    
    rows_to_drop = []
    for filename, doc in tqdm(df_exploded4.groupby('file'), total=len(filenames)):
        doc, rows_to_drop0 = fix_contractions(doc)
        rows_to_drop.extend(rows_to_drop0)
        compute_starts(doc)
        df_exploded4.update(doc)
        
    df_exploded4.drop(rows_to_drop, inplace=True)
    df_exploded4.reset_index(drop=True,inplace=True)
    df_exploded4.starts = df_exploded4.starts.astype(int)
    df_exploded4.to_csv(data_save_dir+'/cadec_exploded_4.csv', index=False)
    df_exploded4.to_pickle(data_save_dir+'/cadec_exploded_4.pkl')
else:
    df_exploded4 = pd.read_pickle(data_save_dir+'/cadec_exploded_4.pkl')
    df_exploded4.starts = df_exploded4.starts.astype(int)
df_exploded4

Unnamed: 0,file,item,start,end,text_tok,ner,meddra,overlap,shifted,header,starts
0,ARTHROTEC.1,,0,9,i,[O],[O],0.0,,[],0
1,ARTHROTEC.1,,0,9,feel,[O],[O],0.0,,[],2
2,ARTHROTEC.1,,0,9,a,[O],[O],0.0,,[],7
3,ARTHROTEC.1,T1,9,19,bit,[ADR],[Drowsy],0.0,False,[B],9
4,ARTHROTEC.1,T1,9,19,drowsy,[ADR],[Drowsy],0.0,False,[E],13
...,...,...,...,...,...,...,...,...,...,...,...
119374,ZIPSOR.5,,544,601,no,[O],[O],0.0,,[],583
119375,ZIPSOR.5,,544,601,recognizable,[O],[O],0.0,,[],586
119376,ZIPSOR.5,,544,601,side,[O],[O],0.0,,[],599
119377,ZIPSOR.5,,544,601,effects,[O],[O],0.0,,[],604


In [163]:
docs = []

for filename, doc in tqdm(df_exploded4.groupby('file'), total=len(filenames)):
    toks = [Token(text=token, start_position=idx) for token, idx in zip(doc['text_tok'], doc['starts'])]
    span = Sentence([])
    
    for token, ner, meddra, header in zip(toks, doc['ner'], doc['meddra'], doc['header']):
        
        # v1
        pairs = sorted(set([(a,b) for a,b in zip(ner,header)]))
        for tag,h in pairs:
            if tag != 'O':
                assert h != ''
                token.add_label(typename='ner', value=h+'-'+tag)
            else:
                token.add_label(typename='ner', value=tag)

        pairs = sorted(set([(a,b) for a,b in zip(meddra,header)]))
        for tag,h in pairs:
            if tag != 'O':
                assert h != ''
                token.add_label(typename='meddra', value=h+'-'+tag)
            else:
                token.add_label(typename='meddra', value=tag)
        # v2
        # for n,tag,h in zip(ner,meddra,header):
        #     if tag != 'O':
        #         token.add_label(typename=n, value=h+'-'+tag if h != '' else tag)

        span._add_token(token)
        
    for tok in span:
        tok.sentence = span
        
    docs.append(span)
    
with open(data_save_dir+'/cadec_flair_objects.pkl', 'wb') as f:
    pickle.dump(docs, f)
    
docs

  0%|          | 0/1250 [00:00<?, ?it/s]



  4%|▎         | 44/1250 [00:00<00:02, 433.04it/s]



  8%|▊         | 102/1250 [00:00<00:02, 515.29it/s]



 12%|█▏        | 155/1250 [00:00<00:02, 518.22it/s]



 17%|█▋        | 207/1250 [00:00<00:02, 432.16it/s]



 20%|██        | 252/1250 [00:00<00:02, 377.06it/s]



 23%|██▎       | 292/1250 [00:00<00:02, 369.48it/s]



 27%|██▋       | 333/1250 [00:00<00:02, 379.13it/s]



 30%|███       | 377/1250 [00:00<00:02, 395.77it/s]



 33%|███▎      | 418/1250 [00:01<00:02, 368.51it/s]



 37%|███▋      | 458/1250 [00:01<00:02, 376.19it/s]



 41%|████      | 507/1250 [00:01<00:01, 407.01it/s]



 44%|████▍     | 554/1250 [00:01<00:01, 424.35it/s]



 48%|████▊     | 598/1250 [00:01<00:01, 405.20it/s]



 51%|█████     | 640/1250 [00:01<00:01, 398.62it/s]



 54%|█████▍    | 681/1250 [00:01<00:01, 370.67it/s]



 58%|█████▊    | 719/1250 [00:01<00:01, 367.87it/s]



 61%|██████▏   | 766/1250 [00:01<00:01, 395.64it/s]



 65%|██████▍   | 809/1250 [00:02<00:01, 401.66it/s]



 68%|██████▊   | 850/1250 [00:02<00:01, 358.37it/s]



 71%|███████   | 888/1250 [00:02<00:01, 355.44it/s]



 74%|███████▍  | 925/1250 [00:02<00:00, 346.56it/s]



 77%|███████▋  | 964/1250 [00:02<00:00, 358.26it/s]



 80%|████████  | 1003/1250 [00:02<00:00, 364.96it/s]



 83%|████████▎ | 1040/1250 [00:03<00:02, 89.32it/s] 



 86%|████████▋ | 1080/1250 [00:03<00:01, 117.22it/s]



 90%|████████▉ | 1122/1250 [00:03<00:00, 151.42it/s]



 94%|█████████▍| 1176/1250 [00:04<00:00, 205.23it/s]



 95%|█████████▍| 1186/1250 [00:04<00:00, 288.56it/s]


[Sentence[117]: "i feel a bit drowsy & have a little blurred vision , so far no gastric problems . ive been on arthrotec 50 for over 10 years on and off , only taking it when i needed it . due to my arthritis getting progressively worse , to the point where i am in tears with the agony , gp ' s started me on 75 twice a day and i have to take it . every day for the next month to see how i get on , here goes . so far its been very good , pains almost gone , but i feel a bit weird , didnt have that when on 50 ." → ["i"/O/O, "feel"/O/O, "a"/O/O, "bit"/B-ADR/B-Drowsy, "drowsy"/E-ADR/E-Drowsy, "&"/O/O, "have"/O/O, "a"/O/O, "little"/B-ADR/B-Blurred_vision_-_hazy, "blurred"/I-ADR/I-Blurred_vision_-_hazy, "vision"/E-ADR/E-Blurred_vision_-_hazy, ","/O/O, "so"/O/O, "far"/O/O, "no"/O/O, "gastric"/B-ADR/B-Excessive_upper_gastrointestinal_gas, "problems"/E-ADR/E-Excessive_upper_gastrointestinal_gas, "."/O/O, "ive"/O/O, "been"/O/O, "on"/O/O, "arthrotec"/B-Drug/B-Arthrotec, "50"/O/O, "for"/O/O, "over"

In [164]:
for doc in docs[:10]:
    print(doc)
    for token in doc:
        print(token.text, token.get_labels())
        # print(token.text, token.get_labels('ADR'))

Sentence[117]: "i feel a bit drowsy & have a little blurred vision , so far no gastric problems . ive been on arthrotec 50 for over 10 years on and off , only taking it when i needed it . due to my arthritis getting progressively worse , to the point where i am in tears with the agony , gp ' s started me on 75 twice a day and i have to take it . every day for the next month to see how i get on , here goes . so far its been very good , pains almost gone , but i feel a bit weird , didnt have that when on 50 ." → ["i"/O/O, "feel"/O/O, "a"/O/O, "bit"/B-ADR/B-Drowsy, "drowsy"/E-ADR/E-Drowsy, "&"/O/O, "have"/O/O, "a"/O/O, "little"/B-ADR/B-Blurred_vision_-_hazy, "blurred"/I-ADR/I-Blurred_vision_-_hazy, "vision"/E-ADR/E-Blurred_vision_-_hazy, ","/O/O, "so"/O/O, "far"/O/O, "no"/O/O, "gastric"/B-ADR/B-Excessive_upper_gastrointestinal_gas, "problems"/E-ADR/E-Excessive_upper_gastrointestinal_gas, "."/O/O, "ive"/O/O, "been"/O/O, "on"/O/O, "arthrotec"/B-Drug/B-Arthrotec, "50"/O/O, "for"/O/O, "over"/