<a href="https://colab.research.google.com/github/Teasotea/BioNER-and-RD/blob/main/IOB_converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer as twt
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load Data

In [3]:
with open('/content/data0.txt') as f:
      eval_article = f.read()

In [4]:
train_df = pd.read_csv('/content/cdr_dner_train_df.csv')
test_df = pd.read_csv('/content/cdr_dner_test_df.csv')
dev_df = pd.read_csv('/content/cdr_dner_dev_df.csv')


In [5]:
CID_train_df = train_df[train_df['xloc'] == "CID"].reset_index(drop=True)
DNER_train_df = train_df[train_df['xloc'] != "CID"].reset_index(drop=True)

CID_test_df = test_df[test_df['xloc'] == "CID"].reset_index(drop=True)
DNER_test_df = test_df[test_df['xloc'] != "CID"].reset_index(drop=True)

CID_dev_df = dev_df[dev_df['xloc'] == "CID"].reset_index(drop=True)
DNER_dev_df = dev_df[dev_df['xloc'] != "CID"].reset_index(drop=True)

# Add iob-tags to dataset

In [6]:
DNER_train_df = DNER_train_df[DNER_train_df['xloc'].str.isnumeric() & DNER_train_df['yloc'].str.isnumeric()]
DNER_test_df = DNER_test_df[DNER_test_df['xloc'].str.isnumeric() & DNER_test_df['yloc'].str.isnumeric()]
DNER_dev_df = DNER_dev_df[DNER_dev_df['xloc'].str.isnumeric() & DNER_dev_df['yloc'].str.isnumeric()]

DNER_train_df = DNER_train_df.astype({"xloc": int, "yloc": int}, errors='raise')
DNER_test_df = DNER_test_df.astype({"xloc": int, "yloc": int}, errors='raise')
DNER_dev_df = DNER_dev_df.astype({"xloc": int, "yloc": int}, errors='raise')

In [7]:
DNER_train_df["text"] = DNER_train_df.apply(lambda x: x['title_source_text']+ ' '+x['source_text'], axis = 1)
DNER_train_df["loc"] = DNER_train_df.apply(lambda x: tuple((x['xloc'],x['yloc'])), axis = 1)

DNER_test_df['text'] = DNER_test_df.apply(lambda x: x['title_source_text']+ ' '+x['source_text'], axis = 1)
DNER_test_df["loc"] = DNER_test_df.apply(lambda x: tuple((x['xloc'],x['yloc'])), axis = 1)

DNER_dev_df['text'] = DNER_dev_df.apply(lambda x: x['title_source_text']+ ' '+x['source_text'], axis = 1)
DNER_dev_df["loc"] = DNER_dev_df.apply(lambda x: tuple((x['xloc'],x['yloc'])), axis = 1)

In [8]:
train_data_text_unp = pd.DataFrame(DNER_train_df["text"].unique(), columns=["sent"])
train_data_text_unp['tokens'] = train_data_text_unp.sent.apply(lambda x: list(twt().tokenize(x)))

test_data_text_unp = pd.DataFrame(DNER_test_df["text"].unique(), columns=["sent"])
test_data_text_unp['tokens'] = test_data_text_unp.sent.apply(lambda x: list(twt().tokenize(x)))

dev_data_text_unp = pd.DataFrame(DNER_dev_df["text"].unique(), columns=["sent"])
dev_data_text_unp['tokens'] = dev_data_text_unp.sent.apply(lambda x: list(twt().tokenize(x)))

In [9]:
def get_items( source_df, item="entity"):
  items_by_text = []
  for i in source_df["text"].unique():
    items_list = source_df[source_df["text"] == i][item].tolist()
    items_by_text.append(items_list)
  return items_by_text

In [10]:
def extend_df(df, source_d = DNER_train_df):
  df['entities'] = get_items(source_d,item="entity")
  df['names'] = get_items(source_d,item="name")
  df['loc'] = get_items(source_d,item="loc")
  df['name_ids'] = get_items(source_d,item="name_id")
  return df

In [11]:
train_data_text_unp = extend_df(train_data_text_unp, source_d = DNER_train_df)
train_data_text_unp["ent_loc"] = train_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

test_data_text_unp = extend_df(test_data_text_unp, source_d = DNER_test_df)
test_data_text_unp["ent_loc"] = test_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

dev_data_text_unp = extend_df(dev_data_text_unp, source_d = DNER_dev_df)
dev_data_text_unp["ent_loc"] = dev_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

In [12]:
for i in train_data_text_unp.iloc[1]:
  print(len(i), i)

419 Lidocaine-induced cardiac asystole. Intravenous administration of a single 50-mg bolus of lidocaine in a 67-year-old man resulted in profound depression of the activity of the sinoatrial and atrioventricular nodal pacemakers. The patient had no apparent associated conditions which might have predisposed him to the development of bradyarrhythmias; and, thus, this probably represented a true idiosyncrasy to lidocaine.
61 ['Lidocaine-induced', 'cardiac', 'asystole.', 'Intravenous', 'administration', 'of', 'a', 'single', '50-mg', 'bolus', 'of', 'lidocaine', 'in', 'a', '67-year-old', 'man', 'resulted', 'in', 'profound', 'depression', 'of', 'the', 'activity', 'of', 'the', 'sinoatrial', 'and', 'atrioventricular', 'nodal', 'pacemakers.', 'The', 'patient', 'had', 'no', 'apparent', 'associated', 'conditions', 'which', 'might', 'have', 'predisposed', 'him', 'to', 'the', 'development', 'of', 'bradyarrhythmias', ';', 'and', ',', 'thus', ',', 'this', 'probably', 'represented', 'a', 'true', 'idio

In [13]:
def get_iob(text, ent_loc):
    
    span_list = list(twt().span_tokenize(text))
    iob_list = []
    
    for start_sp, end_sp in span_list:
        iob_tag = 'O'
        for start, end in list(ent_loc.keys()):
              if int(start) == start_sp:
                iob_tag = 'B' + '-' + ent_loc[(start, end)]
                break
              elif int(start) < start_sp and end_sp <= int(end):
                iob_tag = 'I' + '-' + ent_loc[(start, end)]
                break
        iob_list.append(iob_tag)
    return iob_list

In [14]:
train_data_text_unp["iob_tags"] = train_data_text_unp.apply(lambda x: get_iob(x['sent'], x['ent_loc']), axis=1)
iob_train = train_data_text_unp[['tokens', 'iob_tags']]

test_data_text_unp["iob_tags"] = test_data_text_unp.apply(lambda x: get_iob(x['sent'], x['ent_loc']), axis=1)
iob_test = test_data_text_unp[['tokens', 'iob_tags']]

dev_data_text_unp["iob_tags"] = dev_data_text_unp.apply(lambda x: get_iob(x['sent'], x['ent_loc']), axis=1)
iob_dev = dev_data_text_unp[['tokens', 'iob_tags']]

In [15]:
for i in train_data_text_unp.iloc[1]:
  print(len(i), i)

419 Lidocaine-induced cardiac asystole. Intravenous administration of a single 50-mg bolus of lidocaine in a 67-year-old man resulted in profound depression of the activity of the sinoatrial and atrioventricular nodal pacemakers. The patient had no apparent associated conditions which might have predisposed him to the development of bradyarrhythmias; and, thus, this probably represented a true idiosyncrasy to lidocaine.
61 ['Lidocaine-induced', 'cardiac', 'asystole.', 'Intravenous', 'administration', 'of', 'a', 'single', '50-mg', 'bolus', 'of', 'lidocaine', 'in', 'a', '67-year-old', 'man', 'resulted', 'in', 'profound', 'depression', 'of', 'the', 'activity', 'of', 'the', 'sinoatrial', 'and', 'atrioventricular', 'nodal', 'pacemakers.', 'The', 'patient', 'had', 'no', 'apparent', 'associated', 'conditions', 'which', 'might', 'have', 'predisposed', 'him', 'to', 'the', 'development', 'of', 'bradyarrhythmias', ';', 'and', ',', 'thus', ',', 'this', 'probably', 'represented', 'a', 'true', 'idio

In [18]:
iob_train.to_csv('iob_train.csv', index=False)
iob_test.to_csv('iob_test.csv', index=False)
iob_dev.to_csv('iob_dev.csv', index=False)

In [17]:
iob_train

Unnamed: 0,tokens,iob_tags
0,"[Naloxone, reverses, the, antihypertensive, ef...","[B-Chemical, O, O, O, O, O, B-Chemical, O, O, ..."
1,"[Lidocaine-induced, cardiac, asystole., Intrav...","[B-Chemical, B-Disease, O, O, O, O, O, O, O, O..."
2,"[Suxamethonium, infusion, rate, and, observed,...","[B-Chemical, O, O, O, O, B-Disease, O, O, O, B..."
3,"[Galanthamine, hydrobromide, ,, a, longer, act...","[B-Chemical, I-Chemical, O, O, O, O, O, O, O, ..."
4,"[Effects, of, uninephrectomy, and, high, prote...","[O, O, O, O, O, O, O, O, B-Chemical, B-Disease..."
...,...,...
484,"[Visual, hallucinations, associated, with, zon...","[B-Disease, I-Disease, O, O, B-Chemical, B-Che..."
485,"[GLEPP1, receptor, tyrosine, phosphatase, (, P...","[O, O, B-Chemical, O, O, O, O, O, O, B-Chemica..."
486,"[Ticlopidine-induced, aplastic, anemia, :, rep...","[B-Chemical, B-Disease, I-Disease, O, O, O, O,..."
487,"[Facilitation, of, memory, retrieval, by, pre-...","[O, O, O, O, O, O, B-Chemical, O, O, O, O, O, ..."
