<a href="https://colab.research.google.com/github/Teasotea/Token-Classification-for-Bio-articles/blob/main/IOB_converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer as twt
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the 'punkt' resource; per the NLTK docs it backs sent_tokenize /
# word_tokenize. The call reports "already up-to-date" when it is present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Data

In [None]:
# Read the evaluation article into memory as one string.
# The encoding is pinned so decoding does not depend on the platform's default
# locale. NOTE(review): assumes the file is UTF-8 - confirm against the data.
with open('/content/data0.txt', encoding='utf-8') as f:
    eval_article = f.read()

In [None]:
# Load the three annotation splits.
# xloc holds either a character offset or the marker "CID", and get_iob calls
# str.isnumeric() on both xloc and yloc. Reading the two columns as strings
# keeps read_csv from inferring integers for part of a column, which would
# make those .isnumeric() calls fail.
LOC_DTYPES = {'xloc': str, 'yloc': str}

train_df = pd.read_csv('/content/cdr_dner_train_df.csv', dtype=LOC_DTYPES)
test_df = pd.read_csv('/content/cdr_dner_test_df.csv', dtype=LOC_DTYPES)
dev_df = pd.read_csv('/content/cdr_dner_dev_df.csv', dtype=LOC_DTYPES)


In [None]:
# Split every raw frame into the rows whose xloc is the literal "CID" and all
# remaining rows; both halves get a fresh 0..n-1 index.
train_is_cid = train_df['xloc'] == "CID"
CID_train_df = train_df.loc[train_is_cid].reset_index(drop=True)
DNER_train_df = train_df.loc[~train_is_cid].reset_index(drop=True)

test_is_cid = test_df['xloc'] == "CID"
CID_test_df = test_df.loc[test_is_cid].reset_index(drop=True)
DNER_test_df = test_df.loc[~test_is_cid].reset_index(drop=True)

dev_is_cid = dev_df['xloc'] == "CID"
CID_dev_df = dev_df.loc[dev_is_cid].reset_index(drop=True)
DNER_dev_df = dev_df.loc[~dev_is_cid].reset_index(drop=True)

In [None]:
# Preview the first rows of the non-"CID" training annotations to check the schema.
DNER_train_df.head()

Unnamed: 0,text_id,xloc,yloc,name,entity,name_id,title_source_text,source_text
0,227508,0,8,Naloxone,Chemical,D009270,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ..."
1,227508,49,58,clonidine,Chemical,D003000,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ..."
2,227508,93,105,hypertensive,Disease,D006973,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ..."
3,227508,181,190,clonidine,Chemical,D003000,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ..."
4,227508,244,252,nalozone,Chemical,-1,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ..."


# Add IOB tags to the dataset

In [None]:
# For each split add two derived columns:
#   text - title and body joined by a single space
#   loc  - the (xloc, yloc) pair of the annotation, as a tuple
# Column-wise operations replace the row-by-row apply(axis=1) calls and the
# three copy-pasted stanzas; the resulting values are the same.
for dner_df in (DNER_train_df, DNER_test_df, DNER_dev_df):
    dner_df["text"] = dner_df['title_source_text'] + ' ' + dner_df['source_text']
    dner_df["loc"] = list(zip(dner_df['xloc'], dner_df['yloc']))

In [None]:
def tokenize_with_spans(text):
    """Tokenize ``text`` sentence by sentence, keeping character offsets.

    Returns a ``(tokens, spans)`` pair of equal length where ``spans[i]`` is
    the ``(start, end)`` offset of ``tokens[i]`` inside ``text``.

    Tokens and spans come from the same tokenizer pass. Previously tokens came
    from ``nltk.word_tokenize`` (which sentence-splits first) while spans came
    from ``TreebankWordTokenizer.span_tokenize`` on the whole text, so the two
    lists could differ in length and tags could be attached to the wrong token.
    """
    word_tokenizer = twt()
    tokens, spans = [], []
    cursor = 0
    for sentence in sent_tokenize(text):
        # Find the sentence in the source text so word offsets can be shifted
        # from sentence coordinates to document coordinates. str.index raises
        # ValueError instead of silently misaligning if it is not found.
        sent_start = text.index(sentence, cursor)
        for start, end in word_tokenizer.span_tokenize(sentence):
            tokens.append(sentence[start:end])
            spans.append((sent_start + start, sent_start + end))
        cursor = sent_start + len(sentence)
    return tokens, spans


def build_text_frame(source_df):
    """Return one row per unique ``text`` of ``source_df``.

    Columns: ``sent`` (the text), ``tokens`` and ``span_list`` (aligned
    one-to-one, see ``tokenize_with_spans``).
    """
    frame = pd.DataFrame(source_df["text"].unique(), columns=["sent"])
    tokenized = [tokenize_with_spans(text) for text in frame["sent"]]
    frame['tokens'] = [tokens for tokens, _ in tokenized]
    frame['span_list'] = [spans for _, spans in tokenized]
    return frame


train_data_text_unp = build_text_frame(DNER_train_df)
test_data_text_unp = build_text_frame(DNER_test_df)
dev_data_text_unp = build_text_frame(DNER_dev_df)

In [None]:
def get_items(source_df, item="entity"):
    """Collect the values of column ``item`` grouped by the ``text`` column.

    Args:
        source_df: DataFrame with a ``text`` column and an ``item`` column.
        item: name of the column whose values are collected.

    Returns:
        A list with one inner list per distinct text. Groups are ordered by
        first appearance of the text (the order of ``source_df["text"].unique()``)
        and values keep their row order within each group.
    """
    # Single pass with an insertion-ordered dict, instead of re-filtering the
    # whole frame with a boolean mask once per unique text.
    items_by_text = {}
    for text, value in zip(source_df["text"], source_df[item]):
        items_by_text.setdefault(text, []).append(value)
    return list(items_by_text.values())

In [None]:
def extend_df(df, source_d=DNER_train_df):
    """Attach per-text annotation lists from ``source_d`` to ``df`` in place.

    Adds the columns ``entities``, ``names``, ``loc`` and ``name_ids``, each
    filled with the matching ``get_items`` result, and returns the same ``df``.

    The lists are assigned positionally, so ``df`` is expected to have one row
    per unique ``source_d["text"]`` in the same order. ``source_d`` defaults to
    the training frame; pass it explicitly when ``df`` comes from another split.
    """
    column_to_item = (
        ('entities', "entity"),
        ('names', "name"),
        ('loc', "loc"),
        ('name_ids', "name_id"),
    )
    for column, source_column in column_to_item:
        df[column] = get_items(source_d, item=source_column)
    return df

In [None]:
# Attach annotations to each unique-text frame, then build ent_loc: a dict
# mapping every (xloc, yloc) pair of a text to its entity label.
# extend_df defaults source_d to DNER_train_df, so each split must pass its OWN
# annotation frame. Without it the test and dev texts silently received the
# training annotations.
train_data_text_unp = extend_df(train_data_text_unp, source_d=DNER_train_df)
train_data_text_unp["ent_loc"] = train_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

test_data_text_unp = extend_df(test_data_text_unp, source_d=DNER_test_df)
test_data_text_unp["ent_loc"] = test_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

dev_data_text_unp = extend_df(dev_data_text_unp, source_d=DNER_dev_df)
dev_data_text_unp["ent_loc"] = dev_data_text_unp.apply(lambda x: dict(zip(x['loc'], x['entities'])), axis=1)

In [None]:
def _parse_offset(value):
    """Return ``value`` as a non-negative int offset, or None if it is not one.

    Accepts decimal-digit strings and integer-like objects (anything exposing
    ``__index__``, e.g. ``int`` or NumPy integers). Everything else, including
    markers such as "CID", negative numbers and missing values, yields None.
    """
    if isinstance(value, str):
        # isdecimal() only accepts strings that int() can actually parse.
        return int(value) if value.isdecimal() else None
    if isinstance(value, bool) or not hasattr(value, '__index__'):
        return None
    offset = int(value)
    return offset if offset >= 0 else None


def get_iob(span_list, ent_loc):
    """Assign an IOB tag to every token span.

    Args:
        span_list: iterable of ``(start, end)`` character offsets, one per token.
        ent_loc: dict mapping ``(start, end)`` entity offsets to an entity
            label. Offsets may be digit strings or integers; entries whose
            offsets are not valid non-negative integers are ignored.

    Returns:
        A list with one tag per span: ``'B-<label>'`` when the token starts
        exactly at an entity start, ``'I-<label>'`` when the token starts after
        the entity start and ends at or before the entity end, ``'O'``
        otherwise. Entities are checked in dict order and the first match wins.
    """
    # Parse the entity offsets once instead of once per token.
    entities = []
    for (start, end), label in ent_loc.items():
        ent_start, ent_end = _parse_offset(start), _parse_offset(end)
        if ent_start is None or ent_end is None:
            continue
        entities.append((ent_start, ent_end, label))

    iob_list = []
    for start_sp, end_sp in span_list:
        iob_tag = 'O'
        for ent_start, ent_end, label in entities:
            if ent_start == start_sp:
                iob_tag = 'B' + '-' + label
                break
            if ent_start < start_sp and end_sp <= ent_end:
                iob_tag = 'I' + '-' + label
                break
        iob_list.append(iob_tag)
    return iob_list

In [None]:
# Tag every token span of every split, storing the result in an "iob_tags"
# column on the unique-text frames.
for text_frame in (train_data_text_unp, test_data_text_unp, dev_data_text_unp):
    text_frame["iob_tags"] = text_frame.apply(
        lambda row: get_iob(row['span_list'], row['ent_loc']),
        axis=1,
    )

# Keep only the two columns that make up the exported dataset.
iob_train, iob_test, iob_dev = (
    text_frame[['tokens', 'iob_tags']]
    for text_frame in (train_data_text_unp, test_data_text_unp, dev_data_text_unp)
)

In [None]:
# Write each split to a CSV file in the current working directory, using
# to_csv's default settings.
csv_outputs = (
    ('iob_train.csv', iob_train),
    ('iob_test.csv', iob_test),
    ('iob_dev.csv', iob_dev),
)
for csv_name, iob_frame in csv_outputs:
    iob_frame.to_csv(csv_name)

In [None]:
# Display the dev-split tokens / IOB-tags frame as a visual sanity check.
iob_dev

Unnamed: 0,tokens,iob_tags
0,"[Tricuspid, valve, regurgitation, and, lithium...","[B-Chemical, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[Phenobarbital-induced, dyskinesia, in, a, neu...","[B-Chemical, I-Disease, O, O, O, O, O, O, O, O..."
2,"[Acute, changes, of, blood, ammonia, may, pred...","[B-Chemical, I-Chemical, O, O, O, O, O, I-Dise..."
3,"[Effects, of, calcitonin, on, rat, extrapyrami...","[B-Chemical, I-Chemical, I-Chemical, I-Chemica..."
4,"[Development, of, isoproterenol-induced, cardi...","[O, O, O, O, O, I-Chemical, O, I-Disease, I-Di..."
...,...,...
495,"[Reversible, cerebral, lesions, associated, wi...","[B-Disease, I-Disease, O, O, I-Chemical, O, I-..."
496,"[Antagonism, of, diazepam-induced, sedative, e...","[O, O, O, O, O, O, O, I-Disease, O, O, O, O, O..."
497,"[Enhanced, stimulus-induced, neurotransmitter,...","[B-Chemical, O, O, O, O, O, O, O, O, O, O, O, ..."
498,"[Ocular, manifestations, of, juvenile, rheumat...","[O, O, O, O, O, B-Chemical, O, O, O, O, O, O, ..."
