In [190]:
# imports

# data parsing and processing
import pandas as pd
import numpy as np
import copy
import os
import glob
import random
import re
import xml.etree.ElementTree as ET
import json

# source paths
# Root data directory, relative to the kernel's working directory. Later cells
# build corpus folders by plain string concatenation (DATA_PATH + CORPORA_PATH + ...),
# so the trailing "/" is required.
DATA_PATH = "../data/"

In [163]:
def parse_ddi_xml(file_path):
    """
    Parse one DDI-corpus XML file into a flat list of candidate-pair records.

    One record is produced per <pair> element; sentences that contain no
    <pair> therefore contribute nothing to the result.

    Args:
        file_path: path to the XML document.

    Returns:
        A list of dicts with the keys 'filename' (base name without extension),
        'sentence', 'sent_id', 'entity1', 'entity2', 'ddi' (bool),
        'type' ('unknown' when the pair carries no type attribute) and
        'all_ents' (every entity of the sentence, keyed by entity id).
        Entity dicts hold 'text', 'type' and 'char_offset', the latter being a
        list of (start, end) int tuples, one per ';'-separated span.
    """
    document_root = ET.parse(file_path).getroot()
    document_name = os.path.splitext(os.path.basename(file_path))[0]
    entity_fields = ('text', 'type', 'char_offset')
    records = []

    for sentence_node in document_root.findall('.//sentence'):
        sentence_text = sentence_node.attrib['text']
        sentence_id = sentence_node.attrib['id']

        # Candidate pairs: (first entity id, second entity id, interacts?, relation type).
        candidate_pairs = [
            (
                pair_node.attrib['e1'],
                pair_node.attrib['e2'],
                pair_node.attrib['ddi'] == 'true',
                pair_node.attrib.get('type', 'unknown'),
            )
            for pair_node in sentence_node.findall('.//pair')
        ]

        # Every entity of the sentence, keyed by its id.
        entities = {}
        for entity_node in sentence_node.findall('.//entity'):
            spans = []
            # Discontinuous entities list several "start-end" chunks separated by ';'.
            for chunk in entity_node.attrib['charOffset'].split(';'):
                first_char, last_char = (int(part) for part in chunk.split('-'))
                spans.append((first_char, last_char))
            entity_record = {
                'text': entity_node.attrib['text'],
                'type': entity_node.attrib['type'],
                'char_offset': spans or None,
            }
            entities[entity_node.attrib['id']] = entity_record

        for first_id, second_id, interacts, relation_type in candidate_pairs:
            first_entity = {field: entities[first_id][field] for field in entity_fields}
            second_entity = {field: entities[second_id][field] for field in entity_fields}
            records.append({
                'filename': document_name,
                'sentence': sentence_text,
                'sent_id': sentence_id,
                'entity1': first_entity,
                'entity2': second_entity,
                'ddi': interacts,
                'type': relation_type,
                # The same dict object is shared by all records of one sentence.
                'all_ents': entities,
            })

    return records

def preprocess_data(*corpus_paths, parse_function):
    """
    Preprocess corpora of XML files for entity extraction with provided paths and parsing function.

    Args:
        *corpus_paths: one or more directories; only their direct children whose
            name ends with '.xml' are parsed (no recursion).
        parse_function: callable taking a file path and returning a list of records.

    Returns:
        A single list with the records of every file, in the order the
        directories were given and, within a directory, in sorted file-name order.
    """
    data = []
    for corpus_path in corpus_paths:
        # os.listdir() yields entries in arbitrary, filesystem-dependent order;
        # sorting makes the resulting record order reproducible across machines.
        for file_name in sorted(os.listdir(corpus_path)):
            if file_name.endswith('.xml'):
                data.extend(parse_function(os.path.join(corpus_path, file_name)))

    return data

def extract_ner_data(input_data):
    """
    Build one NER record per parsed pair instance.

    For an instance with ddi == True, every entity whose char_offset equals
    that of entity1 or entity2 gets its type rewritten to
    "<entity type>-<relation type>"; all other entities are passed through.
    The input is deep-copied first, so the returned entities never alias
    the caller's data.

    Args:
        input_data: list of records as produced by parse_ddi_xml.

    Returns:
        A list of {'sentence': str, 'entities': [entity dict, ...]} dicts,
        one per input instance, in input order.
    """
    ner_records = []

    for instance in copy.deepcopy(input_data):
        record = {'sentence': instance['sentence'], 'entities': []}

        if instance['ddi']:
            first_offset = instance['entity1']['char_offset']
            second_offset = instance['entity2']['char_offset']
        else:
            # Sentinels that no real char_offset list can be equal to.
            first_offset = second_offset = -1

        for entity in instance['all_ents'].values():
            offset = entity['char_offset']
            in_relation = (offset == first_offset) or (offset == second_offset)
            if not in_relation:
                record['entities'].append(entity)
                continue
            # Tag a copy so the entity keeps its key order and only 'type' changes.
            tagged = entity.copy()
            tagged['type'] = entity['type'] + f"-{instance['type']}"
            record['entities'].append(tagged)

        ner_records.append(record)

    return ner_records

def extract_ddi_data(data):
    """
    Reshape parsed pair instances into relation-extraction records.

    Args:
        data: list of records as produced by parse_ddi_xml.

    Returns:
        A list of {'relations': {'entity1', 'entity2', 'ddi', 'type'}, 'sentence'}
        dicts, one per instance. The entity dicts are the instance's own
        objects (not copies).
    """
    relation_keys = ('entity1', 'entity2', 'ddi', 'type')
    ddi_records = []
    for instance in data:
        sentence = instance['sentence']
        relations = {key: instance[key] for key in relation_keys}
        ddi_records.append({'relations': relations, 'sentence': sentence})
    return ddi_records

def create_train_val_split(data, val_split=0.1):
    """
    Shuffle the data and split it into a training and a validation part.

    The shuffle is done on a copy, so the caller's list is left untouched.
    It uses the global `random` state; call random.seed(...) beforehand for a
    reproducible split.

    Args:
        data: sequence of samples.
        val_split: fraction of samples to put into the validation set.

    Returns:
        (train_data, val_data) as two new lists. The validation set holds
        int(len(data) * val_split) samples, clamped to the range [0, len(data)].
    """
    shuffled = list(data)
    random.shuffle(shuffled)

    val_size = int(len(shuffled) * val_split)
    val_size = max(0, min(len(shuffled), val_size))

    # Split on an explicit index: slicing with -val_size breaks for val_size == 0,
    # because data[:-0] is empty and data[-0:] is the whole list.
    split_at = len(shuffled) - val_size
    return shuffled[:split_at], shuffled[split_at:]

# def generate_conll_format(data):
#     # Initialize an empty list to store token labels
#     sentence = data['sentence']
#     entities = data['entities']
#     #relations = data['relationships']
    
#     labels = ['O'] * len(sentence)

#     # Assign labels using character offsets
#     for entity in entities:
#         span = entity['char_offset']
#         for start, end in span:
#             if start < len(sentence) and end <= len(sentence):
#                 labels[start] = f"B-{entity['type']}"
#                 for i in range(start + 1, end):
#                     labels[i] = f"I-{entity['type']}"

#     # Tokenize the sentence using regex to split on whitespace or special characters
#     pattern = r"(\w+|\S)"
#     tokens = [match.group() for match in re.finditer(pattern, sentence)]

#     # Combine tokens and labels into the CoNLL format
#     conll_format = {}
#     conll_format['tokens'] = []
#     conll_format['tags'] = []
#     token_start = 0
#     for token in tokens:
#         token_start = sentence.find(token, token_start)
#         token_end = token_start + len(token) - 1
#         token_label = labels[token_start:token_end+1]
        
#         # Find the main label by checking if there are any "B-" or "I-" labels in the token_label list
#         main_label = next((label for label in token_label if label.startswith("B-") or label.startswith("I-")), 'O')
        
#         #conll_format += f"{token} {main_label}\n"
#         conll_format['tokens'].append(token)
#         conll_format['tags'].append(main_label)
#         token_start = token_end + 1
#     #conll_format['relations'] = relations
#     return conll_format

def generate_conll_format(data, return_labels=False):
    """
    Convert one NER record into token-level BIO tags.

    Character offsets are treated as inclusive on both ends (e.g. (22, 28) for
    the 7-character "TIKOSYN"). Each span is labelled independently, so every
    span of a discontinuous entity starts with its own B- tag, and later
    entities overwrite the labels of earlier ones where they overlap.

    Args:
        data: dict with 'sentence' (str) and 'entities' (list of dicts with
            'type' and 'char_offset', a list of (start, end) tuples or None).
        return_labels: when True, also return the per-character label list.

    Returns:
        {'tokens': [...], 'tags': [...]}, or a tuple of that dict and the
        per-character labels when return_labels is True.
    """
    sentence = data['sentence']
    n_chars = len(sentence)

    # One label per character of the sentence.
    labels = ['O'] * n_chars
    for entity in data['entities']:
        entity_type = entity['type']
        # An entity without offsets (None) contributes no labels.
        for start, end in entity['char_offset'] or ():
            # `end` is inclusive, so it must be a valid index itself; the former
            # `end <= len(sentence)` guard let end == len(sentence) through and
            # raised IndexError. Out-of-range spans are skipped entirely.
            if 0 <= start < n_chars and end < n_chars:
                labels[start] = f"B-{entity_type}"
                for i in range(start + 1, end + 1):
                    labels[i] = f"I-{entity_type}"

    # Tokenize into runs of word characters or single non-space symbols, and
    # give each token the first non-'O' label found among its characters.
    conll_format = {'tokens': [], 'tags': []}
    for match in re.finditer(r"(\w+|\S)", sentence):
        token_labels = labels[match.start():match.end()]
        main_label = next(
            (label for label in token_labels if label.startswith(("B-", "I-"))),
            'O',
        )
        conll_format['tokens'].append(match.group())
        conll_format['tags'].append(main_label)

    if return_labels:
        return conll_format, labels
    return conll_format

def save_data(data, filename):
    """Write `data` to `filename` as JSON, replacing any existing file."""
    with open(filename, mode='w') as handle:
        json.dump(data, fp=handle)

# DDI

In [164]:
# Location of the DDI corpus below DATA_PATH (both end with "/" because the
# folder paths are built by string concatenation).
CORPORA_PATH = "raw/ddi-corpus/APIforDDICorpus/DDICorpus/"

# Parse the training corpora (DrugBank + MedLine) into one list of pair records
train_folder_drugbank = DATA_PATH + CORPORA_PATH + "Train/DrugBank/"
train_folder_medline = DATA_PATH + CORPORA_PATH + "Train/Medline/"
train_data = preprocess_data(train_folder_drugbank, train_folder_medline, parse_function = parse_ddi_xml)

# Test corpora folders (the "Test for DrugNER task" release)
test_folder_drugbank = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/DrugBank/"
test_folder_medline = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/MedLine/"

# Parse the test corpora
# NOTE(review): parse_ddi_xml emits one record per <pair>, so sentences without
# pairs are absent from every split — confirm this is intended for the NER task.
test_data = preprocess_data(test_folder_drugbank, test_folder_medline, parse_function = parse_ddi_xml)
# Shuffle and split off a validation set. The default val_split=0.1 is used,
# i.e. roughly 90% training / 10% validation (not 80/20).
# NOTE(review): no random seed is set in the visible cells, so the split (and
# every sample index used below) changes from run to run.
train_data, val_data = create_train_val_split(train_data)
len(train_data), len(val_data), len(test_data)

(25013, 2779, 941)

In [165]:
# find some samples to check functions
# ddi_samples:        indices of pair records flagged as an interaction
# multi_span_samples: indices of records whose sentence contains at least one
#                     discontinuous (multi-span) entity
ddi_samples = []
multi_span_samples = []
for i, data in enumerate(train_data):
    if data['ddi']:
        ddi_samples.append(i)
    # Record each index at most once, even when several entities of the
    # sentence are multi-span (the per-entity append produced duplicates and
    # inflated the count). Entities without offsets are ignored.
    if any(len(ent['char_offset'] or ()) > 1 for ent in data['all_ents'].values()):
        multi_span_samples.append(i)

len(ddi_samples), len(multi_span_samples)

(3614, 388)

In [166]:
# CoNLL view of the first multi-span sample. It is built on the fly because
# conll_sents_train is only created in a later cell, so indexing it here
# raises NameError after Restart & Run All.
generate_conll_format(extract_ner_data([train_data[multi_span_samples[0]]])[0])

{'tokens': ['Allopurinol',
  ':',
  'The',
  'AUC',
  'of',
  'didanosine',
  'was',
  'increased',
  'about',
  '4',
  '-',
  'fold',
  'when',
  'allopurinol',
  'at',
  '300',
  'mg',
  '/',
  'day',
  'was',
  'coadministered',
  'with',
  'a',
  'single',
  '200',
  '-',
  'mg',
  'dose',
  'of',
  'VIDEX',
  'to',
  'two',
  'patients',
  'with',
  'renal',
  'impairment',
  '(',
  'CLcr',
  '=',
  '15',
  'and',
  '18',
  'mL',
  '/',
  'min',
  ')',
  '.'],
 'tags': ['B-drug',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-brand',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

In [167]:
# NER record of one interacting pair: the two entities of the pair get the
# relation type appended to their entity type ("<type>-<relation>").
extract_ner_data([train_data[ddi_samples[1]]])[0]

{'sentence': 'When other antiplatelet agents or anticoagulants are used concomitantly, there is the potential for FLOLAN to increase the risk of bleeding.',
 'entities': [{'text': 'antiplatelet agents',
   'type': 'group',
   'char_offset': [(11, 29)]},
  {'text': 'anticoagulants',
   'type': 'group-effect',
   'char_offset': [(34, 47)]},
  {'text': 'FLOLAN', 'type': 'brand-effect', 'char_offset': [(100, 105)]}]}

In [168]:
# Raw parsed record of a sample whose sentence contains a multi-span entity
# (an entity whose char_offset lists more than one (start, end) span).
train_data[multi_span_samples[5]]

{'filename': 'Dofetilide_ddi',
 'sentence': 'If a patient requires TIKOSYN and anti-ulcer therapy, it is suggested that omeprazole, ranitidine, or antacids (aluminum and magnesium hydroxides) be used as alternatives to cimetidine, as these agents have no effect on the pharmacokinetic profile of TIKOSYN.',
 'sent_id': 'DDI-DrugBank.d558.s5',
 'entity1': {'text': 'aluminum hydroxide',
  'type': 'drug',
  'char_offset': [(112, 119), (135, 143)]},
 'entity2': {'text': 'magnesium hydroxide',
  'type': 'drug',
  'char_offset': [(125, 143)]},
 'ddi': False,
 'type': 'unknown',
 'all_ents': {'DDI-DrugBank.d558.s5.e0': {'text': 'TIKOSYN',
   'type': 'brand',
   'char_offset': [(22, 28)]},
  'DDI-DrugBank.d558.s5.e1': {'text': 'anti-ulcer',
   'type': 'group',
   'char_offset': [(34, 43)]},
  'DDI-DrugBank.d558.s5.e2': {'text': 'omeprazole',
   'type': 'drug',
   'char_offset': [(75, 84)]},
  'DDI-DrugBank.d558.s5.e3': {'text': 'ranitidine',
   'type': 'drug',
   'char_offset': [(87, 96)]},
  'D

In [169]:
# Same sample after extract_ner_data; entity types only get a relation suffix
# when the record's 'ddi' flag is True.
extract_ner_data([train_data[multi_span_samples[5]]])

[{'sentence': 'If a patient requires TIKOSYN and anti-ulcer therapy, it is suggested that omeprazole, ranitidine, or antacids (aluminum and magnesium hydroxides) be used as alternatives to cimetidine, as these agents have no effect on the pharmacokinetic profile of TIKOSYN.',
  'entities': [{'text': 'TIKOSYN', 'type': 'brand', 'char_offset': [(22, 28)]},
   {'text': 'anti-ulcer', 'type': 'group', 'char_offset': [(34, 43)]},
   {'text': 'omeprazole', 'type': 'drug', 'char_offset': [(75, 84)]},
   {'text': 'ranitidine', 'type': 'drug', 'char_offset': [(87, 96)]},
   {'text': 'antacids', 'type': 'group', 'char_offset': [(102, 109)]},
   {'text': 'aluminum hydroxide',
    'type': 'drug',
    'char_offset': [(112, 119), (135, 143)]},
   {'text': 'magnesium hydroxide',
    'type': 'drug',
    'char_offset': [(125, 143)]},
   {'text': 'cimetidine', 'type': 'drug', 'char_offset': [(174, 183)]},
   {'text': 'TIKOSYN', 'type': 'brand', 'char_offset': [(251, 257)]}]}]

In [170]:
# Convert the sample to CoNLL format and also keep the per-character labels
# (return_labels=True) to check how multi-span entities are labelled.
conll, labels = generate_conll_format(extract_ner_data([train_data[multi_span_samples[5]]])[0], return_labels=True)

In [171]:
# Pair every character of the sentence with its label. Materialised as a list:
# a bare zip() is a one-shot iterator, so the cell that prints `chars` would
# show nothing when it is run a second time.
chars = list(zip(train_data[multi_span_samples[5]]['sentence'], labels))

In [174]:
# Token-level result: parallel 'tokens' and 'tags' lists.
conll

{'tokens': ['If',
  'a',
  'patient',
  'requires',
  'TIKOSYN',
  'and',
  'anti',
  '-',
  'ulcer',
  'therapy',
  ',',
  'it',
  'is',
  'suggested',
  'that',
  'omeprazole',
  ',',
  'ranitidine',
  ',',
  'or',
  'antacids',
  '(',
  'aluminum',
  'and',
  'magnesium',
  'hydroxides',
  ')',
  'be',
  'used',
  'as',
  'alternatives',
  'to',
  'cimetidine',
  ',',
  'as',
  'these',
  'agents',
  'have',
  'no',
  'effect',
  'on',
  'the',
  'pharmacokinetic',
  'profile',
  'of',
  'TIKOSYN',
  '.'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'B-brand',
  'O',
  'B-group',
  'I-group',
  'I-group',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'B-drug',
  'O',
  'O',
  'B-group',
  'O',
  'B-drug',
  'O',
  'B-drug',
  'I-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-brand',
  'O']}

In [173]:
# Character-level view: one sentence character and its label per line.
for char, label in chars:
    print(char, label)

I O
f O
  O
a O
  O
p O
a O
t O
i O
e O
n O
t O
  O
r O
e O
q O
u O
i O
r O
e O
s O
  O
T B-brand
I I-brand
K I-brand
O I-brand
S I-brand
Y I-brand
N I-brand
  O
a O
n O
d O
  O
a B-group
n I-group
t I-group
i I-group
- I-group
u I-group
l I-group
c I-group
e I-group
r I-group
  O
t O
h O
e O
r O
a O
p O
y O
, O
  O
i O
t O
  O
i O
s O
  O
s O
u O
g O
g O
e O
s O
t O
e O
d O
  O
t O
h O
a O
t O
  O
o B-drug
m I-drug
e I-drug
p I-drug
r I-drug
a I-drug
z I-drug
o I-drug
l I-drug
e I-drug
, O
  O
r B-drug
a I-drug
n I-drug
i I-drug
t I-drug
i I-drug
d I-drug
i I-drug
n I-drug
e I-drug
, O
  O
o O
r O
  O
a B-group
n I-group
t I-group
a I-group
c I-group
i I-group
d I-group
s I-group
  O
( O
a B-drug
l I-drug
u I-drug
m I-drug
i I-drug
n I-drug
u I-drug
m I-drug
  O
a O
n O
d O
  O
m B-drug
a I-drug
g I-drug
n I-drug
e I-drug
s I-drug
i I-drug
u I-drug
m I-drug
  I-drug
h I-drug
y I-drug
d I-drug
r I-drug
o I-drug
x I-drug
i I-drug
d I-drug
e I-drug
s O
) O
  O
b O
e O
  O
u O
s O
e O
d O

In [175]:
# Get NER Entities and put them in CoNLL format
# Same two steps for every split: pair records -> NER records -> token/tag dicts.
# Comprehensions replace three copies of the same loop and no longer leak a
# `data` loop variable into the notebook namespace.
ner_data_train = extract_ner_data(train_data)
conll_sents_train = [generate_conll_format(record) for record in ner_data_train]

ner_data_dev = extract_ner_data(val_data)
conll_sents_dev = [generate_conll_format(record) for record in ner_data_dev]

ner_data_test = extract_ner_data(test_data)
conll_sents_test = [generate_conll_format(record) for record in ner_data_test]

In [178]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer, max_length=70):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword.

    Args:
        sentence: list of word-level tokens.
        text_labels: list of labels, parallel to `sentence`.
        tokenizer: object exposing `tokenize(word) -> list[str]` and `pad_token`.
        max_length: minimum output length; shorter sequences are padded with
            `tokenizer.pad_token` / 'O'. Longer sequences are returned as-is
            (they are NOT truncated).

    Returns:
        (tokenized_sentence, labels): two lists of equal length.

    NOTE(review): every word piece inherits the word's label unchanged, so a
    word tagged B-x that splits into several pieces yields several consecutive
    B-x tags. Confirm the downstream model/evaluation expects that.
    """
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
        # Tokenize one word at a time so its label can be repeated per piece.
        word_pieces = tokenizer.tokenize(word)
        tokenized_sentence.extend(word_pieces)
        labels.extend([label] * len(word_pieces))

    # Pad up to max_length; an over-long sequence gets no padding. The two
    # lists always have the same length, so one count serves both.
    n_padding = max(0, max_length - len(tokenized_sentence))
    tokenized_sentence += [tokenizer.pad_token] * n_padding
    labels += ['O'] * n_padding

    # The former input_ids / attention_mask computations were never returned
    # and have been dropped.
    return tokenized_sentence, labels

In [179]:
# NOTE(review): this import sits mid-notebook; it belongs with the other
# imports in the first cell.
from transformers import AutoTokenizer
# Uncased BERT word-piece tokenizer; in the recorded output below it lowercases
# its input (e.g. "TIKOSYN" -> "ti", "##kos", "##yn").
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [181]:
# Token/tag dict of the multi-span sample inspected earlier.
sample = conll_sents_train[multi_span_samples[5]]

In [184]:
# Word-level (token, tag) pairs. Stored as a list because a bare zip() is
# exhausted after the first loop over it, which would make the printing cell
# silently empty on re-run.
tokens_tags = list(zip(sample['tokens'], sample['tags']))

In [185]:
# Word-piece tokenize the sample; with the default max_length=70 the token and
# tag lists are padded to at least 70 entries.
tokenized_tokens, tokenized_tags = tokenize_and_preserve_labels(sample['tokens'], sample['tags'], tokenizer=tokenizer)

In [186]:
# Word-piece-level (token, tag) pairs, as a list so the printing cell can be
# re-run (a bare zip() can only be iterated once).
tokenized_tokens_tags = list(zip(tokenized_tokens, tokenized_tags))

In [187]:
# Word-level tokens with their tags, before word-piece tokenization.
for token, tag in tokens_tags:
    print(token, tag)

If O
a O
patient O
requires O
TIKOSYN B-brand
and O
anti B-group
- I-group
ulcer I-group
therapy O
, O
it O
is O
suggested O
that O
omeprazole B-drug
, O
ranitidine B-drug
, O
or O
antacids B-group
( O
aluminum B-drug
and O
magnesium B-drug
hydroxides I-drug
) O
be O
used O
as O
alternatives O
to O
cimetidine B-drug
, O
as O
these O
agents O
have O
no O
effect O
on O
the O
pharmacokinetic O
profile O
of O
TIKOSYN B-brand
. O


In [189]:
# Word pieces with their inherited tags, including the trailing [PAD] / 'O' padding.
for tokenized_token, tokenized_tag in tokenized_tokens_tags:
    print(tokenized_token, tokenized_tag)

if O
a O
patient O
requires O
ti B-brand
##kos B-brand
##yn B-brand
and O
anti B-group
- I-group
ul I-group
##cer I-group
therapy O
, O
it O
is O
suggested O
that O
om B-drug
##ep B-drug
##raz B-drug
##ole B-drug
, O
rani B-drug
##ti B-drug
##dine B-drug
, O
or O
ant B-group
##ac B-group
##ids B-group
( O
aluminum B-drug
and O
magnesium B-drug
hydro I-drug
##xide I-drug
##s I-drug
) O
be O
used O
as O
alternatives O
to O
ci B-drug
##met B-drug
##idi B-drug
##ne B-drug
, O
as O
these O
agents O
have O
no O
effect O
on O
the O
ph O
##arm O
##aco O
##kin O
##etic O
profile O
of O
ti B-brand
##kos B-brand
##yn B-brand
. O
[PAD] O
[PAD] O
