In [1]:
## imports

# data parsing and processing
import pandas as pd
import numpy as np
import os
import glob
import random
import re
import xml.etree.ElementTree as ET
import json

# Root data directory, given relative to the notebook's working directory.
# Later cells append "raw/..." (input corpora) and "preprocessed/..." (outputs) to it,
# so the trailing slash is required.
DATA_PATH = "../data/"

In [2]:
def parse_ddi_xml(file_path):
    """
    Parse one DDI-corpus XML file into a flat list of drug-pair records.

    One record is produced for every pair element found under a sentence
    element, so a sentence with several pairs appears in several records and
    a sentence without any pair contributes no record at all.

    Args:
        file_path: path of the XML file to read.

    Returns:
        A list of dicts with the keys:
          - 'filename': base name of the file without its extension
          - 'sentence' / 'sent_id': the sentence's 'text' and 'id' attributes
          - 'entity1' / 'entity2': {'text', 'type', 'char_offset'} of the two
            entities referenced by the pair's 'e1' / 'e2' attributes
          - 'ddi': True only when the pair's 'ddi' attribute equals 'true'
          - 'type': the pair's 'type' attribute, or 'unknown' when absent
          - 'all_ents': every entity of the sentence keyed by entity id (the
            same dict object is shared by all records of that sentence)

    Raises:
        KeyError: a required XML attribute is missing, or a pair references
            an entity id that is not declared in the same sentence.
        ValueError: a non-empty charOffset piece is not of the form "start-end".
    """
    root = ET.parse(file_path).getroot()
    filename = os.path.splitext(os.path.basename(file_path))[0]
    data = []

    for sentence in root.findall('.//sentence'):
        text = sentence.attrib['text']
        text_id = sentence.attrib['id']

        pairs = [
            (
                pair.attrib['e1'],
                pair.attrib['e2'],
                pair.attrib['ddi'] == 'true',
                pair.attrib.get('type', 'unknown'),
            )
            for pair in sentence.findall('.//pair')
        ]

        entities = {
            entity.attrib['id']: {
                'text': entity.attrib['text'],
                'type': entity.attrib['type'],
                'char_offset': _parse_char_offset(entity.attrib['charOffset']),
            }
            for entity in sentence.findall('.//entity')
        }

        for e1_id, e2_id, ddi, ddi_type in pairs:
            data.append({
                'filename': filename,
                'sentence': text,
                'sent_id': text_id,
                # shallow copies: same keys/values as the entries in `entities`
                'entity1': dict(entities[e1_id]),
                'entity2': dict(entities[e2_id]),
                'ddi': ddi,
                'type': ddi_type,
                'all_ents': entities
            })

    return data


def _parse_char_offset(char_offset_str):
    """Turn a 'start-end;start-end' string into [(start, end), ...], or None when it holds no span."""
    spans = []
    for piece in char_offset_str.split(';'):
        # str.split never returns an empty list, so blank pieces must be
        # skipped explicitly for the "no span" case to be reachable.
        if not piece.strip():
            continue
        start, end = map(int, piece.split('-'))
        spans.append((start, end))
    return spans or None

def preprocess_data(*corpus_paths, parse_function):
    """
    Preprocess corpora of XML files for entity extraction with provided paths and parsing function.

    Args:
        *corpus_paths: directories to scan (non-recursively) for files whose
            name ends with '.xml'.
        parse_function: callable taking a file path and returning a list of
            records; it is called once per XML file.

    Returns:
        A single list holding the records of every file, directory by
        directory in the order the paths were given, and alphabetically by
        file name within each directory.
    """
    data = []
    for corpus_path in corpus_paths:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # record order (and anything shuffled from it with a seed) reproducible.
        for file_name in sorted(os.listdir(corpus_path)):
            if file_name.endswith('.xml'):
                file_path = os.path.join(corpus_path, file_name)
                data.extend(parse_function(file_path))

    return data

def extract_ner_data(data, deduplicate=True):
    """
    Build NER examples (sentence + its entities) from pair-level records.

    The input holds one record per drug pair, so a sentence with several
    pairs shows up several times with the very same entity set. By default
    each sentence is emitted only once (first occurrence wins, input order
    is kept); pass deduplicate=False to get one example per input record.

    Note that de-duplication happens within the list passed in: if the same
    sentence was distributed over several splits upstream, it will still be
    present once in each of them.

    Args:
        data: iterable of dicts with at least 'sentence' and 'all_ents';
            'filename' and 'sent_id' are used to identify a sentence when
            present, otherwise the sentence text itself is used.
        deduplicate: drop repeated sentences when True.

    Returns:
        A list of {'sentence': str, 'entities': [entity dict, ...]}.
    """
    ner_data = []
    seen = set()
    for instance in data:
        if deduplicate:
            sent_id = instance.get('sent_id')
            if sent_id is not None:
                key = ('id', instance.get('filename'), sent_id)
            else:
                key = ('text', instance['sentence'])
            if key in seen:
                continue
            seen.add(key)
        ner_data.append({
            'sentence': instance['sentence'],
            'entities': list(instance['all_ents'].values()),
        })
    return ner_data

def extract_ddi_data(data):
    """
    Reshape pair-level records into relation-extraction examples.

    Every input record yields one output dict of the form
    {'relations': {'entity1', 'entity2', 'ddi', 'type'}, 'sentence': ...},
    where the values are taken unchanged from the keys of the same name in
    the input record.
    """
    relation_keys = ('entity1', 'entity2', 'ddi', 'type')
    examples = []
    for record in data:
        sentence = record['sentence']
        relations = {key: record[key] for key in relation_keys}
        examples.append({'relations': relations, 'sentence': sentence})
    return examples

def create_train_val_split(data, val_split=0.1, seed=None):
    """
    Shuffle the records and split them into a training and a validation part.

    The caller's list is left untouched; the shuffle is applied to a copy.

    Args:
        data: sequence of records to split.
        val_split: fraction (0..1) of the records that goes to validation;
            the validation size is int(len(data) * val_split).
        seed: optional seed for a private random generator. When None, the
            module-level `random` state is used, so `random.seed(...)` called
            beforehand still controls the result.

    Returns:
        (train_data, val_data) as two new lists.

    Raises:
        ValueError: val_split is outside the range [0, 1].
    """
    if not 0 <= val_split <= 1:
        raise ValueError(f"val_split must be between 0 and 1, got {val_split!r}")

    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    val_size = int(len(shuffled) * val_split)
    # Slice from an explicit index: with negative slicing a validation size of
    # 0 would turn into data[:-0] == [] and data[-0:] == everything.
    split_at = len(shuffled) - val_size
    return shuffled[:split_at], shuffled[split_at:]

def generate_conll_format(sentence, entities):
    """
    Tokenize a sentence and attach one BIO tag per token.

    Args:
        sentence: the raw sentence text.
        entities: iterable of dicts with a 'type' and a 'char_offset', the
            latter being a list of (start, end) character positions or None.
            `end` is treated as the index of the entity's last character
            (inclusive), which is how the corpus offsets are expressed, e.g.
            an 11-character mention at (153, 163).

    Returns:
        {'tokens': [...], 'tags': [...]} with both lists of equal length.
        Tokens are runs of word characters or single non-space characters.
        A token is tagged with the first non-'O' character label it covers,
        so the token holding a span's first character gets 'B-<type>' and
        the following tokens of that span get 'I-<type>'.
    """
    sentence_len = len(sentence)

    # Character-level labels, later collapsed to one label per token
    labels = ['O'] * sentence_len

    for entity in entities:
        # An entity without usable offsets simply contributes no labels
        spans = entity['char_offset'] or ()
        for start, end in spans:
            if start < sentence_len and end <= sentence_len:
                labels[start] = f"B-{entity['type']}"
                # `end` is inclusive; clamp it so an exclusive-style offset
                # equal to len(sentence) cannot index past the last character.
                last = min(end, sentence_len - 1)
                for i in range(start + 1, last + 1):
                    labels[i] = f"I-{entity['type']}"

    conll_format = {'tokens': [], 'tags': []}

    # Split on whitespace, keeping every punctuation mark as its own token;
    # the match positions give each token's character range directly.
    for match in re.finditer(r"(\w+|\S)", sentence):
        token_labels = labels[match.start():match.end()]
        main_label = next(
            (label for label in token_labels if label.startswith(("B-", "I-"))),
            'O',
        )
        conll_format['tokens'].append(match.group())
        conll_format['tags'].append(main_label)

    return conll_format

def save_data(data, filename):
    """
    Serialize `data` as JSON into `filename`, overwriting any existing file.

    Missing parent directories are created first, so saving into a fresh
    output folder does not fail with FileNotFoundError.

    Args:
        data: any object `json.dump` can serialize.
        filename: destination path of the JSON file.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit encoding so the result does not depend on the platform default
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f)

# DDI

In [3]:
CORPORA_PATH = "raw/ddi-corpus/APIforDDICorpus/DDICorpus/"

# Seed the global RNG: create_train_val_split shuffles with the `random`
# module, so without a seed the train/validation membership changes on every run.
SEED = 42
random.seed(SEED)

# Preprocess training data
train_folder_drugbank = DATA_PATH + CORPORA_PATH + "Train/DrugBank/"
train_folder_medline = DATA_PATH + CORPORA_PATH + "Train/Medline/"
train_data_all = preprocess_data(train_folder_drugbank, train_folder_medline, parse_function=parse_ddi_xml)

# Preprocess test data
test_folder_drugbank = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/DrugBank/"
test_folder_medline = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/MedLine/"
test_data = preprocess_data(test_folder_drugbank, test_folder_medline, parse_function=parse_ddi_xml)

# Split the training records into training and validation sets. The default
# val_split of create_train_val_split is 0.1, i.e. roughly 90% / 10%.
# The full set keeps its own name so `train_data` always means "after the split".
train_data, val_data = create_train_val_split(train_data_all)
len(train_data), len(val_data), len(test_data)

(25013, 2779, 941)

In [5]:
# Build the relation-extraction view of the training records and display the
# first one as a sanity check of the record layout.
ddi_train_data = extract_ddi_data(train_data)
ddi_train_data[0]

{'relations': {'entity1': {'text': 'antiretroviral nucleoside analogues',
   'type': 'drug',
   'char_offset': [(67, 101)]},
  'entity2': {'text': 'ethionamide',
   'type': 'drug',
   'char_offset': [(153, 163)]},
  'ddi': False,
  'type': 'unknown'},
 'sentence': 'Drugs that have been associated with peripheral neuropathy include antiretroviral nucleoside analogues, chloramphenicol, cisplatin, dapsone, disulfiram, ethionamide, glutethimide, gold, hydralazine, iodoquinol, isoniazid, metronidazole, nitrofurantoin, phenytoin, ribavirin, and vincristine.'}

In [6]:
# Get NER entities and put them in CoNLL format.
# Every split goes through exactly the same two steps, so they are driven from
# one table instead of three copy-pasted loops that overwrite shared globals.
splits = {'train': train_data, 'dev': val_data, 'test': test_data}
conll_by_split = {
    split_name: [
        generate_conll_format(example['sentence'], example['entities'])
        for example in extract_ner_data(records)
    ]
    for split_name, records in splits.items()
}

conll_sents_train = conll_by_split['train']
conll_sents_dev = conll_by_split['dev']
conll_sents_test = conll_by_split['test']

In [7]:
SAVE_PATH = DATA_PATH + "preprocessed/ddi-corpus/drug_ner/"
# Opening a file for writing inside a missing directory raises
# FileNotFoundError, so make sure the output folder exists on a fresh checkout.
os.makedirs(SAVE_PATH, exist_ok=True)
# save the data to files
save_data(conll_sents_train, SAVE_PATH + 'train.json')
save_data(conll_sents_test, SAVE_PATH + 'test.json')
save_data(conll_sents_dev, SAVE_PATH + 'dev.json')