## Entities and pair-relationships datasets

In [268]:
## imports

# data parsing and processing
import pandas as pd
import numpy as np
import os
import glob
import random
import re
import xml.etree.ElementTree as ET
import json

# ML learning
from sklearn.model_selection import train_test_split
from transformers import InputExample

# source paths
DATA_PATH = "../data/"

In [314]:
# functions and classes
def parse_pgr_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []
    
    for sentence in root.findall('sentence'):
        text = sentence.attrib['text']
        pairs = []
        
        for pair in sentence.findall('pair'):
            e1_id = pair.attrib['e1']
            e2_id = pair.attrib['e2']
            relation = pair.attrib['relation'] == 'true'
            pairs.append((e1_id, e2_id, relation))
        
        entities = {
            entity.attrib['id']: {
                'text': entity.attrib['text'],
                'type': entity.attrib['type'],
                'ontology_id': entity.attrib['ontology_id']
            }
            for entity in sentence.findall('entity')
        }
        
        for e1_id, e2_id, relation in pairs:
            data.append({
                'sentence': text,
                'entity1': entities[e1_id],
                'entity2': entities[e2_id],
                'relation': relation,
            })
    
    return data

def parse_ddi_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []

    filename = os.path.splitext(os.path.basename(file_path))[0]
    
    for sentence in root.findall('.//sentence'):
        text = sentence.attrib['text']
        text_id = sentence.attrib['id']
        pairs = []
        
        for pair in sentence.findall('.//pair'):
            e1_id = pair.attrib['e1']
            e2_id = pair.attrib['e2']
            ddi = pair.attrib['ddi'] == 'true'
            ddi_type = pair.attrib.get('type', 'unknown')
            pairs.append((e1_id, e2_id, ddi, ddi_type))
        
        entities = {}
        for entity in sentence.findall('.//entity'):
            char_offset_str = entity.attrib['charOffset']
            entity_spans = char_offset_str.split(';')
            spans = []
            for span in entity_spans:
                start, end = map(int, span.split('-'))
                spans.append((start, end))
            if spans:
                entities[entity.attrib['id']] = {
                    'text': entity.attrib['text'],
                    'type': entity.attrib['type'],
                    'char_offset': spans
                }
            else:
                entities[entity.attrib['id']] = {
                    'text': entity.attrib['text'],
                    'type': entity.attrib['type'],
                    'char_offset': None
                }
        
        for e1_id, e2_id, ddi, ddi_type in pairs:
            data.append({
                'filename': filename,
                'sentence': text,
                'sent_id': text_id,
                'entity1': {
                    'text': entities[e1_id]['text'],
                    'type': entities[e1_id]['type'],
                    'char_offset': entities[e1_id]['char_offset']
                },
                'entity2': {
                    'text': entities[e2_id]['text'],
                    'type': entities[e2_id]['type'],
                    'char_offset': entities[e2_id]['char_offset']
                },
                'ddi': ddi,
                'type': ddi_type,
                'all_ents': entities
            })
    
    return data


def save_data(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f)

def preprocess_data(*corpus_paths, parse_function):
    """
    Preprocess corpora of XML files for entity extraction with provided paths and parsing function.
    Returns a list of sentences with entity/relationship pairs
    """
    data = []
    for corpus_path in corpus_paths:
        for file in os.listdir(corpus_path):
            if file.endswith('.xml'):
                file_path = os.path.join(corpus_path, file)
                data.extend(parse_function(file_path))
    
    return data

def create_train_val_split(data, val_split=0.1):
    """
    shuffles and splits the data 
    """
    random.shuffle(data)
    val_size = int(len(data) * val_split)
    train_data = data[:-val_size]
    val_data = data[-val_size:]
    return train_data, val_data

def extract_ner_data(data):
    ner_data = []
    for instance in data:
        input_data = {}
        input_data['sentence'] = instance['sentence']
        input_data['entities'] = []
        for value in instance['all_ents'].values():
            input_data['entities'].append(value)
        ner_data.append(input_data)
    return ner_data


def generate_conll_format(sentence, entities):
    # Initialize an empty list to store token labels
    labels = ['O'] * len(sentence)

    # Assign labels using character offsets
    for entity in entities:
        span = entity['char_offset']
        for start, end in span:
            if start < len(sentence) and end <= len(sentence):
                labels[start] = f"B-{entity['type']}"
                for i in range(start + 1, end):
                    labels[i] = f"I-{entity['type']}"

    # Tokenize the sentence using regex to split on whitespace or special characters
    pattern = r"(\w+|\S)"
    tokens = [match.group() for match in re.finditer(pattern, sentence)]

    # Combine tokens and labels into the CoNLL format
    conll_format = {}
    conll_format['tokens'] = []
    conll_format['tags'] = []
    token_start = 0
    for token in tokens:
        token_start = sentence.find(token, token_start)
        token_end = token_start + len(token) - 1
        token_label = labels[token_start:token_end+1]
        
        # Find the main label by checking if there are any "B-" or "I-" labels in the token_label list
        main_label = next((label for label in token_label if label.startswith("B-") or label.startswith("I-")), 'O')
        
        #conll_format += f"{token} {main_label}\n"
        conll_format['tokens'].append(token)
        conll_format['tags'].append(main_label)
        token_start = token_end + 1

    return conll_format

class REDataset(Dataset): # Relation Extraction
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        sentence = item['sentence']
        entity1 = item['entity1']
        entity2 = item['entity2']
        relation = int(item['relation'])

        # Tokenize the text, add special tokens [CLS] and [SEP]
        inputs = self.tokenizer.encode_plus(
            f"{entity1} [SEP] {entity2} [SEP] {sentence}",
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(relation, dtype=torch.long),
        }
def tokenize_and_preserve_labels(example, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword.
    """

    tokenized_sentence = []
    labels = []

    tokens = example["tokens"]
    text_labels = example["tags"]

    if len(tokens) != len(text_labels):
        raise ValueError("Length of tokens and text_labels should be the same")

    for word, label in zip(tokens, text_labels):

        # tokenize word and count # of subword tokens
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # add tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # add label and multiply by subword length
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels    

In [270]:
CORPORA_PATH = "raw/pgr-crowd/corpora/"

# Preprocess training data
train_folder = DATA_PATH + CORPORA_PATH + "amazon_train/"
train_data = preprocess_data(train_folder, parse_function=parse_pgr_xml)

# Preprocess test datapg
test_folder = DATA_PATH + CORPORA_PATH + "consensus_test/"
test_data = preprocess_data(test_folder, parse_function=parse_pgr_xml)

# Split the data into training and validation sets (80% training, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
len(train_data), len(val_data), len(test_data)

(3602, 901, 1792)

In [272]:
SAVE_PATH = DATA_PATH + "preprocessed/pgr-crowd/"

# save the data to files
save_data(train_data, SAVE_PATH + 'train.json')
save_data(test_data, SAVE_PATH + 'test.json')
save_data(val_data, SAVE_PATH + 'val.json')

In [55]:
# Hyperparameters
batch_size = 16
num_epochs = 3
learning_rate = 2e-5
max_length = 128

# Load pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Dataset and DataLoader
train_dataset = REDataset(train_data, tokenizer, max_length)
val_dataset = REDataset(val_data, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    total_eval_accuracy = 0
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask = attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
        total_eval_loss += loss.item()
        
        preds = torch.argmax(logits, dim=1).detach.cpu().numpy()
        true_labels = labels.detach().cpu().numpy()
        total_eval_accuray += (preds == true_labels).sum()
        
    arg_val_loss = total_eval_loss / len(val_dataloader)
    arg_val_accuracy = total_eval_accuracy / len(val_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}: Val Loss = {avg_val_loss:.4f}, Val Acc = {avg_val_accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained("bert_relation_extraction")
tokenizer.save_pretrained("bert_relation_extraction")

{'sentence': 'Genome-wide association studies have identified loci at 15q25 (iron ion transport) and 4q22 (regulation of small GTPase mediated signal transduction), associated with lung cancer (LC) and chronic obstructive pulmonary disease (intracellular protein transport).',
 'entity1': 'regulation of small GTPase mediated signal transduction',
 'entity2': 'cancer',
 'relation': True}

# DDI DrugBank

In [273]:
CORPORA_PATH = "raw/ddi-corpus/APIforDDICorpus/DDICorpus/"

# Preprocess training data
train_folder_drugbank = DATA_PATH + CORPORA_PATH + "Train/DrugBank/"
train_folder_medline = DATA_PATH + CORPORA_PATH + "Train/Medline/"
train_data = preprocess_data(train_folder_drugbank, train_folder_medline, parse_function = parse_ddi_xml)

# # Preprocess test data
test_folder_drugbank = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/DrugBank/"
test_folder_medline = DATA_PATH + CORPORA_PATH + "Test/Test for DrugNER task/MedLine/"

# Preprocess test data
test_data = preprocess_data(test_folder_drugbank, test_folder_medline, parse_function=parse_ddi_xml)
# # Split the data into training and validation sets (80% training, 20% validation)
train_data, val_data = create_train_val_split(train_data)
len(train_data), len(val_data), len(test_data)

(25013, 2779, 941)

In [280]:
for data in train_data[:10]:
    print(data)

{'filename': 'Duloxetine_ddi', 'sentence': 'Therefore, co-administration of Duloxetine with other drugs that are extensively metabolized by this isozyme and which have a narrow therapeutic index, including certain antidepressants (tricyclic antidepressants [TCAs], such as nortriptyline, amitriptyline, and imipramine), phenothiazines and Type 1C antiarrhythmics (e.g., propafenone, flecainide), should be approached with caution.', 'sent_id': 'DDI-DrugBank.d548.s9', 'entity1': {'text': 'Duloxetine', 'type': 'drug', 'char_offset': [(32, 41)]}, 'entity2': {'text': 'nortriptyline', 'type': 'drug', 'char_offset': [(229, 241)]}, 'ddi': True, 'type': 'advise', 'all_ents': {'DDI-DrugBank.d548.s9.e0': {'text': 'Duloxetine', 'type': 'drug', 'char_offset': [(32, 41)]}, 'DDI-DrugBank.d548.s9.e1': {'text': 'antidepressants', 'type': 'group', 'char_offset': [(170, 184)]}, 'DDI-DrugBank.d548.s9.e2': {'text': 'tricyclic antidepressants', 'type': 'group', 'char_offset': [(187, 211)]}, 'DDI-DrugBank.d548.

In [324]:
conll_sents_train = []
ner_data = extract_ner_data(train_data)
for data in ner_data:
    conll_sents_train.append(generate_conll_format(data['sentence'], data['entities']))
    
conll_sents_dev = []
ner_data = extract_ner_data(val_data)
for data in ner_data:
    conll_sents_dev.append(generate_conll_format(data['sentence'], data['entities']))
    
conll_sents_test = []
ner_data = extract_ner_data(test_data)
for data in ner_data:
    conll_sents_test.append(generate_conll_format(data['sentence'], data['entities']))

In [325]:
len(conll_sents_train), len(conll_sents_dev), len(conll_sents_test)

(25013, 2779, 941)

In [326]:
SAVE_PATH = DATA_PATH + "preprocessed/ddi-corpus/drug_ner/"
# save the data to files
save_data(conll_sents_train, SAVE_PATH + 'train.json')
save_data(conll_sents_test, SAVE_PATH + 'test.json')
save_data(conll_sents_dev, SAVE_PATH + 'dev.json')

In [309]:
conll_sents_train[0]

{'tokens': ['Therefore',
  ',',
  'co',
  '-',
  'administration',
  'of',
  'Duloxetine',
  'with',
  'other',
  'drugs',
  'that',
  'are',
  'extensively',
  'metabolized',
  'by',
  'this',
  'isozyme',
  'and',
  'which',
  'have',
  'a',
  'narrow',
  'therapeutic',
  'index',
  ',',
  'including',
  'certain',
  'antidepressants',
  '(',
  'tricyclic',
  'antidepressants',
  '[',
  'TCAs',
  ']',
  ',',
  'such',
  'as',
  'nortriptyline',
  ',',
  'amitriptyline',
  ',',
  'and',
  'imipramine',
  ')',
  ',',
  'phenothiazines',
  'and',
  'Type',
  '1C',
  'antiarrhythmics',
  '(',
  'e',
  '.',
  'g',
  '.',
  ',',
  'propafenone',
  ',',
  'flecainide',
  ')',
  ',',
  'should',
  'be',
  'approached',
  'with',
  'caution',
  '.'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-group',
  'O',
  'B-group',
  'I-group'

In [312]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

#model_checkpoint = "path/to/your/fine-tuned/scibert/model"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased')

# text = "Your text from the DDI corpus"
# inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
# outputs = model(**inputs)
# predictions = torch.argmax(outputs.logits, dim=-1)

# # Convert token ids back to tokens and labels
# tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
# label_ids = predictions.squeeze().tolist()
# labels = [model.config.id2label[label_id] for label_id in label_ids]

# # Post-process to obtain the final entity predictions
# for token, label in zip(tokens, labels):
#     if label != "O":
#         print(f"{token} - {label}")


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/tylerpoore/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncase

In [319]:
train_data[0]

{'filename': 'Duloxetine_ddi',
 'sentence': 'Therefore, co-administration of Duloxetine with other drugs that are extensively metabolized by this isozyme and which have a narrow therapeutic index, including certain antidepressants (tricyclic antidepressants [TCAs], such as nortriptyline, amitriptyline, and imipramine), phenothiazines and Type 1C antiarrhythmics (e.g., propafenone, flecainide), should be approached with caution.',
 'sent_id': 'DDI-DrugBank.d548.s9',
 'entity1': {'text': 'Duloxetine', 'type': 'drug', 'char_offset': [(32, 41)]},
 'entity2': {'text': 'nortriptyline',
  'type': 'drug',
  'char_offset': [(229, 241)]},
 'ddi': True,
 'type': 'advise',
 'all_ents': {'DDI-DrugBank.d548.s9.e0': {'text': 'Duloxetine',
   'type': 'drug',
   'char_offset': [(32, 41)]},
  'DDI-DrugBank.d548.s9.e1': {'text': 'antidepressants',
   'type': 'group',
   'char_offset': [(170, 184)]},
  'DDI-DrugBank.d548.s9.e2': {'text': 'tricyclic antidepressants',
   'type': 'group',
   'char_offset': [

In [322]:
data = conll_sents_train[0]

{'tokens': ['Therefore',
  ',',
  'co',
  '-',
  'administration',
  'of',
  'Duloxetine',
  'with',
  'other',
  'drugs',
  'that',
  'are',
  'extensively',
  'metabolized',
  'by',
  'this',
  'isozyme',
  'and',
  'which',
  'have',
  'a',
  'narrow',
  'therapeutic',
  'index',
  ',',
  'including',
  'certain',
  'antidepressants',
  '(',
  'tricyclic',
  'antidepressants',
  '[',
  'TCAs',
  ']',
  ',',
  'such',
  'as',
  'nortriptyline',
  ',',
  'amitriptyline',
  ',',
  'and',
  'imipramine',
  ')',
  ',',
  'phenothiazines',
  'and',
  'Type',
  '1C',
  'antiarrhythmics',
  '(',
  'e',
  '.',
  'g',
  '.',
  ',',
  'propafenone',
  ',',
  'flecainide',
  ')',
  ',',
  'should',
  'be',
  'approached',
  'with',
  'caution',
  '.'],
 'tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-group',
  'O',
  'B-group',
  'I-group'

In [323]:
tokens, tags = tokenize_and_preserve_labels(conll_sents_train[0], tokenizer)
for i in range(len(tokens)):
    print(tokens[i], tags[i])

therefore O
, O
co O
- O
administration O
of O
du B-drug
##lo B-drug
##x B-drug
##eti B-drug
##ne B-drug
with O
other O
drugs O
that O
are O
extensively O
meta O
##bol O
##ized O
by O
this O
iso O
##zy O
##me O
and O
which O
have O
a O
narrow O
therapeutic O
index O
, O
including O
certain O
anti B-group
##de B-group
##press B-group
##ants B-group
( O
tri B-group
##cy B-group
##cl B-group
##ic B-group
anti I-group
##de I-group
##press I-group
##ants I-group
[ O
tc B-group
##as B-group
] O
, O
such O
as O
nor B-drug
##trip B-drug
##ty B-drug
##line B-drug
, O
ami B-drug
##trip B-drug
##ty B-drug
##line B-drug
, O
and O
im B-drug
##ip B-drug
##ram B-drug
##ine B-drug
) O
, O
ph B-group
##eno B-group
##thi B-group
##azi B-group
##nes B-group
and O
type B-group
1 I-group
##c I-group
anti I-group
##ar I-group
##rh I-group
##yt I-group
##hmi I-group
##cs I-group
( O
e O
. O
g O
. O
, O
prop B-drug
##af B-drug
##eno B-drug
##ne B-drug
, O
fl B-drug
##eca B-drug
##ini B-drug
##de B-drug
) O
, 

In [239]:
import re

regex_pattern = r"(\w+|\W)"

sentence = "This is an example sentence! It contains words and non-word characters."

result = re.findall(regex_pattern, sentence)

print(result)

['This', ' ', 'is', ' ', 'an', ' ', 'example', ' ', 'sentence', '!', ' ', 'It', ' ', 'contains', ' ', 'words', ' ', 'and', ' ', 'non', '-', 'word', ' ', 'characters', '.']


In [261]:
test_input = {'filename': 'Clonazepam_ddi',
  'sentence': 'Pharmacodynamic Interactions: The CNS-depressant action of the benzodiazepine class of drugs may be potentiated by alcohol, narcotics, barbiturates, nonbarbiturate hypnotics, antianxiety agents, the phenothiazines, thioxanthene and butyrophenone classes of antipsychotic agents, monoamine oxidase inhibitors and the tricyclic antidepressants, and by other anticonvulsant drugs.',
  'sent_id': 'DDI-DrugBank.d333.s7',
  'entity1': {'text': 'nonbarbiturate hypnotics',
   'type': 'group',
   'char_offset': [(149, 172)]},
  'entity2': {'text': 'anticonvulsant drugs',
   'type': 'group',
   'char_offset': [(356, 375)]},
  'ddi': False,
  'type': 'unknown',
  'all_ents': {'DDI-DrugBank.d333.s7.e0': {'text': 'benzodiazepine class',
    'type': 'group',
    'char_offset': [(63, 82)]},
   'DDI-DrugBank.d333.s7.e1': {'text': 'alcohol',
    'type': 'drug',
    'char_offset': [(115, 121)]},
   'DDI-DrugBank.d333.s7.e2': {'text': 'narcotics',
    'type': 'group',
    'char_offset': [(124, 132)]},
   'DDI-DrugBank.d333.s7.e3': {'text': 'barbiturates',
    'type': 'group',
    'char_offset': [(135, 146)]},
   'DDI-DrugBank.d333.s7.e4': {'text': 'nonbarbiturate hypnotics',
    'type': 'group',
    'char_offset': [(149, 172)]},
   'DDI-DrugBank.d333.s7.e5': {'text': 'antianxiety agents',
    'type': 'group',
    'char_offset': [(175, 192)]},
   'DDI-DrugBank.d333.s7.e6': {'text': 'phenothiazines classes of antipsychotic agents',
    'type': 'group',
    'char_offset': [(199, 212), (246, 276)]},
   'DDI-DrugBank.d333.s7.e7': {'text': 'thioxanthene classes of antipsychotic agents',
    'type': 'group',
    'char_offset': [(215, 226), (246, 276)]},
   'DDI-DrugBank.d333.s7.e8': {'text': 'butyrophenone classes of antipsychotic agents',
    'type': 'group',
    'char_offset': [(232, 276)]},
   'DDI-DrugBank.d333.s7.e9': {'text': 'monoamine oxidase inhibitors',
    'type': 'group',
    'char_offset': [(279, 306)]},
   'DDI-DrugBank.d333.s7.e10': {'text': 'tricyclic antidepressants',
    'type': 'group',
    'char_offset': [(316, 340)]},
   'DDI-DrugBank.d333.s7.e11': {'text': 'anticonvulsant drugs',
    'type': 'group',
    'char_offset': [(356, 375)]}}}

In [134]:
preprocess_for_transformer([{'filename': 'Hydrochlorothiazide_ddi',
  'sentence': 'Non-steroidal Anti-inflammatory Drugs: In some patients, the administration of a non-steroidal anti-inflammatory agent can reduce the diuretic, natriuretic, and antihypertensive effects of loop, potassium-sparing and thiazide diuretics.',
  'sent_id': 'DDI-DrugBank.d162.s12',
  'entity1': {'text': 'loop diuretics',
   'type': 'group',
   'char_offset': [(189, 192), (226, 234)]},
  'entity2': {'text': 'thiazide diuretics',
   'type': 'group',
   'char_offset': [(217, 234)]},
  'ddi': False,
  'type': 'unknown'}])

TypeError: __init__() got an unexpected keyword argument 'data'

In [112]:
filenames = []
sentences = []
entity1 = []
entity2 = []
for file in train_data:
    filenames.append(file['filename'])
    sentences.append(file['sentence'])
    entity1.append(file['entity1'])
    entity2.append(file['entity2'])
    
for file in val_data:
    filenames.append(file['filename'])
    sentences.append(file['sentence'])
    entity1.append(file['entity1'])
    entity2.append(file['entity2'])
    
# for file in test_data:
#     filenames.append(file['filename'])
#     sentences.append(file['sentence'])
#     entity1.append(file['entity1'])
#     entity2.append(file['entity2'])

In [113]:
df = pd.DataFrame({
    'filename':filenames,
    'sentence':sentences,
    'ent_1':entity1,

    'ent_2':entity2,

    })

In [114]:
df[['ent_1_text', 'ent_1_type', 'ent_1_offset']] = df['ent_1'].apply(lambda x: extract_entity_info(x))
df[['ent_2_text', 'ent_2_type', 'ent_2_offset']] = df['ent_2'].apply(lambda x: extract_entity_info(x))

In [115]:
df[['ent_1_type', 'ent_2_type']].value_counts()

ent_1_type  ent_2_type
drug        drug          13043
group       drug           3646
            group          2961
drug        group          2900
brand       drug           1479
drug        brand          1162
brand       brand           802
group       brand           487
brand       group           472
drug        drug_n          288
drug_n      drug_n          234
            drug            224
group       drug_n           40
drug_n      group            39
brand       drug_n           10
drug_n      brand             5
dtype: int64

In [116]:
df[df['ent_1_offset'].apply(len) > 1]

Unnamed: 0,filename,sentence,ent_1,ent_2,ent_1_text,ent_1_type,ent_1_offset,ent_2_text,ent_2_type,ent_2_offset
782,Hydrochlorothiazide_ddi,Non-steroidal Anti-inflammatory Drugs: In some...,"{'text': 'loop diuretics', 'type': 'group', 'c...","{'text': 'thiazide diuretics', 'type': 'group'...",loop diuretics,group,"[(189, 192), (226, 234)]",thiazide diuretics,group,"[(217, 234)]"
935,Indomethacin_ddi,"In some patients, the administration of INDOCI...","{'text': 'loop diuretics', 'type': 'group', 'c...","{'text': 'potassium-sparing diuretics', 'type'...",loop diuretics,group,"[(118, 121), (156, 164)]",potassium-sparing diuretics,group,"[(124, 140), (156, 164)]"
2670,Clonazepam_ddi,Pharmacodynamic Interactions: The CNS-depressa...,{'text': 'phenothiazines classes of antipsycho...,"{'text': 'tricyclic antidepressants', 'type': ...",phenothiazines classes of antipsychotic agents,group,"[(199, 212), (246, 276)]",tricyclic antidepressants,group,"[(316, 340)]"
2882,Clonazepam_ddi,Pharmacodynamic Interactions: The CNS-depressa...,{'text': 'thioxanthene classes of antipsychoti...,"{'text': 'monoamine oxidase inhibitors', 'type...",thioxanthene classes of antipsychotic agents,group,"[(215, 226), (246, 276)]",monoamine oxidase inhibitors,group,"[(279, 306)]"
3456,11121884,The percentage of neurons hyperpolarized by mu...,"{'text': 'mu-selective opioids', 'type': 'grou...","{'text': 'delta(1)-selective opioids', 'type':...",mu-selective opioids,group,"[(44, 46), (70, 86)]",delta(1)-selective opioids,group,"[(49, 57), (70, 86)]"
...,...,...,...,...,...,...,...,...,...,...
24561,Dexamethasone_ddi,Vaccines: Patients on corticosteroid therapy m...,"{'text': 'live vaccines', 'type': 'group', 'ch...","{'text': 'inactivated vaccines', 'type': 'grou...",live vaccines,group,"[(94, 97), (114, 121)]",inactivated vaccines,group,"[(102, 121)]"
24687,Dofetilide_ddi,If a patient requires TIKOSYN and anti-ulcer t...,"{'text': 'aluminum hydroxide', 'type': 'drug',...","{'text': 'magnesium hydroxide', 'type': 'drug'...",aluminum hydroxide,drug,"[(112, 119), (135, 143)]",magnesium hydroxide,drug,"[(125, 143)]"
25933,Clonazepam_ddi,Pharmacodynamic Interactions: The CNS-depressa...,{'text': 'phenothiazines classes of antipsycho...,{'text': 'butyrophenone classes of antipsychot...,phenothiazines classes of antipsychotic agents,group,"[(199, 212), (246, 276)]",butyrophenone classes of antipsychotic agents,group,"[(232, 276)]"
26942,Hydroflumethiazide_ddi,"(In some patients, the steroidal anti-inflamma...","{'text': 'loop diuretics', 'type': 'group', 'c...","{'text': 'thiazide diuretics', 'type': 'group'...",loop diuretics,group,"[(127, 130), (165, 173)]",thiazide diuretics,group,"[(156, 173)]"


In [34]:
df['ent_2_offset'].str.split('-')

0          [70, 73]
1        [326, 342]
2          [34, 45]
3        [413, 423]
4          [73, 80]
            ...    
28728    [134, 138]
28729    [134, 138]
28730      [39, 43]
28731      [46, 49]
28732      [46, 49]
Name: ent_2_offset, Length: 28733, dtype: object

In [14]:
# Convert list of dictionaries to DataFrame
df = pd.DataFrame(train_data)

# Flatten entity1 and entity2 dictionaries
df = pd.concat([df.drop(['entity1'], axis=1), df['entity1'].apply(pd.Series)], axis=1)
df = pd.concat([df.drop(['entity2'], axis=1), df['entity2'].apply(pd.Series)], axis=1)

# Reorder columns
df = df[['filename', 'sentence', 'text_x', 'type_x', 'char_offset_x', 'text_y', 'type_y', 'char_offset_y', 'ddi', 'type']]
df.columns = ['filename', 'sentence', 'entity1', 'entity1_type', 'entity1_char_offset', 'entity2', 'entity2_type', 'entity2_char_offset', 'ddi', 'ddi_type']

# Print DataFrame
print(df)

KeyError: "['text_x', 'type_x', 'char_offset_x', 'text_y', 'type_y', 'char_offset_y'] not in index"