In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


In [6]:
# Define the training data
train_data = [
    ("Patient has a history of diabetes.", [(0, 7, "PROBLEM")]),
    ("The patient is allergic to penicillin.", [(20, 30, "TREATMENT")]),
    ("The patient has a family history of heart disease.", [(23, 36, "PROBLEM")]),
    ("The patient is taking aspirin for pain relief.", [(20, 26, "TREATMENT")]),
    ("The patient has a history of asthma.", [(0, 7, "PROBLEM")])
]

# Define the labels
labels = {"PROBLEM": 0, "TREATMENT": 1}


Consider the first sentence in train_data: "Patient has a history of diabetes." The corresponding entity is "(0, 7, "PROBLEM")", which indicates that the entity is a problem related to the patient's medical history, and it starts at position 0 and ends at position 7 in the sentence. 

To label this sentence using the BIO tagging scheme, we would first split the sentence into tokens: ["Patient", "has", "a", "history", "of", "diabetes", "."]. Then, we would label each token with a tag based on its position in the entity. Since the entity starts at position 0, the first token ("Patient") would be labeled "B-PROBLEM", indicating that it is the beginning of the entity. 

The next token ("has") is outside the entity, so it is labeled "O". The next token ("a") is also outside the entity, so it is labeled "O". The next token ("history") is inside the entity, so it is labeled "I-PROBLEM". The next token ("of") is inside the entity, so it is also labeled "I-PROBLEM". 

The next token ("diabetes") is inside the entity, so it is also labeled "I-PROBLEM". Finally, the last token (".") is outside the entity, so it is labeled "O". The resulting sequence of labels for this sentence would be: ["B-PROBLEM", "O", "O", "I-PROBLEM", "I-PROBLEM", "I-PROBLEM", "O"].

In [7]:
# Define the vocabulary
vocab = {"<PAD>": 0, "<UNK>": 1, "Patient": 2, "has": 3, "a": 4, "history": 5, "of": 6, "diabetes": 7, ".": 8, "The": 9, "patient": 10, "is": 11, "allergic": 12, "to": 13, "penicillin": 14, "family": 15, "heart": 16, "disease": 17, "taking": 18, "aspirin": 19, "for": 20, "pain": 21, "relief": 22, "asthma": 23}

# Define the maximum sequence length
max_len = 50


In [8]:
# Convert the training data to sequences of word indices and label indices
X = []
y = []
for sentence, entities in train_data:
    X_seq = [vocab.get(word, vocab["<UNK>"]) for word in sentence.split()]
    y_seq = np.zeros(len(X_seq))
    for entity in entities:
        start, end, label = entity
        start_idx = len(sentence[:start].split())
        end_idx = len(sentence[:end].split()) - 1
        for i in range(start_idx, end_idx + 1):
            y_seq[i] = labels[label]
    X.append(X_seq)
    y.append(y_seq)


In [9]:
# Pad the sequences to the maximum length
X = pad_sequences(X, maxlen=max_len, padding="post", value=vocab["<PAD>"])
y = pad_sequences(y, maxlen=max_len, padding="post", value=-1)

# Convert the label indices to one-hot vectors
y = to_categorical(y, num_classes=len(labels))


In [10]:
# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(len(labels), activation="softmax")))


2023-05-15 18:42:54.998350: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 18:42:55.005439: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 18:42:55.009410: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [11]:
# Compile the model
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.001), metrics=["accuracy"])

# Train the model
model.fit(X, y, batch_size=32, epochs=10, validation_split=0.2)



Epoch 1/10


  super().__init__(name, **kwargs)
2023-05-15 18:43:03.356519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 18:43:03.364033: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 18:43:03.366214: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this 



2023-05-15 18:43:10.284583: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 18:43:10.288449: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 18:43:10.294587: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fec74322250>

In [13]:
# Define the test data
test_data = [
    "The patient has a history of heart disease.",
    "The patient is allergic to penicillin.",
    "The patient is taking aspirin for pain relief."
]



In [14]:
# Convert the test data to sequences of word indices
X_test = []
for sentence in test_data:
    X_test_seq = [vocab.get(word, vocab["<UNK>"]) for word in sentence.split()]
    X_test.append(X_test_seq)



In [15]:
# Pad the sequences to the maximum length
X_test = pad_sequences(X_test, maxlen=max_len, padding="post", value=vocab["<PAD>"])

# Make predictions on the test data
y_pred = model.predict(X_test)

# Convert the predicted label probabilities to label indices
y_pred = np.argmax(y_pred, axis=-1)



2023-05-15 18:43:48.600454: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-15 18:43:48.603349: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-15 18:43:48.609910: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [16]:
# Convert the label indices to labels
labels_inv = {v: k for k, v in labels.items()}
y_pred_labels = [[labels_inv.get(idx, "O") for idx in seq] for seq in y_pred]

# Print the predicted entities for each sentence
for i, sentence in enumerate(test_data):
    print(f"Sentence {i+1}: {sentence}")
    for j, label in enumerate(y_pred_labels[i]):
        if label != "O":
            print(f"Entity {j+1}: {label}")
    print()




Sentence 1: The patient has a history of heart disease.
Entity 1: TREATMENT
Entity 2: TREATMENT
Entity 3: TREATMENT
Entity 4: TREATMENT
Entity 5: TREATMENT
Entity 6: TREATMENT
Entity 7: TREATMENT
Entity 8: TREATMENT
Entity 9: TREATMENT
Entity 10: TREATMENT
Entity 11: TREATMENT
Entity 12: TREATMENT
Entity 13: TREATMENT
Entity 14: TREATMENT
Entity 15: TREATMENT
Entity 16: TREATMENT
Entity 17: TREATMENT
Entity 18: TREATMENT
Entity 19: TREATMENT
Entity 20: TREATMENT
Entity 21: TREATMENT
Entity 22: TREATMENT
Entity 23: TREATMENT
Entity 24: TREATMENT
Entity 25: TREATMENT
Entity 26: TREATMENT
Entity 27: TREATMENT
Entity 28: TREATMENT
Entity 29: TREATMENT
Entity 30: TREATMENT
Entity 31: TREATMENT
Entity 32: TREATMENT
Entity 33: TREATMENT
Entity 34: TREATMENT
Entity 35: TREATMENT
Entity 36: TREATMENT
Entity 37: TREATMENT
Entity 38: TREATMENT
Entity 39: TREATMENT
Entity 40: TREATMENT
Entity 41: TREATMENT
Entity 42: TREATMENT
Entity 43: TREATMENT
Entity 44: TREATMENT
Entity 45: TREATMENT
Entity 4