In [2]:
import ast
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer

In [2]:
import pandas as pd

# Replace with your CSV file path
df = pd.read_csv("/content/drive/MyDrive/paper-implementations/healthcare_dataset.csv")

print(df.head())
print(df.columns)


            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [None]:
df.shape

(55500, 15)

In [None]:
# load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
import pandas as pd
import re

# Load the original dataset with PHI columns
df = pd.read_csv("/content/drive/MyDrive/paper-implementations/healthcare_dataset.csv")

# Define PHI columns
PHI_COLUMNS = [
    "Name",
    "Date of Admission",
    "Doctor",
    "Hospital",
    "Insurance Provider",
    "Room Number",
    "Discharge Date"
]

# Function: Convert a row to free-text
def row_to_sentence(row):
    sentence = (
        f"Patient {row['Name']}, a {row['Age']}-year-old {row['Gender']} with {row['Medical Condition']}, "
        f"was admitted on {row['Date of Admission']} to {row['Hospital']} hospital under the care of Dr. {row['Doctor']}. "
        f"The patient was prescribed {row['Medication']} and had test results marked as {row['Test Results']}. "
        f"He was discharged on {row['Discharge Date']}."
    )
    return sentence

# BIOES tagging function
def bioes_label_sentence(sentence, row):
    words = sentence.split()
    labels = ["O"] * len(words)

    for col in PHI_COLUMNS:
        value = str(row[col])
        value_tokens = value.split()

        for i in range(len(words)):
            # crude matching (ignores punctuation/case)
            if re.sub(r'[^\w\s]', '', words[i]).lower() == value_tokens[0].lower():
                span_len = len(value_tokens)

                if span_len == 1:
                    labels[i] = f"S-{col.upper()}"
                else:
                    labels[i] = f"B-{col.upper()}"
                    for j in range(1, span_len-1):
                        if i+j < len(labels):
                            labels[i+j] = f"I-{col.upper()}"
                    if i+span_len-1 < len(labels):
                        labels[i+span_len-1] = f"E-{col.upper()}"
    return words, labels

# Apply
processed = []
for _, row in df.iterrows():
    text = row_to_sentence(row)
    words, labels = bioes_label_sentence(text, row)
    processed.append({"tokens": words, "labels": labels, "text": text})

processed_df = pd.DataFrame(processed)

# Save token-level dataset
processed_df.to_csv("/content/drive/MyDrive/paper-implementations/annotated_notes.csv", index=False)
print(processed_df.head(1))

                                              tokens  \
0  [Patient, Bobby, JacksOn,, a, 30-year-old, Mal...   

                                              labels  \
0  [O, B-NAME, E-NAME, O, O, O, O, O, O, O, O, O,...   

                                                text  
0  Patient Bobby JacksOn, a 30-year-old Male with...  


In [3]:
import pandas as pd

annotated_file = '/content/drive/MyDrive/paper-implementations/annotated_notes.csv'
df = pd.read_csv(annotated_file)
print(df.head())
print(f"Total rows: {len(df)}")

                                              tokens  \
0  ['Patient', 'Bobby', 'JacksOn,', 'a', '30-year...   
1  ['Patient', 'LesLie', 'TErRy,', 'a', '62-year-...   
2  ['Patient', 'DaNnY', 'sMitH,', 'a', '76-year-o...   
3  ['Patient', 'andrEw', 'waTtS,', 'a', '28-year-...   
4  ['Patient', 'adrIENNE', 'bEll,', 'a', '43-year...   

                                              labels  \
0  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
1  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
2  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
3  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
4  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   

                                                text  
0  Patient Bobby JacksOn, a 30-year-old Male with...  
1  Patient LesLie TErRy, a 62-year-old Male with ...  
2  Patient DaNnY sMitH, a 76-year-old Female with...  
3  Patient andrEw waTtS, a 28-year-old Female wit...  
4  Patient adrIENNE bEll, a 43-year-old Female wi..

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  55500 non-null  object
 1   labels  55500 non-null  object
 2   text    55500 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [5]:
df['tokens'] = df['tokens'].apply(ast.literal_eval)
df['labels'] = df['labels'].apply(ast.literal_eval)

In [6]:
# Vocabulary for tokens
token2idx = defaultdict(lambda: len(token2idx))
token2idx['<PAD>'] = 0  # Padding token
for tokens in df['tokens']:
    for token in tokens:
        token2idx[token]

In [7]:
# Vocabulary for labels
tag2idx = defaultdict(lambda: len(tag2idx))
tag2idx['<PAD>'] = 0  # Padding tag
for labels in df['labels']:
    for label in labels:
        tag2idx[label]

In [8]:
idx2token = {i: t for t, i in token2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

vocab_size = len(token2idx)
num_tags = len(tag2idx)
print(f"Vocabulary size: {vocab_size}, Number of tags: {num_tags}")

Vocabulary size: 70717, Number of tags: 11


In [9]:
# Prepare sequences function
def prepare_sequences(df, max_len=100):
    """
    Convert tokens and labels to padded sequences of indices.

    Args:
        df (pd.DataFrame): DataFrame with 'tokens' and 'labels' columns as lists.
        max_len (int): Maximum sequence length for padding (default 100).

    Returns:
        tuple: Padded sequences of token indices (X) and tag indices (y).
    """
    # Convert tokens to indices
    X = [[token2idx[t] for t in tokens] for tokens in df['tokens']]
    # Convert labels to indices
    y = [[tag2idx[l] for l in labels] for labels in df['labels']]

    # Pad sequences to max_len
    X = pad_sequences(X, maxlen=max_len, padding='post', value=token2idx['<PAD>'])
    y = pad_sequences(y, maxlen=max_len, padding='post', value=tag2idx['<PAD>'])

    return X, y

In [10]:
# Split
from sklearn.model_selection import train_test_split
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
X_train, y_train = prepare_sequences(train_df)
X_val, y_val = prepare_sequences(val_df)
X_test, y_test = prepare_sequences(test_df)

In [11]:
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (35520, 100), y_train shape: (35520, 100)


In [12]:
import torch

# convert to pytorch tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train_tensor = torch.tensor(X_train, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.long).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)

In [13]:
import numpy as np
from torch.nn import Embedding, LSTM, GRU

# 1. Fixed Embeddings (GloVe)
def load_glove_embeddings(glove_file, vocab_size, embed_dim=300):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, i in token2idx.items():
        if i >= vocab_size: continue
        vector = embeddings_index.get(word.lower())  # Case-insensitive
        if vector is not None:
            embedding_matrix[i] = vector
    return torch.tensor(embedding_matrix, dtype=torch.float32).to(device)

# Load GloVe
glove_file = '/content/drive/MyDrive/paper-implementations/glove.6B.300d.txt'  # Ensure this file is in your directory
glove_embeddings = load_glove_embeddings(glove_file, vocab_size)
print(f"GloVe embedding matrix shape: {glove_embeddings.shape}")

# 2. Character Embeddings
def create_char_embeddings(tokens, char_vocab_size=100, char_embed_dim=50):
    char2idx = defaultdict(lambda: len(char2idx))
    char2idx['<PAD>'] = 0
    for token_list in tokens:
        for token in token_list:
            for char in token:
                char2idx[char]

    # Create character sequences
    char_sequences = []
    max_char_len = 0
    for token_list in tokens:
        char_seq = [char2idx[char] for token in token_list for char in token]
        char_sequences.append(char_seq)
        max_char_len = max(max_char_len, len(char_seq))

    # Pad character sequences
    char_sequences_padded = pad_sequences(char_sequences, maxlen=max_char_len * 100, padding='post', value=char2idx['<PAD>'])
    char_embed = Embedding(len(char2idx), char_embed_dim).to(device)
    return char_sequences_padded, char_embed

char_sequences, char_embed = create_char_embeddings(df['tokens'])
print(f"Character sequences shape: {char_sequences.shape}, Embedding layer output shape: {char_embed.weight.shape}")

# Verify shapes
print(f"X_train_tensor shape: {X_train_tensor.shape}, y_train_tensor shape: {y_train_tensor.shape}")
print(f"GloVe embeddings shape: {glove_embeddings.shape}")
print(f"Character embeddings shape: {char_sequences.shape}, Character embed weight shape: {char_embed.weight.shape}")

GloVe embedding matrix shape: torch.Size([70717, 300])
Character sequences shape: (55500, 25300), Embedding layer output shape: torch.Size([66, 50])
X_train_tensor shape: torch.Size([35520, 100]), y_train_tensor shape: torch.Size([35520, 100])
GloVe embeddings shape: torch.Size([70717, 300])
Character embeddings shape: (55500, 25300), Character embed weight shape: torch.Size([66, 50])
