In [5]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [1]:
import ast
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from torchcrf import CRF

In [3]:
import pandas as pd

# Replace with your CSV file path
df = pd.read_csv("/content/drive/MyDrive/paper-implementations/healthcare_dataset.csv")

print(df.head())
print(df.columns)


            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [4]:
df.shape

(55500, 15)

In [5]:
# load tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
import pandas as pd
import re

# Load the original dataset with PHI columns
df = pd.read_csv("/content/drive/MyDrive/paper-implementations/healthcare_dataset.csv")

# Define PHI columns
PHI_COLUMNS = [
    "Name",
    "Date of Admission",
    "Doctor",
    "Hospital",
    "Insurance Provider",
    "Room Number",
    "Discharge Date"
]

# Function: Convert a row to free-text
def row_to_sentence(row):
    sentence = (
        f"Patient {row['Name']}, a {row['Age']}-year-old {row['Gender']} with {row['Medical Condition']}, "
        f"was admitted on {row['Date of Admission']} to {row['Hospital']} hospital under the care of Dr. {row['Doctor']}. "
        f"The patient was prescribed {row['Medication']} and had test results marked as {row['Test Results']}. "
        f"He was discharged on {row['Discharge Date']}."
    )
    return sentence

# BIOES tagging function
def bioes_label_sentence(sentence, row):
    words = sentence.split()
    labels = ["O"] * len(words)

    for col in PHI_COLUMNS:
        value = str(row[col])
        value_tokens = value.split()

        for i in range(len(words)):
            # crude matching (ignores punctuation/case)
            if re.sub(r'[^\w\s]', '', words[i]).lower() == value_tokens[0].lower():
                span_len = len(value_tokens)

                if span_len == 1:
                    labels[i] = f"S-{col.upper()}"
                else:
                    labels[i] = f"B-{col.upper()}"
                    for j in range(1, span_len-1):
                        if i+j < len(labels):
                            labels[i+j] = f"I-{col.upper()}"
                    if i+span_len-1 < len(labels):
                        labels[i+span_len-1] = f"E-{col.upper()}"
    return words, labels

# Apply
processed = []
for _, row in df.iterrows():
    text = row_to_sentence(row)
    words, labels = bioes_label_sentence(text, row)
    processed.append({"tokens": words, "labels": labels, "text": text})

processed_df = pd.DataFrame(processed)

# Save token-level dataset
processed_df.to_csv("/content/drive/MyDrive/paper-implementations/annotated_notes.csv", index=False)
print(processed_df.head(1))

In [2]:
import pandas as pd

annotated_file = '/content/drive/MyDrive/paper-implementations/annotated_notes.csv'
df = pd.read_csv(annotated_file)
print(df.head())
print(f"Total rows: {len(df)}")

                                              tokens  \
0  ['Patient', 'Bobby', 'JacksOn,', 'a', '30-year...   
1  ['Patient', 'LesLie', 'TErRy,', 'a', '62-year-...   
2  ['Patient', 'DaNnY', 'sMitH,', 'a', '76-year-o...   
3  ['Patient', 'andrEw', 'waTtS,', 'a', '28-year-...   
4  ['Patient', 'adrIENNE', 'bEll,', 'a', '43-year...   

                                              labels  \
0  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
1  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
2  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
3  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   
4  ['O', 'B-NAME', 'E-NAME', 'O', 'O', 'O', 'O', ...   

                                                text  
0  Patient Bobby JacksOn, a 30-year-old Male with...  
1  Patient LesLie TErRy, a 62-year-old Male with ...  
2  Patient DaNnY sMitH, a 76-year-old Female with...  
3  Patient andrEw waTtS, a 28-year-old Female wit...  
4  Patient adrIENNE bEll, a 43-year-old Female wi..

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tokens  55500 non-null  object
 1   labels  55500 non-null  object
 2   text    55500 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [4]:
df['tokens'] = df['tokens'].apply(ast.literal_eval)
df['labels'] = df['labels'].apply(ast.literal_eval)

In [5]:
# Vocabulary for tokens
token2idx = defaultdict(lambda: len(token2idx))
token2idx['<PAD>'] = 0  # Padding token
for tokens in df['tokens']:
    for token in tokens:
        token2idx[token]

In [6]:
# Vocabulary for labels
tag2idx = defaultdict(lambda: len(tag2idx))
tag2idx['<PAD>'] = 0  # Padding tag
for labels in df['labels']:
    for label in labels:
        tag2idx[label]

In [7]:
idx2token = {i: t for t, i in token2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

vocab_size = len(token2idx)
num_tags = len(tag2idx)
print(f"Vocabulary size: {vocab_size}, Number of tags: {num_tags}")

Vocabulary size: 70717, Number of tags: 11


In [8]:
# Prepare sequences function
def prepare_sequences(df, max_len=100):
    """
    Convert tokens and labels to padded sequences of indices.

    Args:
        df (pd.DataFrame): DataFrame with 'tokens' and 'labels' columns as lists.
        max_len (int): Maximum sequence length for padding (default 100).

    Returns:
        tuple: Padded sequences of token indices (X) and tag indices (y).
    """
    # Convert tokens to indices
    X = [[token2idx[t] for t in tokens] for tokens in df['tokens']]
    # Convert labels to indices
    y = [[tag2idx[l] for l in labels] for labels in df['labels']]

    # Pad sequences to max_len
    X = pad_sequences(X, maxlen=max_len, padding='post', value=token2idx['<PAD>'])
    y = pad_sequences(y, maxlen=max_len, padding='post', value=tag2idx['<PAD>'])

    return X, y

In [9]:
# Split
from sklearn.model_selection import train_test_split
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
X_train, y_train = prepare_sequences(train_df)
X_val, y_val = prepare_sequences(val_df)
X_test, y_test = prepare_sequences(test_df)

In [10]:
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

X_train shape: (35520, 100), y_train shape: (35520, 100)


In [11]:
import torch

# convert to pytorch tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train_tensor = torch.tensor(X_train, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.long).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(device)

In [13]:
import torch
import torch.nn as nn
import numpy as np
from collections import defaultdict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.nn import Embedding, GRU

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 1. Optimized Fixed Embeddings (GloVe)
def load_glove_embeddings(glove_file, token2idx, embed_dim=300):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in token2idx:  # Only load relevant words
                vector = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = vector
    vocab_size = len(token2idx)
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, idx in token2idx.items():
        if idx < vocab_size:
            vector = embeddings_index.get(word.lower())
            if vector is not None:
                embedding_matrix[idx] = vector
    return torch.tensor(embedding_matrix, dtype=torch.float32).to(device)

# Load GloVe with token2idx (assuming from notebook)
glove_file = '/content/drive/MyDrive/paper-implementations/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file, token2idx)
print(f"GloVe embedding matrix shape: {glove_embeddings.shape}")

# 2. Optimized Character Embeddings
def create_char_embeddings(tokens, max_char_len=20, char_embed_dim=50):
    char2idx = defaultdict(lambda: len(char2idx))
    char2idx['<PAD>'] = 0
    char_sequences = []
    for token_list in tokens:
        seq = []
        for token in token_list:
            chars = [char2idx[char] for char in token[:max_char_len]]
            seq.extend(chars + [char2idx['<PAD>']] * (max_char_len - len(token)))
        char_sequences.append(seq[:max_char_len * 100])  # Cap at 100 tokens
    char_sequences_padded = pad_sequences(char_sequences, maxlen=max_char_len * 100, padding='post', value=char2idx['<PAD>'])
    char_embed = Embedding(len(char2idx), char_embed_dim).to(device)
    return char_sequences_padded, char_embed

# Apply to tokens from DataFrame
char_sequences, char_embed = create_char_embeddings(df['tokens'])
char_sequences = torch.tensor(char_sequences, dtype=torch.long).to(device)
print(f"Character sequences shape: {char_sequences.shape}, Embedding layer output shape: {char_embed.weight.shape}")

# Verify shapes (assuming from notebook)
print(f"X_train_tensor shape: {X_train_tensor.shape}, y_train_tensor shape: {y_train_tensor.shape}")
print(f"GloVe embeddings shape: {glove_embeddings.shape}")
print(f"Character embeddings shape: {char_sequences.shape}, Character embed weight shape: {char_embed.weight.shape}")

# Clear memory
torch.cuda.empty_cache()
import gc; gc.collect()
print("Memory cleared.")

Using device: cuda
GloVe embedding matrix shape: torch.Size([70717, 300])
Character sequences shape: torch.Size([55500, 2000]), Embedding layer output shape: torch.Size([66, 50])
X_train_tensor shape: torch.Size([35520, 100]), y_train_tensor shape: torch.Size([35520, 100])
GloVe embeddings shape: torch.Size([70717, 300])
Character embeddings shape: torch.Size([55500, 2000]), Character embed weight shape: torch.Size([66, 50])
Memory cleared.


In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Batched loading
def load_in_batches(tensor, batch_size=16):
    tensor = tensor.to(device)
    for i in range(0, len(tensor), batch_size):
        batch = tensor[i:i + batch_size]
        if len(batch) == 0: continue
        yield batch

# Model definition
class GRUDeidentificationModel(nn.Module):
    def __init__(self, vocab_size, num_tags, embed_dim=150, char_embed_dim=50, hidden_dim=100, dropout=0.3):
        super(GRUDeidentificationModel, self).__init__()

        self.word_embed = nn.Embedding(vocab_size, embed_dim)
        self.word_embed.weight = nn.Parameter(glove_embeddings[:, :embed_dim])
        self.word_embed.weight.requires_grad = True

        self.char_embed = char_embed
        self.char_conv = nn.Conv1d(in_channels=char_embed_dim, out_channels=25, kernel_size=3)

        self.embed_dim = embed_dim + 25

        self.gru = nn.GRU(self.embed_dim, hidden_dim, batch_first=True, bidirectional=True, dropout=dropout)

        self.hidden2tag = nn.Linear(hidden_dim * 2, num_tags)

        self.crf = CRF(num_tags, batch_first=True)

        self.dropout = nn.Dropout(dropout)

    def forward(self, word_ids, char_ids):
        batch_size, seq_len = word_ids.shape
        if batch_size == 0 or seq_len == 0:
            raise ValueError("Invalid input shape: batch or seq_len is 0")

        word_embeds = self.word_embed(word_ids)
        char_embeds = self.char_embed(char_ids[:, :seq_len])
        char_embeds = char_embeds.transpose(1, 2)
        char_features = self.char_conv(char_embeds).max(dim=-1)[0]
        char_features = char_features.unsqueeze(1).expand(-1, seq_len, -1)

        embeds = torch.cat((word_embeds, char_features), dim=-1)
        embeds = self.dropout(embeds)

        gru_out, _ = self.gru(embeds)
        gru_out = self.dropout(gru_out)

        emissions = self.hidden2tag(gru_out)
        return emissions

    def get_loss(self, emissions, labels, mask):
        mask = mask.bool()
        return -self.crf(emissions, labels, mask=mask)

    def decode_tags(self, emissions, mask):
        mask = mask.bool()
        return self.crf.decode(emissions, mask=mask)

# Training and Evaluation
def train_model(model, X_train, y_train, X_val, y_val, char_sequences, epochs=5, batch_size=16, learning_rate=0.001):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        num_batches = 0
        for batch_x, batch_y, batch_char in zip(load_in_batches(X_train), load_in_batches(y_train), load_in_batches(char_sequences)):
            batch_char = batch_char[:, :100]
            mask = (batch_x != 0).to(device)

            optimizer.zero_grad()
            emissions = model(batch_x, batch_char)
            loss = model.get_loss(emissions, batch_y, mask)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches

        # Validation
        model.eval()
        val_preds = []
        val_true = []
        with torch.no_grad():
            for batch_x, batch_y, batch_char in zip(load_in_batches(X_val), load_in_batches(y_val), load_in_batches(char_sequences)):
                batch_char = batch_char[:, :100]
                mask = (batch_x != 0).to(device)
                emissions = model(batch_x, batch_char)
                preds = model.decode_tags(emissions, mask)
                val_preds.extend(preds)
                val_true.extend(batch_y.cpu().numpy().tolist())

        # Flatten and filter padding (assuming 0 is PAD tag)
        flat_preds = [p for seq in val_preds for p in seq if p != tag2idx.get('<PAD>', 0)]
        flat_true = [t for seq in val_true for t in seq if t != tag2idx.get('<PAD>', 0)]  # Removed .item()

        if len(flat_preds) > 0 and len(flat_true) > 0:
            precision, recall, f1, _ = precision_recall_fscore_support(flat_true, flat_preds, average='micro', zero_division=0)
            print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, "
                  f"Val Precision: {precision:.4f}, Val Recall: {recall:.4f}, Val F1: {f1:.4f}")
        else:
            print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}, No valid predictions for metrics.")

# Initialize and train
try:
    model = GRUDeidentificationModel(vocab_size, num_tags).to(device)
    print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters.")
    train_model(model, X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor, char_sequences)
except RuntimeError as e:
    print(f"GPU memory error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")
    raise

Using device: cuda
Model initialized with 10783179 parameters.
Epoch 1/5, Avg Loss: 20.1580, Val Precision: 0.9992, Val Recall: 0.9992, Val F1: 0.9992
Epoch 2/5, Avg Loss: 2.6078, Val Precision: 0.9986, Val Recall: 0.9986, Val F1: 0.9986
Epoch 3/5, Avg Loss: 1.7052, Val Precision: 0.9983, Val Recall: 0.9983, Val F1: 0.9983
Epoch 4/5, Avg Loss: 1.1076, Val Precision: 0.9976, Val Recall: 0.9976, Val F1: 0.9976
Epoch 5/5, Avg Loss: 0.7037, Val Precision: 0.9936, Val Recall: 0.9936, Val F1: 0.9936
