# ICD Code Classifier from MIMIC-III Clinical Notes using NLP and Neural Networks

This notebook demonstrates ICD code classification from clinical notes using the MIMIC-III dataset. It includes baseline models, transformer-based models, prompt learning, knowledge injection, hyperparameter search, and explainability. The codes in MIMIC-III are ICD-9; for ICD-10 tasks, you may map ICD-9 to ICD-10 using an external mapping.


In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import random
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


  from .autonotebook import tqdm as notebook_tqdm





## 1. Data Loading and Preparation (MIMIC-III)

In [2]:
# Load MIMIC-III notes and diagnoses
DATA_DIR = './mimic-iii-clinical-database-1.4/'
notes = pd.read_csv(DATA_DIR + 'NOTEEVENTS.csv')
diagnoses = pd.read_csv(DATA_DIR + 'DIAGNOSES_ICD.csv')
admissions = pd.read_csv(DATA_DIR + 'ADMISSIONS.csv')

# Merge to get notes and ICD-9 codes per admission
notes = notes[notes['CATEGORY'] == 'Discharge summary']
merged = notes.merge(admissions[['HADM_ID', 'SUBJECT_ID']], on=['HADM_ID', 'SUBJECT_ID'])
merged = merged.merge(diagnoses[['HADM_ID', 'ICD9_CODE']], on='HADM_ID')

df = merged[['TEXT', 'ICD9_CODE']].rename(columns={'TEXT': 'text', 'ICD9_CODE': 'icd_code'})


# Only keep classes with at least 3 samples
code_counts = df['icd_code'].value_counts()
df = df[df['icd_code'].isin(code_counts[code_counts >= 3].index)]

# Stratified sample if needed
if len(df) > 2000:
    df, _ = train_test_split(df, train_size=5000, stratify=df['icd_code'], random_state=SEED)

# Remove classes with <3 samples again (in case sampling reduced some)
code_counts = df['icd_code'].value_counts()
df = df[df['icd_code'].isin(code_counts[code_counts >= 3].index)]

# Encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['icd_code'])

# Double-check
print(df['label'].value_counts().min())  # Should be at least 3
print(f"Number of unique ICD codes: {df['label'].nunique()}")
print(f"Total samples: {len(df)}")
print(df.head())


3
Number of unique ICD codes: 390
Total samples: 3816
                                                     text icd_code  label
286401  Admission Date: [**2111-2-18**]        Dischar...    76527    271
453099  Admission Date:  [**2168-12-29**]             ...    V4986    382
61587   Admission Date:  [**2127-12-16**]             ...    36201    111
674347  Name:  [**Known lastname 1985**],[**Known firs...     3962    113
467364  Admission Date:  [**2141-5-31**]              ...    42731    142


## 2. Data Preprocessing

In [3]:
# Basic text cleaning (customize as needed)
def clean_text(text):
    text = str(text).lower()
    text = text.replace('\n', ' ')
    return text

df['text'] = df['text'].apply(clean_text)

label2code = dict(zip(df['label'], df['icd_code']))
code2label = dict(zip(df['icd_code'], df['label']))

print(df[['text', 'icd_code', 'label']].head())
print(f"Number of unique ICD codes: {len(label2code)}")


                                                     text icd_code  label
286401  admission date: [**2111-2-18**]        dischar...    76527    271
453099  admission date:  [**2168-12-29**]             ...    V4986    382
61587   admission date:  [**2127-12-16**]             ...    36201    111
674347  name:  [**known lastname 1985**],[**known firs...     3962    113
467364  admission date:  [**2141-5-31**]              ...    42731    142
Number of unique ICD codes: 390


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['label'])
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=SEED, stratify=train_df['label'])
print(f'Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}')


Train: 2441, Val: 611, Test: 764


## 3. Baseline: TF-IDF + Logistic Regression

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_df['text'])
X_val = vectorizer.transform(val_df['text'])
X_test = vectorizer.transform(test_df['text'])

clf = LogisticRegression(max_iter=1000, random_state=SEED)
clf.fit(X_train, train_df['label'])
val_preds = clf.predict(X_val)
test_preds = clf.predict(X_test)

# Get the sorted list of labels present in the test set
labels_in_test = sorted(test_df['label'].unique())
target_names = [str(label2code[i]) for i in labels_in_test]

print(classification_report(
    test_df['label'],
    test_preds,
    labels=labels_in_test,
    target_names=target_names
))

              precision    recall  f1-score   support

       00845       0.00      0.00      0.00         2
        0380       0.00      0.00      0.00         1
       03811       0.00      0.00      0.00         1
       03842       0.00      0.00      0.00         1
        0389       0.00      0.00      0.00         6
       04104       0.00      0.00      0.00         1
       04111       0.00      0.00      0.00         1
       04119       0.00      0.00      0.00         1
        0413       0.00      0.00      0.00         1
        0414       0.00      0.00      0.00         2
        0417       0.00      0.00      0.00         1
       04185       0.00      0.00      0.00         1
         042       0.00      0.00      0.00         1
       07054       0.00      0.00      0.00         2
       07070       0.00      0.00      0.00         1
        1120       0.00      0.00      0.00         1
        1122       0.00      0.00      0.00         1
        1623       0.00    

## 4. Transformer Models (BERT, ClinicalBERT, RoBERTa, Longformer)

In [6]:
import accelerate
print(accelerate.__version__)

1.7.0


In [None]:
# Choose model_name from: 'bert-base-uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'roberta-base', 'allenai/longformer-base-4096'
model_names = [
    # 'bert-base-uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
    'roberta-base',
    'allenai/longformer-base-4096'
]

results = {}
for model_name in model_names:
    print(f'Training model: {model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize(batch):
        return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=256)
    train_ds = Dataset.from_pandas(train_df[['text', 'label']])
    val_ds = Dataset.from_pandas(val_df[['text', 'label']])
    test_ds = Dataset.from_pandas(test_df[['text', 'label']])
    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)
    test_ds = test_ds.map(tokenize, batched=True)
    train_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    val_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    test_ds.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2code))
    training_args = TrainingArguments(
        output_dir=f'./results_{model_name.replace("/", "_")}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy='epoch',
        save_strategy='no',
        learning_rate=2e-5,
        logging_steps=10,
        seed=SEED,
        load_best_model_at_end=False,
        metric_for_best_model='eval_loss',
        report_to='none'
    )
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        return {
            'accuracy': accuracy_score(labels, preds),
            'macro_f1': f1_score(labels, preds, average='macro')
        }
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )
    trainer.train()
    val_metrics = trainer.evaluate(val_ds)
    test_metrics = trainer.evaluate(test_ds)
    results[model_name] = {'val': val_metrics, 'test': test_metrics}
    print(f"Validation macro F1: {val_metrics['eval_macro_f1']:.4f}, Test macro F1: {test_metrics['eval_macro_f1']:.4f}")


Training model: emilyalsentzer/Bio_ClinicalBERT


Map: 100%|██████████| 2441/2441 [00:14<00:00, 167.91 examples/s]
Map: 100%|██████████| 611/611 [00:02<00:00, 288.42 examples/s]
Map: 100%|██████████| 764/764 [00:02<00:00, 302.91 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,5.5201,5.495613,0.042553,0.000268
2,5.4868,5.44189,0.042553,0.000268
3,5.603,5.424129,0.042553,0.000268


Validation macro F1: 0.0003, Test macro F1: 0.0002
Training model: roberta-base


Map: 100%|██████████| 2441/2441 [00:09<00:00, 270.44 examples/s]
Map: 100%|██████████| 611/611 [00:02<00:00, 299.77 examples/s]
Map: 100%|██████████| 764/764 [00:02<00:00, 311.07 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,5.4987,5.469385,0.042553,0.000268
2,5.4643,5.426568,0.042553,0.000268
3,5.6229,5.401266,0.042553,0.000281


Validation macro F1: 0.0003, Test macro F1: 0.0002
Training model: allenai/longformer-base-4096


Map: 100%|██████████| 2441/2441 [00:07<00:00, 323.99 examples/s]
Map: 100%|██████████| 611/611 [00:01<00:00, 390.34 examples/s]
Map: 100%|██████████| 764/764 [00:02<00:00, 370.71 examples/s]
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,5.5401,5.47509,0.042553,0.000268


## 5. Prompt Learning (Optional, OpenPrompt)

In [None]:
# This section requires OpenPrompt (pip install openprompt)
try:
    from openprompt.data_utils import InputExample
    from openprompt.plms import load_plm
    from openprompt.prompts import ManualTemplate, SoftTemplate, MixedTemplate
    from openprompt.prompts import ManualVerbalizer, SoftVerbalizer
    from openprompt import PromptForClassification, PromptDataLoader
    import torch
    # Prepare data for OpenPrompt
    train_examples = [InputExample(text_a=row['text'], label=int(row['label'])) for _, row in train_df.iterrows()]
    val_examples = [InputExample(text_a=row['text'], label=int(row['label'])) for _, row in val_df.iterrows()]
    test_examples = [InputExample(text_a=row['text'], label=int(row['label'])) for _, row in test_df.iterrows()]
    plm, tokenizer, model_config, WrapperClass = load_plm("bert", "bert-base-uncased")
    template = MixedTemplate(model=plm, tokenizer=tokenizer, text="{'placeholder':'text_a'} It can be classified as {'mask'}.")
    verbalizer = SoftVerbalizer(tokenizer, plm, num_classes=len(label2code))
    prompt_model = PromptForClassification(plm=plm, template=template, verbalizer=verbalizer)
    # DataLoader
    train_dataloader = PromptDataLoader(dataset=train_examples, template=template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, max_seq_length=256, batch_size=8, shuffle=True, teacher_forcing=False, predict_eos_token=False, truncate_method="head")
    val_dataloader = PromptDataLoader(dataset=val_examples, template=template, tokenizer=tokenizer, tokenizer_wrapper_class=WrapperClass, max_seq_length=256, batch_size=8, shuffle=False, teacher_forcing=False, predict_eos_token=False, truncate_method="head")
    # Training loop (simplified)
    optimizer = torch.optim.AdamW(prompt_model.parameters(), lr=2e-5)
    for epoch in range(3):
        prompt_model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            loss = prompt_model(batch)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} done.")
    # Validation
    prompt_model.eval()
    all_preds, all_labels = [], []
    for batch in val_dataloader:
        logits = prompt_model(batch)
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        labels = batch['label'].cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels)
    print('Prompt Learning Validation F1:', f1_score(all_labels, all_preds, average='macro'))
except ImportError:
    print('OpenPrompt not installed. Skipping prompt learning section.')


## 6. Knowledge Injection (ICD Descriptions, Synonyms, Hierarchy)

In [None]:
# TODO: Load ICD-9/ICD-10 code descriptions, synonyms, and hierarchy
# Example: Use code descriptions as additional features or for retrieval-augmented generation (RAG)
# For demonstration, we'll just print a placeholder
print('Add ICD-9/ICD-10 code descriptions, synonyms, or hierarchy as features or for RAG here.')


## 7. Hyperparameter Search

In [None]:
# Example: Try different learning rates, batch sizes, and dropout rates for the best transformer model
# You can use optuna, Ray Tune, or manual search
# For demonstration, we'll just print a placeholder
print('Implement hyperparameter search for best model performance here.')


## 8. Evaluation and Explainability

In [None]:
# Evaluate best model on test set
# Show classification report, confusion matrix, and per-code F1
# Optionally, visualize attention weights or use LIME/SHAP for explainability
print('Evaluate the best model and add explainability tools here.')


## 9. Summary and Next Steps
- This notebook demonstrated ICD code classification from MIMIC-III clinical notes using baseline and advanced neural NLP models.
- You can extend it with more data, more advanced models (e.g., NoteContrast, GKI-ICD), or more explainability tools.
- For ICD-10 tasks, map ICD-9 codes to ICD-10 using an external mapping.
- For production, consider using larger datasets, more compute, and domain-specific pretraining.
