In [22]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel

In [23]:

df = pd.read_parquet('mimic_notes_icd_minimal.parquet')

print(df.head())

                                                text icd_code
0  Admission Date:  [**2151-7-16**]       Dischar...    01193
1  Admission Date:  [**2151-7-16**]       Dischar...     4254
2  Admission Date:  [**2151-7-16**]       Dischar...    42731
3  Admission Date:  [**2151-7-16**]       Dischar...     2639
4  Admission Date:  [**2151-7-16**]       Dischar...     2762


In [24]:
df['text'].head(10)

0    Admission Date:  [**2151-7-16**]       Dischar...
1    Admission Date:  [**2151-7-16**]       Dischar...
2    Admission Date:  [**2151-7-16**]       Dischar...
3    Admission Date:  [**2151-7-16**]       Dischar...
4    Admission Date:  [**2151-7-16**]       Dischar...
5    Admission Date:  [**2151-7-16**]       Dischar...
6    Admission Date:  [**2151-7-16**]       Dischar...
7    Admission Date:  [**2151-7-16**]       Dischar...
8    Admission Date:  [**2118-6-2**]       Discharg...
9    Admission Date:  [**2118-6-2**]       Discharg...
Name: text, dtype: object

In [25]:
df['icd_code'].value_counts().head(50)

icd_code
4019     22356
4280     14903
42731    14449
41401    13783
5849      9932
25000     9812
2724      9269
51881     8263
5990      7489
53081     6729
2720      6463
2859      5865
486       5351
2449      5271
2851      5077
496       4906
2762      4809
5070      4247
99592     4084
V5861     4053
0389      4001
311       3667
5859      3608
40390     3601
3051      3582
412       3579
41071     3497
2875      3352
2761      3333
4240      3302
V4581     3239
5119      3152
V1582     2980
V4582     2939
40391     2912
4241      2903
V5867     2725
78552     2709
9971      2696
V290      2693
42789     2669
5845      2617
32723     2502
7742      2494
5180      2468
2760      2465
45829     2341
4168      2341
2767      2317
49390     2302
Name: count, dtype: int64

In [26]:
top_50_codes = df['icd_code'].value_counts().head(50).index

filtered_df = df[df['icd_code'].isin(top_50_codes)]

In [27]:
filtered_df

Unnamed: 0,text,icd_code
2,Admission Date: [**2151-7-16**] Dischar...,42731
4,Admission Date: [**2151-7-16**] Dischar...,2762
5,Admission Date: [**2151-7-16**] Dischar...,5070
6,Admission Date: [**2151-7-16**] Dischar...,5119
10,Admission Date: [**2118-6-2**] Discharg...,51881
...,...,...
692939,"Name: [**Known lastname **],[**Known firstnam...",2724
692941,"Name: [**Known lastname **],[**Known firstnam...",25000
692943,"Name: [**Known lastname **],[**Known firstnam...",45829
692948,"Name: [**Known lastname **],[**Known firstnam...",4241


In [28]:
df

Unnamed: 0,text,icd_code
0,Admission Date: [**2151-7-16**] Dischar...,01193
1,Admission Date: [**2151-7-16**] Dischar...,4254
2,Admission Date: [**2151-7-16**] Dischar...,42731
3,Admission Date: [**2151-7-16**] Dischar...,2639
4,Admission Date: [**2151-7-16**] Dischar...,2762
...,...,...
692961,"Name: [**Known lastname 12459**],[**Known fir...",5533
692962,"Name: [**Known lastname 12459**],[**Known fir...",2853
692963,"Name: [**Known lastname 12459**],[**Known fir...",28749
692964,"Name: [**Known lastname 12459**],[**Known fir...",E9331


In [29]:
# to lower case
filtered_df["text"] = filtered_df["text"].str.lower().str.replace(r"[^a-z0-9\s]", "", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["text"] = filtered_df["text"].str.lower().str.replace(r"[^a-z0-9\s]", "", regex=True)


In [30]:
filtered_df

Unnamed: 0,text,icd_code
2,admission date 2151716 discharge date ...,42731
4,admission date 2151716 discharge date ...,2762
5,admission date 2151716 discharge date ...,5070
6,admission date 2151716 discharge date ...,5119
10,admission date 211862 discharge date 2...,51881
...,...,...
692939,name known lastname known firstname 133 j ...,2724
692941,name known lastname known firstname 133 j ...,25000
692943,name known lastname known firstname 133 j ...,45829
692948,name known lastname known firstname 133 j ...,4241


In [31]:
# Reset the index after filtering
filtered_df = filtered_df.reset_index(drop=True)
filtered_df

Unnamed: 0,text,icd_code
0,admission date 2151716 discharge date ...,42731
1,admission date 2151716 discharge date ...,2762
2,admission date 2151716 discharge date ...,5070
3,admission date 2151716 discharge date ...,5119
4,admission date 211862 discharge date 2...,51881
...,...,...
256092,name known lastname known firstname 133 j ...,2724
256093,name known lastname known firstname 133 j ...,25000
256094,name known lastname known firstname 133 j ...,45829
256095,name known lastname known firstname 133 j ...,4241


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(filtered_df["text"], filtered_df["icd_code"], test_size=0.2, random_state=42)


### Prepare DataLoaders
#### we need to map ICD-10 codes to indices for classification

In [34]:
unique_codes = sorted(filtered_df["icd_code"].unique())
code_to_idx = {code: idx for idx, code in enumerate(unique_codes)}
idx_to_code = {idx: code for code, idx in code_to_idx.items()}

# Map original ICD codes to zero-indexed integers
y_train_idx = y_train.map(code_to_idx)
y_val_idx = y_val.map(code_to_idx)


In [33]:
code_to_idx

{'0389': 0,
 '2449': 1,
 '25000': 2,
 '2720': 3,
 '2724': 4,
 '2760': 5,
 '2761': 6,
 '2762': 7,
 '2767': 8,
 '2851': 9,
 '2859': 10,
 '2875': 11,
 '3051': 12,
 '311': 13,
 '32723': 14,
 '4019': 15,
 '40390': 16,
 '40391': 17,
 '41071': 18,
 '412': 19,
 '41401': 20,
 '4168': 21,
 '4240': 22,
 '4241': 23,
 '42731': 24,
 '42789': 25,
 '4280': 26,
 '45829': 27,
 '486': 28,
 '49390': 29,
 '496': 30,
 '5070': 31,
 '5119': 32,
 '5180': 33,
 '51881': 34,
 '53081': 35,
 '5845': 36,
 '5849': 37,
 '5859': 38,
 '5990': 39,
 '7742': 40,
 '78552': 41,
 '99592': 42,
 '9971': 43,
 'V1582': 44,
 'V290': 45,
 'V4581': 46,
 'V4582': 47,
 'V5861': 48,
 'V5867': 49}

### Remap ICD codes

In [37]:
filtered_df["label"] = filtered_df["icd_code"].map(code_to_idx)

In [38]:
filtered_df

Unnamed: 0,text,icd_code,label
0,admission date 2151716 discharge date ...,42731,24
1,admission date 2151716 discharge date ...,2762,7
2,admission date 2151716 discharge date ...,5070,31
3,admission date 2151716 discharge date ...,5119,32
4,admission date 211862 discharge date 2...,51881,34
...,...,...,...
256092,name known lastname known firstname 133 j ...,2724,4
256093,name known lastname known firstname 133 j ...,25000,2
256094,name known lastname known firstname 133 j ...,45829,27
256095,name known lastname known firstname 133 j ...,4241,23


### We'll use the dmis-lab/biobert-base-cased-v1.1 model from HuggingFace.

In [41]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize(texts, labels):
    tokens = tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    tokens["labels"] = torch.tensor(labels.values, dtype=torch.long)
    return tokens

### Tokenize data

In [42]:
train_tokens = tokenize(X_train, y_train_idx)
val_tokens = tokenize(X_val, y_val_idx)

### Create DataLoaders

In [None]:

train_dataset = TensorDataset(train_tokens["input_ids"], train_tokens["attention_mask"], train_tokens["labels"])
val_dataset = TensorDataset(val_tokens["input_ids"], val_tokens["attention_mask"], val_tokens["labels"])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

### Neural Network Model with BioBERT

In [16]:
import torch
from torch import nn
from transformers import AutoModel

class BioBERTClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BioBERTClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0]  # CLS token
        logits = self.classifier(cls_output)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return loss, logits


### Tokenization should only be on [text]

### Train the model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BioBERTClassifier(num_labels=len(code_to_idx)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        loss, _ = model(input_ids, attention_mask, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


### Model Evaluation

In [None]:
from sklearn.metrics import classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        _, logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Map back to ICD codes if needed
predicted_codes = [idx_to_code[p] for p in all_preds]
true_codes = [idx_to_code[t] for t in all_labels]

print(classification_report(true_codes, predicted_codes))
