In [1]:
!pip install transformers



In [2]:
!pip install pandas



In [3]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np
from datasets import Features, Value

In [6]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np
import torch
from typing import Optional

# Custom Trainer class to ensure contiguous tensors
class CustomTrainer(Trainer):
    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
        # Make all model parameters contiguous before saving
        for param in self.model.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()

        # Call the parent class's save_model method
        super().save_model(output_dir, _internal_call)

# Load pre-trained MedBERT model and tokenizer
model_name = "Charangan/MedBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set num_labels according to the number of classes in your readmission risk prediction task
num_labels = 3  # For example: 'low', 'medium', 'high'

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Define paths to datasets
dataset_paths = {
    'admissions': '/content/drive/MyDrive/dataset/admissions.csv',
    'caregiver': '/content/drive/MyDrive/dataset/caregiver.csv',
    'chartevents': '/content/drive/MyDrive/dataset/chartevents.csv',
    'd_hcpcs': '/content/drive/MyDrive/dataset/d_hcpcs.csv',
    'd_icd_diagnoses': '/content/drive/MyDrive/dataset/d_icd_diagnoses.csv',
    'd_icd_procedures': '/content/drive/MyDrive/dataset/d_icd_procedures.csv',
    'd_labitems': '/content/drive/MyDrive/dataset/d_labitems.csv',
    'diagnoses_icd': '/content/drive/MyDrive/dataset/diagnoses_icd.csv',
    'drgcodes': '/content/drive/MyDrive/dataset/drgcodes.csv',
    'emar': '/content/drive/MyDrive/dataset/emar.csv',
    'hcpcsevents': '/content/drive/MyDrive/dataset/hcpcsevents.csv',
    'icustays': '/content/drive/MyDrive/dataset/icustays.csv',
    'labevents': '/content/drive/MyDrive/dataset/labevents.csv',
    'microbiologyevents': '/content/drive/MyDrive/dataset/microbiologyevents.csv',
    'outputevents': '/content/drive/MyDrive/dataset/outputevents.csv',
    'patients': '/content/drive/MyDrive/dataset/patients.csv',
    'pharmacy': '/content/drive/MyDrive/dataset/pharmacy.csv',
    'procedures_icd': '/content/drive/MyDrive/dataset/procedures_icd.csv',
    'services': '/content/drive/MyDrive/dataset/services.csv',
    'transfers': '/content/drive/MyDrive/dataset/transfers.csv',
    'final_features':'/content/drive/MyDrive/dataset/finall_features.csv',
    'target_variables_all_fin':'/content/drive/MyDrive/dataset/target_variables_all_fin.csv'
}

def tokenize_and_format(examples):
    # Construct text from available fields
    text = f"{examples['admission_type']} {examples['admission_location']} {examples['discharge_location']}"

    # Tokenize
    tokenized = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=275,
    )

    # Add a dummy label (replace this with your actual label logic)
    tokenized['labels'] = 0

    return tokenized

# Load and prepare dataset
dataset = load_dataset('csv', data_files={'train': dataset_paths['admissions']}, delimiter=',')

# Print dataset info
print("Dataset info:")
print(dataset)
print("\nDataset columns:", dataset['train'].column_names)

# Prepare the dataset
tokenized_dataset = dataset['train'].map(tokenize_and_format, remove_columns=dataset['train'].column_names)

# Print sample
print("\nSample from tokenized dataset:")
print(tokenized_dataset[0])

# Verify dataset structure
print("\nDataset features:")
print(tokenized_dataset.features)

# Split dataset
train_dataset = tokenized_dataset.shuffle(seed=42).select(range(275))
eval_dataset = tokenized_dataset.shuffle(seed=42).select(range(100))

# Define compute_metrics function
def compute_metrics(p):
    metric = load_metric("accuracy")
    predictions = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predictions, references=p.label_ids)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Print sample batch
print("\nSample batch:")
batch = data_collator([train_dataset[i] for i in range(4)])
for k, v in batch.items():
    print(f"{k}: shape {v.shape}, dtype {v.dtype}")

# Train the model
try:
    trainer.train()
except Exception as e:
    print(f"An error occurred during training: {str(e)}")
    print("Input shape:", batch['input_ids'].shape)
    raise

# Evaluate the model
results = trainer.evaluate()
print("Evaluation results:", results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Charangan/MedBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset info:
DatasetDict({
    train: Dataset({
        features: ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race', 'edregtime', 'edouttime', 'hospital_expire_flag'],
        num_rows: 275
    })
})

Dataset columns: ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime', 'admission_type', 'admit_provider_id', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race', 'edregtime', 'edouttime', 'hospital_expire_flag']

Sample from tokenized dataset:
{'input_ids': [101, 158, 2069, 16523, 15681, 157, 9664, 12412, 2271, 9637, 143, 21564, 2107, 145, 9025, 23203, 9159, 2162, 17447, 17656, 17516, 2137, 151, 19556, 13882, 11780, 6820, 19747, 2162, 12150, 3663, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.000307,1.0


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.000307,1.0
2,No log,0.000152,1.0
3,No log,0.000129,1.0


Evaluation results: {'eval_loss': 0.000129148960695602, 'eval_accuracy': 1.0, 'eval_runtime': 85.7589, 'eval_samples_per_second': 1.166, 'eval_steps_per_second': 0.292, 'epoch': 3.0}


In [25]:
# Define a mapping from numeric class labels to descriptive names
class_labels = {
    0: "LOW",
    1: "MEDIUM",
    2: "HIGH"
}

# Example sample input data formatted as a list of dictionaries
sample_inputs = [
    {
        "admission_type": "Emergency",
        "admission_location": "ICU",
        "discharge_location": "Home",
        "age": 70,
        "gender": "female",
        "ethnicity": "Caucasian",
        "marital_status": "single",
        "employment_status": "unemployed",
        "chronic_conditions": ["asthma", "heart disease"],
        "previous_admissions": 5,
        "length_of_stay": 7,  # in days
        "primary_diagnosis": "chronic obstructive pulmonary disease",
        "medications": ["albuterol", "lisinopril"],
        "discharge_disposition": "home",
    }
]

# Tokenize the sample inputs
def tokenize_sample(inputs):
    # Construct text from available fields
    texts = [f"{item['admission_type']} {item['admission_location']} {item['discharge_location']} {item.get('age', '')} {item.get('gender', '')} {item.get('ethnicity', '')} {item.get('marital_status', '')} {item.get('employment_status', '')} {', '.join(item.get('chronic_conditions', []))} {item.get('previous_admissions', '')} {item.get('length_of_stay', '')} {item.get('primary_diagnosis', '')} {', '.join(item.get('medications', []))} {item.get('discharge_disposition', '')}" for item in inputs]

    # Tokenize
    tokenized_samples = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=275,
        return_tensors='pt'
    )
    return tokenized_samples

# Tokenize the sample inputs
tokenized_samples = tokenize_sample(sample_inputs)

# Move tensors to the same device as the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Make predictions
model.eval()
with torch.no_grad():
    inputs = {k: v.to(device) for k, v in tokenized_samples.items()}
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)

# Print predictions with descriptive class names
print("\nPredictions for sample inputs:")
for i, sample in enumerate(sample_inputs):
    predicted_class_index = predictions[i].item()
    predicted_class_name = class_labels.get(predicted_class_index, "Unknown")
    print(f"Sample {i + 1}:")
    print(f"  Input: {sample}")
    print(f"  Predicted class: {predicted_class_name}")



Predictions for sample inputs:
Sample 1:
  Input: {'admission_type': 'Emergency', 'admission_location': 'ICU', 'discharge_location': 'Home', 'age': 70, 'gender': 'female', 'ethnicity': 'Caucasian', 'marital_status': 'single', 'employment_status': 'unemployed', 'chronic_conditions': ['asthma', 'heart disease'], 'previous_admissions': 5, 'length_of_stay': 7, 'primary_diagnosis': 'chronic obstructive pulmonary disease', 'medications': ['albuterol', 'lisinopril'], 'discharge_disposition': 'home'}
  Predicted class: MEDIUM
