In [5]:
!pip install datasets transformers peft bitsandbytes accelerate --quiet

In [6]:
import pandas as pd
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.model_selection import train_test_split
import torch
import wandb

In [7]:
wandb.login()

True

In [8]:
df = pd.read_csv("MedicalTranscription.csv",quotechar='"', on_bad_lines='skip')

df = df[["medical_specialty", "transcription"]]
df = df.dropna()
df = df[df["medical_specialty"].str.len() > 0]
df = df[df["transcription"].str.len() > 100]

In [9]:
df["text"] = "Specialty: " + df["medical_specialty"] + "\n\n" + df["transcription"]
df["label"] = df["medical_specialty"]
df = df[["text", "label"]]


In [10]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

In [11]:
def convert_to_chat_format(df, out_filename):
    data = []
    for _, row in df.iterrows():
        example = {
            "messages": [
                {"role": "system", "content": "You are a helpful medical assistant."},
                {"role": "user", "content": f"Below is a medical note. Please classify it into the correct category.\n\nNote:\n{row['text']}"},
                {"role": "assistant", "content": row["label"]}
            ]
        }
        data.append(example)

    with open(out_filename, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


In [12]:
print(train_df.columns)


Index(['text', 'label'], dtype='object')


In [13]:
convert_to_chat_format(train_df, "train_chat_format.jsonl")
convert_to_chat_format(test_df, "test_chat_format.jsonl")

In [14]:
with open("train_chat_format.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(line)
        if i == 4:
            break


{"messages": [{"role": "system", "content": "You are a helpful medical assistant."}, {"role": "user", "content": "Below is a medical note. Please classify it into the correct category.\n\nNote:\nSpecialty:  Surgery\n\nPREOPERATIVE DIAGNOSIS:,  Rhabdomyosarcoma of the left orbit.,POSTOPERATIVE DIAGNOSIS:,  Rhabdomyosarcoma of the left orbit.,PROCEDURE: , Left subclavian vein MediPort placement (7.5-French single-lumen).,INDICATIONS FOR PROCEDURE: , This patient is a 16-year-old girl, with newly diagnosed rhabdomyosarcoma of the left orbit.  The patient is being taken to the operating room for MediPort placement.  She needs chemotherapy.,DESCRIPTION OF PROCEDURE: , The patient was taken to the operating room, placed supine, put under general endotracheal anesthesia.  The patient's neck, chest, and shoulders were prepped and draped in usual sterile fashion.  An incision was made on the left shoulder area.  The left subclavian vein was cannulated.  The wire was passed, which was in good po

In [15]:
data_files = {
    "train": "train_chat_format.jsonl",
    "test": "test_chat_format.jsonl"
}


In [16]:
dataset = load_dataset("json", data_files=data_files, split=None)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
print(f"Train samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")


Train samples: 3936
Test samples: 984


In [18]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [19]:
def preprocess_function(examples):
    def concat_messages(messages):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)

    inputs = [concat_messages(m) for m in examples["messages"]]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    labels = []
    for messages in examples["messages"]:

        assistant_content = messages[-1]["content"]
        label_ids = tokenizer(assistant_content, truncation=True, padding="max_length", max_length=512, return_tensors="pt")["input_ids"]
        labels.append(label_ids)

    padded_labels = torch.cat(labels, dim=0)

    padded_labels[padded_labels == tokenizer.pad_token_id] = -100

    model_inputs["labels"] = padded_labels
    return model_inputs

In [20]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3936 [00:00<?, ? examples/s]

Map:   0%|          | 0/984 [00:00<?, ? examples/s]

In [21]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [22]:
model = get_peft_model(model, lora_config)

In [23]:
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=50,
    save_total_limit=2,
    learning_rate=3e-4,
    warmup_steps=100,
    fp16=True,
    load_best_model_at_end=True
)

In [24]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [25]:
wandb.init(project="classification-with-llm",name = "fine-tuning-llm")

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [27]:
trainer.train()



Step,Training Loss,Validation Loss
100,1.7661,1.66469
200,1.6228,1.614897
300,1.5567,1.58201
400,1.5915,1.5649
500,1.5303,1.547459


Step,Training Loss,Validation Loss
100,1.7661,1.66469
200,1.6228,1.614897
300,1.5567,1.58201
400,1.5915,1.5649
500,1.5303,1.547459
600,1.4823,1.532966
700,1.453,1.522483
800,1.4498,1.515137
900,1.5114,1.505013
1000,1.5164,1.493754


TrainOutput(global_step=1968, training_loss=1.5120480099344642, metrics={'train_runtime': 2860.5737, 'train_samples_per_second': 1.376, 'train_steps_per_second': 0.688, 'total_flos': 1.2535934251696128e+16, 'train_loss': 1.5120480099344642, 'epoch': 1.0})

In [28]:
trainer.evaluate()

{'eval_loss': 1.4518367052078247,
 'eval_runtime': 100.8721,
 'eval_samples_per_second': 9.755,
 'eval_steps_per_second': 4.877,
 'epoch': 1.0}

In [31]:
wandb.finish()