In [1]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [2]:
from unsloth import FastLanguageModel
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(model_name = "unsloth/llama-3-8b-bnb-4bit", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSN

In [3]:
import os
import re

def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def parse_document(document):
    key_value_pattern = re.compile(r'^([^,:]+):,(.*)$')
    section_pattern = re.compile(r'-{50,}')
    data = {}
    for line in document.split('\n'):
        if section_pattern.match(line):
            continue
        elif key_value_pattern.match(line):
            key, val = key_value_pattern.match(line).groups()
            data[key.strip()] = val.strip()
        elif line.strip() and list(data):
            last_key = list(data)[-1]
            data[last_key] += ' ' + line.strip()
    return data

def split_criteria(text):
    return [s.strip() for s in re.split(r'\.\s+', text) if s.strip()]

def process_trial_file(file_path):
    doc = read_file(file_path)
    data = parse_document(doc)
    eligibility = data.get("Eligibility Criteria", "")
    inc, exc = "", ""
    if "||" in eligibility:
        inc, exc = eligibility.split("||")
    else:
        inc = eligibility
    return {
        "name": data.get("Name", "Unnamed Trial"),
        "inclusion": split_criteria(inc.replace("Inclusion:", "").strip()),
        "exclusion": split_criteria(exc.replace("Exclusion:", "").strip())
    }

def load_trials(folder_path):
    return [
        process_trial_file(os.path.join(folder_path, f))
        for f in os.listdir(folder_path)
        if f.endswith(".csv")
    ]

In [4]:
trials = load_trials("trials")
print(trials)

[{'name': 'ASCENT-05', 'inclusion': ['" Age > 18 years, with residual invasive triple negative breast cancer (TNBC) in the breast or lymph nodes after neoadjuvant therapy and surgery: TNBC criteria for the study is defined as estrogen receptor (ER) and progesterone receptor (PR) < 10%, human epidermal growth factor receptor 2 (HER2)-negative per American Society of Clinical Oncology and College of American Pathologists (ASCO/CAP) guidelines (immunohistochemistry (IHC) and/or in situ hybridization (ISH))', 'Adequate excision and surgical removal of all clinically evident of disease in the breast and/or lymph nodes and have adequately recovered from surgery', 'Submission of both pre-neoadjuvant treatment diagnostic biopsy and resected residual invasive disease tissue', 'Eastern Cooperative Oncology Group (ECOG) performance status 0-1', 'Individuals must have received appropriate radiotherapy and have recovered prior to starting study treatment', 'Adequate organ function', 'Key'], 'exclus

In [13]:
from unsloth import FastLanguageModel

max_seq_length = 2048
load_in_4bit = True
dtype = None  # auto

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [43]:
from datasets import Dataset
trials = load_trials("trials")
patient_df = pd.read_csv("balanced_varied_patients.csv")
def build_training_dataset(patient_df, trial):
    training_rows = []

    # Format the trial details once
    trial_name = trial["name"]
    inc = "\n".join(f"- {c}" for c in trial["inclusion"])
    exc = "\n".join(f"- {c}" for c in trial["exclusion"])
    trial_description = f"Trial: {trial_name}\nInclusion Criteria:\n{inc}\nExclusion Criteria:\n{exc}"

    for _, row in patient_df.iterrows():
        label = row["eligibility_label"]

        # Format patient profile (exclude label and ID)
        patient_profile = "\n".join([
            f"{k}: {v}"
            for k, v in row.items()
            if k not in ["eligibility_label", "patient_id", "trial_name"]
        ])

        input_text = f"Patient Profile:\n{patient_profile}\n\n{trial_description}"

        training_rows.append({
            "instruction": "Is this patient eligible for the trial? Respond with 'eligible' or 'not eligible'.",
            "input": input_text,
            "output": label.lower()
        })

    return Dataset.from_list(training_rows)
training_data = build_training_dataset(patient_df, trials[0])

In [44]:
processed_data = [
    {
        "text": f"{ex['instruction']}\n\n{ex['input']}\n\n{ex['output']}{tokenizer.eos_token}"
    }
    for ex in training_data
]


In [45]:
from datasets import Dataset
dataset = Dataset.from_list(processed_data)

In [46]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="llama3_trial_matcher",
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/200 [00:00<?, ? examples/s]

In [47]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 4 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
1,0.0095
2,0.0116
3,0.014
4,0.0082
5,0.0215
6,0.0085
7,0.0186
8,0.0059
9,0.0189
10,0.0278


TrainOutput(global_step=100, training_loss=0.014286456541158258, metrics={'train_runtime': 204.3479, 'train_samples_per_second': 3.915, 'train_steps_per_second': 0.489, 'total_flos': 2.4768457581133824e+16, 'train_loss': 0.014286456541158258})

In [48]:
trainer.save_model("llama3_trial_matcher")
tokenizer.save_pretrained("llama3_trial_matcher")

('llama3_trial_matcher/tokenizer_config.json',
 'llama3_trial_matcher/special_tokens_map.json',
 'llama3_trial_matcher/tokenizer.json')

In [55]:
from unsloth import FastLanguageModel
import torch

# --- Model Setup ---
max_seq_length = 1024
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "llama3_trial_matcher",  # Fine-tuned model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)

# --- Patient Profile ---
patient_profile = """age_over_18: true
triple_negative: true
her2_negative: true
ecog_0_1: true
adequate_organ_function: true
received_radiotherapy: true
no_metastasis: true
previous_cancer: false
cardiac_condition: false
brca_mutation: false
prior_her2_treatment: false
prior_endocrine_therapy: false
active_infection: false
pregnant_or_breastfeeding: false
lvef_below_50: false
arrhythmia: false
ctla4_inhibitor: false
cd137_agent: false
ox40_agent: false
topoisomerase_inhibitor: false
er_positive: true"""

# --- Trial Description (Formatted) ---
trial_description = """Trial: ASCENT-05

Inclusion Criteria:
- Age > 18 years
- Residual invasive triple negative breast cancer (TNBC) in the breast or lymph nodes after neoadjuvant therapy and surgery
- TNBC defined as ER and PR < 10%, and HER2-negative per ASCO/CAP guidelines (IHC/ISH)
- Adequate excision and surgical removal of all clinically evident disease in breast and/or lymph nodes
- Adequately recovered from surgery
- Submission of both pre-neoadjuvant treatment diagnostic biopsy and resected residual invasive disease tissue
- ECOG performance status 0-1
- Received appropriate radiotherapy and recovered before starting study treatment
- Adequate organ function

Exclusion Criteria:
- Stage IV (metastatic) breast cancer
- History of prior (ipsi- or contralateral) invasive breast cancer
- Prior treatment with stimulatory or coinhibitory T-cell receptor agents (e.g., CTLA-4, OX-40, CD137)
- Prior treatment with any HER2-directed agent
- Prior or concurrent treatment with any endocrine therapy agent
- Evidence of recurrent disease following preoperative therapy and surgery
- Prior treatment with topoisomerase 1 inhibitors or ADCs containing a topoisomerase inhibitor
- Individuals with germline BRCA mutations
- Myocardial infarction or unstable angina pectoris within 6 months of enrollment
- History of serious ventricular arrhythmia (ventricular tachycardia or fibrillation)
- High-grade atrioventricular block or other serious cardiac arrhythmias
- Left ventricular ejection fraction (LVEF) < 50%
- Active serious infections requiring antimicrobial therapy"""

# --- Final Prompt ---
instruction = "Is this patient eligible for the trial?. Respond with 'eligible' or 'not eligible' and give a reason."
prompt = f"{instruction}\n\nPatient Profile:\n{patient_profile}\n\n{trial_description}\n\nAnswer:"
# --- Tokenize + Predict ---
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    temperature=0.0,
    do_sample=False,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
)

# --- Decode and Clean ---
response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()

if prompt.lower() in response:
    response = response.replace(prompt.lower(), "").strip()

print("🧠 Final output:", response)


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
🧠 Final output: not eligible


In [None]:
alpaca_prompt = "{}\n\n{}\n\n"

In [26]:
def format_patient_profile(row):
    """
    Converts a patient's feature row (e.g., from a pandas DataFrame)
    into a readable string for model input.
    """
    field_descriptions = {
        "age_over_18": "Age under 18",
        "triple_negative": "Triple negative breast cancer",
        "er_positive": "Estrogen receptor positive",
        "her2_negative": "HER2 negative",
        "ecog_0_1": "ECOG performance status 0 or 1",
        "adequate_organ_function": "Adequate organ function",
        "no_metastasis": "No metastasis",
        "previous_cancer": "History of previous cancer",
        "received_radiotherapy": "Received radiotherapy",
        "cardiac_condition": "Cardiac condition",
        "brca_mutation": "BRCA gene mutation",
        "prior_her2_treatment": "Prior HER2 treatment",
        "prior_endocrine_therapy": "Prior endocrine therapy",
        "active_infection": "Has active infection",
        "pregnant_or_breastfeeding": "Pregnant or breastfeeding",
        "lvef_below_50": "LVEF below 50%",
        "arrhythmia": "Cardiac arrhythmia",
        "ctla4_inhibitor": "Treated with CTLA-4 inhibitor",
        "cd137_agent": "Treated with CD137 agent",
        "ox40_agent": "Treated with OX40 agent",
        "topoisomerase_inhibitor": "Treated with topoisomerase inhibitor",
    }

    lines = []
    for field, description in field_descriptions.items():
        value = row[field]
        status = "Yes" if value else "No"
        lines.append(f"{description}: {status}")

    return "\n".join(lines)


In [27]:
import pandas as pd

df = pd.read_csv("balanced_varied_patients.csv")
patient_row = df.iloc[1]

formatted_input = format_patient_profile(patient_row)
print(formatted_input)

Age under 18: Yes
Triple negative breast cancer: Yes
Estrogen receptor positive: Yes
HER2 negative: Yes
ECOG performance status 0 or 1: Yes
Adequate organ function: Yes
No metastasis: No
History of previous cancer: No
Received radiotherapy: Yes
Cardiac condition: No
BRCA gene mutation: No
Prior HER2 treatment: No
Prior endocrine therapy: No
Has active infection: No
Pregnant or breastfeeding: No
LVEF below 50%: No
Cardiac arrhythmia: No
Treated with CTLA-4 inhibitor: No
Treated with CD137 agent: No
Treated with OX40 agent: No
Treated with topoisomerase inhibitor: No


In [56]:
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm  # Optional: for progress bar

# Load test patients
df = pd.read_csv("balanced_varied_patients2.csv")

# Hold predictions
predictions = []
true_labels = []

# Trial stays fixed if you're only testing ASCENT-05
trial_description = """Trial: ASCENT-05

Inclusion Criteria:
- Age > 18 years
- Residual invasive triple negative breast cancer (TNBC) in the breast or lymph nodes after neoadjuvant therapy and surgery
- TNBC defined as ER and PR < 10%, and HER2-negative per ASCO/CAP guidelines (IHC/ISH)
- Adequate excision and surgical removal of all clinically evident disease in breast and/or lymph nodes
- Adequately recovered from surgery
- Submission of both pre-neoadjuvant treatment diagnostic biopsy and resected residual invasive disease tissue
- ECOG performance status 0-1
- Received appropriate radiotherapy and recovered before starting study treatment
- Adequate organ function

Exclusion Criteria:
- Stage IV (metastatic) breast cancer
- History of prior (ipsi- or contralateral) invasive breast cancer
- Prior treatment with stimulatory or coinhibitory T-cell receptor agents (e.g., CTLA-4, OX-40, CD137)
- Prior treatment with any HER2-directed agent
- Prior or concurrent treatment with any endocrine therapy agent
- Evidence of recurrent disease following preoperative therapy and surgery
- Prior treatment with topoisomerase 1 inhibitors or ADCs containing a topoisomerase inhibitor
- Individuals with germline BRCA mutations
- Myocardial infarction or unstable angina pectoris within 6 months of enrollment
- History of serious ventricular arrhythmia (ventricular tachycardia or fibrillation)
- High-grade atrioventricular block or other serious cardiac arrhythmias
- Left ventricular ejection fraction (LVEF) < 50%
- Active serious infections requiring antimicrobial therapy
"""

# Run prediction per patient
for _, row in tqdm(df.iterrows(), total=len(df)):
    # --- Format patient profile ---
    patient_profile = "\n".join([
        f"{k}: {str(v).lower()}" for k, v in row.items()
        if k not in ["eligibility_label", "patient_id", "trial_name"]
    ])

    # --- Compose prompt ---
    instruction = "Is this patient eligible for the trial? Respond with 'eligible' or 'not eligible'."
    prompt = f"{instruction}\n\nPatient Profile:\n{patient_profile}\n\n{trial_description}\n\nAnswer:"

    # --- Run model ---
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,
        temperature=0.0,
        do_sample=False,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # --- Clean output ---
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    if prompt.lower() in response:
        response = response.replace(prompt.lower(), "").strip()

    # Extract label
    prediction = "eligible" if "eligible" in response and "not" not in response else "not eligible"
    label = row["eligibility_label"].strip().lower()

    predictions.append(prediction)
    true_labels.append(label)

# --- Evaluate ---
print("\n📊 Classification Report:")
print(classification_report(true_labels, predictions, digits=3))


100%|██████████| 100/100 [00:24<00:00,  4.08it/s]


📊 Classification Report:
              precision    recall  f1-score   support

    eligible      1.000     0.240     0.387        50
not eligible      0.568     1.000     0.725        50

    accuracy                          0.620       100
   macro avg      0.784     0.620     0.556       100
weighted avg      0.784     0.620     0.556       100




