In [2]:
!pip install transformers datasets evaluate rouge_score nltk


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=fd9239f55694509c331081383502ac476e98ddcfef61c0fb453a129f33bfd234
  

In [3]:

!pip install transformers datasets evaluate rouge_score peft -q

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate


ds = load_dataset("akemiH/NoteChat", split="train")
small = ds.shuffle(seed=42).select(range(13000))
train_ds = small.select(range(10000))
test_ds  = small.select(range(10000, 13000))


model_name = "LGAI-EXAONE/EXAONE-Deep-2.4B"
tokenizer  = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
eos = tokenizer.eos_token or tokenizer.sep_token or "<|endoftext|>"

def format_example(ex):
    return {"full_text": ex["conversation"] + eos + ex["data"] + eos}

train_ds = train_ds.map(format_example, remove_columns=train_ds.column_names)
test_ds  = test_ds.map(format_example,  remove_columns=test_ds.column_names)

def tokenize_fn(examples):
    toks = tokenizer(
        examples["full_text"],
        max_length=256,
        truncation=True,
        padding="max_length"
    )
    toks["labels"] = toks["input_ids"].copy()
    return toks

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds  = test_ds.map(tokenize_fn,  batched=True)


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.gradient_checkpointing_enable()

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_cfg)


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


rouge = evaluate.load("rouge")
bleu  = evaluate.load("bleu")

import bert_score

def compute_metrics(eval_preds):
    pred_ids, label_ids = eval_preds
    preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labs  = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    
    responses, references = [], []
    for p, l in zip(preds, labs):
        rp = p.split(eos, 1)
        lp = l.split(eos, 1)
        responses.append(rp[1].strip() if len(rp)>1 else p)
        references.append(lp[1].strip() if len(lp)>1 else l)
    

    rouge_res = rouge.compute(predictions=responses, references=references)
    

    bleu_res  = bleu.compute(
        predictions=[r.split() for r in responses],
        references=[[ref.split()] for ref in references]
    )
    

    bert_score_res = bert_score.score(responses, references, lang='en')
  
    bert_f1 = bert_score_res[2].mean().item()

    return {
        "rouge1": round(rouge_res["rouge1"].mid.fmeasure * 100, 2),
        "rougeL": round(rouge_res["rougeL"].mid.fmeasure * 100, 2),
        "bleu": round(bleu_res["bleu"] * 100, 2),
        "bert_f1": round(bert_f1 * 100, 2),
    }


training_args = TrainingArguments(
    output_dir="./exaone_notechat_lora_fp16",
    per_device_train_batch_size=10,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=20,
    save_steps=500,
    save_total_limit=1,
    fp16=True,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
torch.cuda.empty_cache()


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's 

2025-05-01 20:31:39.441754: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746131499.714389      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746131499.787763      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/70.9k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

configuration_exaone.py:   0%|          | 0.00/9.92k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B:
- configuration_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_exaone.py:   0%|          | 0.00/63.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B:
- modeling_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.81G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/223 [00:00<?, ?B/s]

Trainable parameters:
  base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.1.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.1.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.2.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.2.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.3.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.3.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.4.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.4.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.5.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.5.mlp.c_proj.lora_B.default.weight
  base_model.model.transformer.h.6.mlp.c_proj.lora_A.default.weight
  base_model.model.transformer.h.6.mlp.c_proj.lora_B.default.weight
  base_model.model.transfo

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,4.513
40,4.3037
60,4.1515
80,3.9973
100,3.8688
120,3.7713
140,3.7577
160,3.6313
180,3.608
200,3.5738


In [20]:

!pip install bert-score -q
import torch
from tqdm.auto import tqdm

model.eval()
device = next(model.parameters()).device

predictions = []
references  = []

for example in test_ds:
    # Prepare input_ids (prompt only)
    input_ids = torch.tensor([example["input_ids"]]).to(device)

    with torch.no_grad():
        
        output_ids = model.generate(
            input_ids,
            max_new_tokens=128,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )[0]

   
    text = tokenizer.decode(output_ids, skip_special_tokens=True)
    parts = text.split(eos, 1)
    pred_response = parts[1].strip() if len(parts) > 1 else text
    predictions.append(pred_response)


    full = example["full_text"]
    ref = full.split(eos, 1)[1].strip()
    references.append(ref)



import evaluate
from bert_score import score as bert_score_fn

rouge = evaluate.load("rouge")
bleu  = evaluate.load("bleu")

def compute_metrics(preds, refs):
    # assert len(preds) == len(refs), "preds/ref length mismatch"
    
   
    rouge_res = rouge.compute(predictions=preds, references=refs)
    

    tokenized_preds = [p.split() for p in preds]
    tokenized_refs  = [[r.split()] for r in refs]
    bleu_res = bleu.compute(
        predictions=tokenized_preds,
        references=tokenized_refs
    )
    
    P, R, F1 = bert_score_fn(preds, refs, lang="en")
    bert_p, bert_r, bert_f1 = float(P.mean()), float(R.mean()), float(F1.mean())
    
    return {
        "rouge1":      round(rouge_res["rouge1"].mid.fmeasure * 100, 2),
        "rougeL":      round(rouge_res["rougeL"].mid.fmeasure * 100, 2),
        "bleu":        round(bleu_res["bleu"] * 100, 2),
        "bert_f1":     round(bert_f1, 2)
    }




metrics = compute_metrics(predictions, references)
print( metrics)


{'rouge1': 67.11, 'rougeL': 65.77, 'bleu': 50.31, 'bert_f1':0.69}


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==20

In [3]:
pip install google-generativeai


Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=f7f3510e0f3981424e61b6857dac38a948970631c0d35105544f68bb6a581cfb
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [19]:
from tqdm import tqdm
import time

gemini_preds = []
references   = []

for ex in test_ds:
    prompt = (
        "You are a meidcal assistant chatbot. Using the SOAP principles—\n"
        "  • Subjective: capture the patient's own history and complaints,\n"
        "  • Objective: summarize your exam and findings,\n"
        "  • Assessment: provide your diagnostic impressions,\n"
        "  • Plan: outline the care plan—\n"
        "write one coherent medical report that weaves all four elements together in a single narrative.\n\n"
        "Based on all the information provided, generate a comprehensive medical report with sections for History of Present Illness, "
        "Medications, and Allergies.\n\n "
        "Patient information:\n"
        + ex["input"]
        + eos
    )

    chat = client.chats.create(
        model="gemini-2.0-flash-lite",
        config=types.GenerateContentConfig(a
            temperature=0.0,
            max_output_tokens=256
        )
    )
    resp = chat.send_message(prompt)
    gemini_preds.append(resp.text.strip())


    full = ex["input"] + eos + ex["output"] + eos
    references.append(full.split(eos, 1)[1].strip())

    time.sleep(2.1)

metrics = compute_metrics(gemini_preds, references)
print( metrics)


{'rouge1': 53.31, 'rougeL':45.56 , 'bleu': 45.31, 'bert_f1': 0.73}


In [None]:
#adsasdasd