In [None]:
# !pip install trl==0.7.4
# !pip install datasets
# !pip install transformers==4.38.2
# !pip install peft==0.10.0
# !pip install accelerate==0.28.0
# test commit

In [3]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [5]:
import random

import numpy as np
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

#Configuration options
train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 1
eval_steps = 500
max_input_length = 550
save_steps = 1000
num_train_epochs = 20
random.seed(42)




  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

from sklearn.model_selection import train_test_split

## Creating the policy model for human Evaluation

In [7]:
df = pd.read_csv("combined_clinical_notes.csv")

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [8]:
for column in df.columns:
    print(column)  # Prints each column name

dataset
encounter_id
dialogue
note
source_file
id


In [9]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset

# 1) 4-bit quant config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2) Load base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)
base_model = prepare_model_for_kbit_training(base_model)

tokenizer = AutoTokenizer.from_pretrained("HPAI-BSC/Qwen2.5-Aloe-Beta-7B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 3) Apply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)
model = get_peft_model(base_model, lora_config)

# 4) Enable gradient checkpointing
# model.enable_input_require_grads()
# model.gradient_checkpointing_enable()
base_model.config.use_cache = False

# 5) Prepare data with smaller sequence length
# notechat = load_dataset("akemiH/NoteChat")
# trainB = notechat["train"].select(range(3000))
# evalB = notechat["train"].select(range(3000, 3500))

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.29s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length=4000):
        self.post_list = []
        dataset = train_df
        self.labels = []

        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["dialogue"])
            self.labels.append(sample[1]["note"])

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        conversation = self.post_list[idx]
        summary = self.labels[idx]
        # label = self.labels[idx]

        # encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        # input_ids = torch.tensor(encodings_dict["input_ids"])
        # attn_masks = torch.tensor(encodings_dict["attention_mask"])
        # return {
        #     "input_ids": input_ids,
        #     "attention_mask": attn_masks,
        #     "labels": labels_ids,
        # }

        txt = f"CONVERSATION:\n{conversation}\n\nSUMMARY: \n{summary}" #IMPORTANT!!!!!!!!!!

        encodings = self.tokenizer(
            txt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
         # Labels should be the same as input_ids for causal LM training
        # The model will automatically shift labels internally
        encodings["labels"] = encodings["input_ids"].clone()
        
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": encodings["labels"].squeeze(),
        }

In [17]:
# model.resize_token_embeddings(len(tokenizer))
# model.config.end_token_id = tokenizer.eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

# Disable caching (already done, but double-check)
model.config.use_cache = False

# Enable gradient checkpointing (already done, but confirm)
# model.gradient_checkpointing_enable()

In [19]:
# Set up the datasets
data_path = "NA"
train_dataset = TLDRDataset(
    data_path,
    tokenizer,
    "train",
    max_length=256,
)


In [21]:
for i in train_dataset:
    print(i["input_ids"], i["labels"])
    break

tensor([ 5790, 72226,  3495,   510,    58, 36983,    60,   773, 13580,  1536,
          374,   264,   220,    20,    21,   481,  3157,  6284,  8593,   879,
        18404,  3351, 41026,   315, 16829, 90647,   323,   702,   264,  3267,
         6457,  3840,   315, 19754,   323,  1550,  6543,  7262,   773, 13580,
         1536,  3291,   752,  1128,   594,  2087,   389,   448,   697, 90647,
         3491,   319,    58, 22722,    60,  1632,   432,   594,  1101,  1012,
         2494,   429,   600,   614, 13686,   916,   279,  1537,   498,  1414,
         3040,   311,  4236,  5555,   432,  4977,  1075,  2494,   374,  2677,
        15700,   304,   847, 27591,   323,   498,  1414,  1101,   600, 33390,
          728,   911,   847,  1899,   323,   432, 33390, 81354,   752,   498,
         1414,   264,  2632,  2699,  7025,   264,  2632, 25118,   323,   979,
          600,  1430,   311,  8180,   432,  1101,  4977,  1075,   600,   614,
          311,   600,   653,   308,   944,  1414,   979,   432, 

In [23]:
torch.cuda.set_device(0)

In [25]:
output_dir = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-logs"

In [27]:
# Prepare the trainer and start training
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
#     per_device_eval_batch_size=eval_batch_size,
    fp16=True,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    gradient_checkpointing=True,
)

In [29]:
training_args.device.index

0

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=default_data_collator,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
)
trainer.train()
# trainer.save_model(output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)


TrainOutput(global_step=4, training_loss=2.3734092712402344, metrics={'train_runtime': 111.3858, 'train_samples_per_second': 4.992, 'train_steps_per_second': 0.036, 'total_flos': 5564529698144256.0, 'train_loss': 2.3734092712402344, 'epoch': 1.88})

In [33]:
trainer.save_model(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune")   ##path to save policy model
tokenizer.save_pretrained(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune")
model.save_pretrained(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-peft-adapaters")  



In [11]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-logs")
model_path = "HPAI-BSC/Qwen2.5-Aloe-Beta-7B"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = train_df.iloc[2]["dialogue"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)
tokenized_text

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:14<00:00,  3.54s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[   58, 36983,    60, 15588,  1154, 23902,  1168,   659,  1246,   525,
           498,   937,   319,    58, 22722,    60,   600,  2776,  1661,   659,
          1246,   911,   498,   937,   319,    58, 36983,    60,   600,  2776,
          1661,   659,   773,   512,    12,   525,   498,  5527,   311,   633,
          3855,   937,   319,    58, 22722,    60,   600,  1079, 79141,    58,
         36983,    60, 16910,   659, 23902,  1168,   374,   264,   220,    20,
            21,  4666,  6284,  8593,  1588,   448, 34563, 10072, 14613,   659,
           773,  1154,   600,  3003,  6617,   498,  1033,   304,   279,  2714,
          1154, 23902,  1168,  1154,   323,   807,  1730,   429,   498,  1030,
           264,  3347, 17280, 93755, 79141,    58, 22722,    60, 75446, 79141,
            58, 36983,    60,  1033,   498,  3432,  1045,   294, 96645,   323,
          1045,   326,  1090, 11417,   291,  2090,   937,   319,    58, 22722,
            60,   600,   572,  1602,  

# Policy Model

In [None]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
##model path
# MODEL_PATH = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model"
MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"

In [25]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from trl import AutoModelForCausalLMWithValueHead
from transformers import BitsAndBytesConfig  

# ---- Device Setup ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ---- Paths ----
MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"
PEFT_ADAPTER_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-peft-adapaters"
REF_MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"

# ---- 1) 4-bit Quantization Configuration ----
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# ---- 2) Load Base Model in 4-bit ----
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)
# Prepare the model for k-bit training (this typically freezes most parameters except adapter ones)
base_model = prepare_model_for_kbit_training(base_model)
base_model.gradient_checkpointing_disable()  # Disable checkpointing

# ---- 3) Load Tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# ---- 4) Load the PEFT Adapter (LoRA) ----
# This reloads your fine-tuned adapter weights onto your base model.
model_with_lora = PeftModel.from_pretrained(base_model, PEFT_ADAPTER_PATH)

# ---- 5) Convert to PPO-Compatible ValueHead Model ----
# When converting, pass the peft_config from the adapter model to ensure proper initialization.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_with_lora,
    peft_config=lora_config
).to(device)

# ---- 6) Optionally, Load a Reference Model for KL (e.g., reward model) ----
ppo_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_with_lora,
).to(device)
ppo_model_ref.eval()  # Disable dropout/etc
for param in ppo_model_ref.parameters():
    param.requires_grad = False  # Freeze all



Using device: cuda


Loading checkpoint shards: 100%|███████████████████████████████████████████████| 4/4 [00:07<00:00,  1.90s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# starcoder_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) ## tokenizer of step 1 model., here since we are using same model for step 1 and 2 it doesnot matter
# starcoder_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tok = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
tok.pad_token = tok.eos_token
tok.padding_side = "left"
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [13]:
sentiment_pipe_kwargs = {"top_k": None, "function_to_apply": "none"}
project_kwargs={"logging_dir": r"D:\kshitij-weights-folder\qwen-aloe-9-4-rl-logs"}

config = PPOConfig(
    model_name=MODEL_PATH, ppo_epochs=1, project_kwargs=project_kwargs, gradient_accumulation_steps=2, steps=5, batch_size=2, mini_batch_size=1, learning_rate=2e-5, log_with='tensorboard')

In [None]:
import torch
import bitsandbytes as bnb
# optimizer = torch.optim.SGD(starcoder_model.parameters(), lr=config.learning_rate)

optimizer = bnb.optim.Adam8bit(ppo_model.parameters(), lr=config.learning_rate)

ppo_trainer = PPOTrainer(config, ppo_model,ppo_model_ref, tok, optimizer=optimizer)

In [None]:
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv("combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

loader = DataLoader(
    ClinDS(df.sample(200, random_state=0), tok),
    batch_size=1, shuffle=True, pin_memory=True, drop_last=True
)

In [39]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

In [None]:
gen_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tok.eos_token_id,
    "max_new_tokens": 32,
    "eos_token_id": -1,
}

In [61]:
MEDICAL_PROMPT = """
Please generate a medical summary based on the following clinical notes. The summary should include the following sections: 

CHIEF COMPLAINT
A concise statement of the patient's primary concern or reason for visiting the clinic.

HISTORY OF PRESENT ILLNESS
A detailed narrative about the patient's symptoms, their onset, duration, and any relevant medical history or previous treatments.

VITALS
Include any relevant vital signs (e.g., oxygen saturation, blood pressure) if available.

PHYSICAL EXAM 
Summarize the findings from the physical examination, including any notable abnormalities.

RESULTS 
Summarize the results of any diagnostic tests performed (e.g., lab work, imaging studies).

ASSESSMENT
The doctor's assessment of the patient's condition or diagnosis.

PLAN
The treatment plan, including prescribed medications, lifestyle recommendations, and follow-up instructions.

INSTRUCTIONS
Specific instructions for the patient regarding their treatment plan and follow-up care.

Important Note: If any section lacks relevant information, omit that section from the generated summary. Only include sections for which there is sufficient information available.
"""

In [None]:
import sys
sys.path.insert(0, r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")

from utils import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data, print_result=True)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32)  # CPU (B,4)

In [None]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

for epoch in range(1):
    for batch_idx, batch in enumerate(loader):
        # Prepare inputs
        ids = batch["input_ids"].to(DEVICE)
        attn_mask = batch["attention_mask"].to(DEVICE)
        src_txt = batch["src_txt"]  # list[str]
        ref_txt = batch["ref_txt"]  # list[str]

        # Generate multiple candidates per prompt
        NUM_CANDIDATES = 1
        all_outs = []
        for _ in range(NUM_CANDIDATES):
            with torch.no_grad():
                # original_notes = src_txt
            
                # # Combine with medical prompt only during generation
                # full_prompt = original_notes + "\n\n" + MEDICAL_PROMPT
                # full_prompt_tensor = tok.encode(full_prompt, return_tensors="pt").to(device).squeeze(0)
                
                out = ppo_model.generate(
                    input_ids=ids,
                    attention_mask=attn_mask,
                    **gen_kwargs
                )
            all_outs.append(out)

        # Stack outputs (B, K, L)
        outs = torch.stack(all_outs, dim=1)

        # Decode all candidates
        hyps = [
            [tok.decode(outs[b, k], skip_special_tokens=True)
            for k in range(NUM_CANDIDATES)]
            for b in range(outs.size(0))
        ]

        # Compute rewards using UniEval and dominance scoring
        rewards = []
        for b in range(len(src_txt)):
            # Get scores for all candidates (K, 4)
            scores = unieval_4way(
                [src_txt[b]] * NUM_CANDIDATES,
                hyps[b],
                [ref_txt[b]] * NUM_CANDIDATES
            ).numpy()

            # Compute dominance counts
            dom_counts = np.zeros(NUM_CANDIDATES)
            for i in range(NUM_CANDIDATES):
                for j in range(NUM_CANDIDATES):
                    if i == j:
                        continue
                    # Check if i dominates j
                    if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                        dom_counts[i] += 1

            # Normalize to [-1, 1]
            max_dom = NUM_CANDIDATES - 1
            scalar_rewards = 2 * (dom_counts / max_dom) - 1
            rewards.append(scalar_rewards)  

        # Flatten for PPO
        flat_queries = []
        flat_responses = []
        flat_rewards = []

        for b in range(len(src_txt)):
            for k in range(NUM_CANDIDATES):
                flat_queries.append(ids[b])
                flat_responses.append(outs[b, k])
                flat_rewards.append(torch.tensor([rewards[b][k]], device=DEVICE))

        # PPO step
        stats = ppo_trainer.step(
            queries=flat_queries,
            responses=flat_responses,
            scores=flat_rewards
        )

        # Logging
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}")
            print(f"Sample output: {hyps[0][0][:100]}...")
            print(f"Average reward: {np.mean([r.item() for r in flat_rewards]):.4f}")

    print(f"✅ Epoch {epoch+1}/3 complete")

print("🎉 PPO fine-tuning done")

  0%|                                                                                 | 0/46 [00:00<?, ?it/s]

check




Evaluating coherence of 2 samples !!!



100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.17it/s][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
 15%|███████████▏                                                             | 2/13 [00:00<00:00, 12.74it/s][A
 31%|██████████████████████▍                                                  | 4/13 [00:00<00:00, 13.25it/s][A
 46%|█████████████████████████████████▋                                       | 6/13 [00:00<00:00, 13.34it/s][A
 62%|████████████████████████████████████████████▉                            | 8/13 [00:00<00:00, 13.39it/s][A
 77%|███████████████████████████████████████████████████████▍                | 10/13 [00:00<00:00, 13.45it/s][A
100%|████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 13.56it/s][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
 15%|███████████▏                                                             | 2/13 [00:00<00:00, 13.37it/s][A
100%|████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 41.50it/s][A


Evaluating factual consistency of 2 samples !!!
Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
  8%|█████▌                                                                   | 1/13 [00:00<00:08,  1.38it/s][A
 15%|███████████▏                                                             | 2/13 [00:01<00:08,  1.37it/s][A
 23%|████████████████▊                                                        | 3/13 [00:02<00:07,  1.36it/s][A
 31%|██████████████████████▍                                                  | 4/13 [00:02<00:06,  1.38it/s][A
 38%|████████████████████████████                                             | 5/13 [00:03<00:05,  1.38it/s][A
 46%|█████████████████████████████████▋                                       | 6/13 [00:04<00:05,  1.39it/s][A
 54%|███████████████████████████████████████▎                                 | 7/13 [00:05<00:04,  1.39it/s][A
 62%|████████████████████████████████████████████▉                            | 8/13 [00:05<00:

check
Evaluating coherence of 2 samples !!!



100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 23.73it/s][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/19 [00:00<?, ?it/s][A
  5%|███▊                                                                     | 1/19 [00:00<00:15,  1.14it/s][A
 11%|███████▋                                                                 | 2/19 [00:01<00:14,  1.13it/s][A
 16%|███████████▌                                                             | 3/19 [00:02<00:14,  1.14it/s][A
 21%|███████████████▎                                                         | 4/19 [00:03<00:13,  1.14it/s][A
 26%|███████████████████▏                                                     | 5/19 [00:04<00:12,  1.14it/s][A
 32%|███████████████████████                                                  | 6/19 [00:05<00:11,  1.14it/s][A
 37%|██████████████████████████▉                                              | 7/19 [00:06<00:10,  1.14it/s][A
 42%|██████████████████████████████▋                                          | 8/19 [00:07<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/19 [00:00<?, ?it/s][A
  5%|███▊                                                                     | 1/19 [00:00<00:15,  1.14it/s][A
 11%|███████▋                                                                 | 2/19 [00:01<00:14,  1.14it/s][A
 32%|███████████████████████                                                  | 6/19 [00:01<00:02,  4.50it/s][A
 47%|██████████████████████████████████▌                                      | 9/19 [00:01<00:01,  7.15it/s][A
 68%|█████████████████████████████████████████████████▎                      | 13/19 [00:02<00:00, 11.45it/s][A
100%|████████████████████████████████████████████████████████████████████████| 19/19 [00:02<00:00,  8.38it/s][A


Evaluating factual consistency of 2 samples !!!
Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/19 [00:00<?, ?it/s][A
  5%|███▊                                                                     | 1/19 [00:01<00:25,  1.43s/it][A
 11%|███████▋                                                                 | 2/19 [00:09<01:33,  5.48s/it][A
 16%|███████████▌                                                             | 3/19 [00:11<00:57,  3.60s/it][A
 21%|███████████████▎                                                         | 4/19 [00:16<01:05,  4.39s/it][A
 26%|███████████████████▏                                                     | 5/19 [00:19<00:51,  3.66s/it][A
 32%|███████████████████████                                                  | 6/19 [00:21<00:43,  3.34s/it][A
 37%|██████████████████████████▉                                              | 7/19 [00:24<00:36,  3.04s/it][A
 42%|██████████████████████████████▋                                          | 8/19 [00:30<00:

check
Evaluating coherence of 2 samples !!!



100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.00it/s][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
  8%|█████▌                                                                   | 1/13 [00:00<00:10,  1.12it/s][A
 15%|███████████▏                                                             | 2/13 [00:01<00:09,  1.13it/s][A
 23%|████████████████▊                                                        | 3/13 [00:02<00:08,  1.13it/s][A
 31%|██████████████████████▍                                                  | 4/13 [00:03<00:07,  1.14it/s][A
 38%|████████████████████████████                                             | 5/13 [00:04<00:07,  1.14it/s][A
 46%|█████████████████████████████████▋                                       | 6/13 [00:05<00:06,  1.14it/s][A
 54%|███████████████████████████████████████▎                                 | 7/13 [00:06<00:05,  1.14it/s][A
 62%|████████████████████████████████████████████▉                            | 8/13 [00:07<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
  8%|█████▌                                                                   | 1/13 [00:00<00:10,  1.14it/s][A
 46%|█████████████████████████████████▋                                       | 6/13 [00:00<00:00,  7.87it/s][A
100%|████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  6.44it/s][A


Evaluating factual consistency of 2 samples !!!
Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/13 [00:00<?, ?it/s][A
  8%|█████▌                                                                   | 1/13 [00:01<00:17,  1.43s/it][A
 15%|███████████▏                                                             | 2/13 [00:09<01:00,  5.47s/it][A
 23%|████████████████▊                                                        | 3/13 [00:11<00:35,  3.60s/it][A
 31%|██████████████████████▍                                                  | 4/13 [00:16<00:39,  4.39s/it][A
 38%|████████████████████████████                                             | 5/13 [00:19<00:29,  3.66s/it][A
 46%|█████████████████████████████████▋                                       | 6/13 [00:21<00:23,  3.34s/it][A
 54%|███████████████████████████████████████▎                                 | 7/13 [00:24<00:18,  3.04s/it][A
 62%|████████████████████████████████████████████▉                            | 8/13 [00:30<00:

In [None]:
###saving the model
# starcoder_model.save_pretrained("rhlfmodel/")
# starcoder_tokenizer.save_pretrained("rhlfmodel/")

ppo_trainer.model.pretrained_model.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-12-4-ppo-tuned")
tok.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-12-4-ppo-tuned")

if isinstance(ppo_trainer.model.pretrained_model, PeftModel):
    ppo_trainer.model.pretrained_model.save_adapter(
        "D:/kshitij-weights-folder/qwen-aloe-rl-12-4-ppo-tuned-lora",
        "lora_adapter"
    )

### Testing

In [1]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [3]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from transformers import BitsAndBytesConfig  
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
DATA = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"
df = pd.read_csv(DATA)

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
train_df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
246,aci,,[doctor] so tyler is a 56 -year-old male who p...,SUBJECTIVE\r\n\r\nDifficulty swallowing. Tyler...,src_experiment_data\test1_aci_asrcorr.csv,ACI084


In [7]:
# -- Path to your LoRA weights + tokenizer --
model_dir = "D:\kshitij-weights-folder\qwen-aloe-rl-9-4-ppo-tuned"  

# -- 4-bit quantization config (same as training) --
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# -- 1) Load the *base* Qwen2.5 model in 4-bit --
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)

# model = AutoModelForCausalLM.from_pretrained(base_model_name)
# model = PeftModel.from_pretrained(model, adapter_model_name)

# -- 2) Load your fine-tuned LoRA adapters into the base model --
# The directory should contain adapter_model.bin, adapter_config.json, etc.
lora_model = PeftModel.from_pretrained(
    base_model, 
    model_dir,
    device_map="auto",
)

# -- 3) Load the tokenizer you saved to ./aloe_qwen --
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.85s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def return_prompt(conv):
    prompt = f"""
    CONVERSATION:
    {conv}
    
    SUMMARY:
    Please generate a medical summary based on the following clinical notes. The summary should include the following sections: Chief Complaint, History of Present Illness, Vitals, Physical Exam, Results, Assessment, Plan, and Instructions.
    Please format the response as plain text, without using markdown or special formatting, and with clear headings for each section, like this:
    
    
    CHIEF COMPLAINT
    A concise statement of the patient's primary concern or reason for visiting the clinic.
    
    HISTORY OF PRESENT ILLNESS
    A detailed narrative about the patient's symptoms, their onset, duration, and any relevant medical history or previous treatments.
    
    VITALS
    Include any relevant vital signs (e.g., oxygen saturation, blood pressure) if available.
    
    PHYSICAL EXAM 
    Summarize the findings from the physical examination, including any notable abnormalities.
    
    RESULTS 
    Summarize the results of any diagnostic tests performed (e.g., lab work, imaging studies).
    
    ASSESSMENT
    The doctor's assessment of the patient's condition or diagnosis.
    
    PLAN
    The treatment plan, including prescribed medications, lifestyle recommendations, and follow-up instructions.
    
    INSTRUCTIONS
    Specific instructions for the patient regarding their treatment plan and follow-up care.
    
    Important Note: If any section lacks relevant information or if specific details are not provided (e.g., vitals are not mentioned, no abnormal findings in the physical exam), omit that section from the generated summary. Only include sections for which there is sufficient information available.
    """

    return prompt

In [21]:
eval_df = test_df

eval_df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
172,aci,D2N053,[doctor] so barbara i i know you are here for ...,CHIEF COMPLAINT\r\n\r\nItchy scalp pain.\r\n\r...,challenge_data\train.csv,


In [25]:
from transformers import pipeline
summarizer = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    device_map="auto",  # Automatically place on GPU if available
)

# text_gen_pipeline = pipeline(
#     "text-generation",
#     model=lora_model,
#     tokenizer=tokenizer,
#     device_map="auto",   # place on GPU if available
# )

# Now generate text using the pipeline
# response = text_gen_pipeline(
#     prompt,
#     max_new_tokens=900,
#     do_sample=False,  # Deterministic for demonstration
# )

# notechat = load_dataset("akemiH/NoteChat")
# eval_df = notechat["train"].select(range(5000, 7001)).to_pandas()
eval_df = test_df[:20]

# 2) Clean up missing data if present
# if eval_df.isnull().values.any():
#     print("Found missing values in the evaluation set. Dropping them.")
#     eval_df = eval_df.dropna()


# 3) Prepare batching parameters
batch_size = 4
num_samples = len(eval_df)
num_batches = (num_samples // batch_size) + int(num_samples % batch_size != 0)

# Lists to store predictions and references
predictions = []
references = []

# 4) Batching loop to generate summaries
for i in tqdm(range(num_batches), desc="Generating Summaries"):
    start = i * batch_size
    end = min(start + batch_size, num_samples)
    
    # Extract conversation and reference summary columns
    batch_conversations = eval_df["dialogue"][start:end].tolist()
    batch_refs = eval_df["note"][start:end].tolist()  # "data" column for references

    # Prepare prompts replicating training format
    prompts = [
        return_prompt(conv)
        for conv in batch_conversations
    ]
    
    # Generate summaries
    results = summarizer(
        prompts,
        max_new_tokens=900,
        do_sample=False,
        # truncation=True,
        # num_return_sequences=1  # Default is 1
    )
    
    # Parse results and extract summaries
    for item in results:
        # 'item' is a list with 1 dict => {"generated_text": "..."}
        output_dict = item[0]
        generated_text = output_dict["generated_text"]
        
        # Extract only the part after "SUMMARY:"
        # if "SUMMARY:" in full_text:
        #     extracted_summary = generated_text.split("SUMMARY:", 1)[-1].strip()
        # else:
        #     extracted_summary = generated_text  # Fallback if marker not found

        # first_occurrence = generated_text.lower().find("chief complaint")
        # second_occurrence = generated_text.lower().find("chief complaint", first_occurrence + 1)
        # third_occurrence = generated_text.lower().find("chief complaint", second_occurrence + 1)
        # fourth_occurrence = generated_text.lower().find("chief complaint", third_occurrence + 1)
        
        # if fourth_occurrence != -1:
        #     # Extract everything from the second occurrence of "CHIEF COMPLAINT" onward
        #     extracted_summary = generated_text[fourth_occurrence:].strip()
        # else:
        #     # If the second "CHIEF COMPLAINT" is not found, just use the original text
        #     extracted_summary = generated_text.strip()
        
        predictions.append(generated_text)
    
    # Append the reference texts
    references.extend(batch_refs)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

In [27]:
src_list = eval_df["dialogue"].tolist()
ref_list = eval_df["note"].tolist()

In [31]:
output_list = []
for pred in predictions:
    # Ensure that "Summary:" exists in the string to avoid errors
    if len(pred) > 0:
        output_list.append(pred)
    else:
        # Handle cases where "Summary:" is missing (optional)
        output_list.append("")  # Or handle differently based on your needs

In [37]:
data = convert_to_json(
    src_list=src_list,
    ref_list=ref_list,
    output_list=output_list
)

filtered_data = [
    entry for entry in data
    if entry["system_output"].strip()  # Ensure non-empty system_output
]



In [51]:
with open("nlg_evaluation_data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

In [None]:
from transformers import pipeline, set_seed
model_path = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"
set_seed(42)
pipe = pipeline("text-generation",model=model_path, tokenizer=model_path, max_length=40, num_return_sequences=1)

In [None]:
text = dataset["rejected"][0]
print(text)
pipe(text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic


[{'generated_text': 'TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic'}]

In [None]:
save_directory = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"

# Load the model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [None]:
conversation = '''
Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Patient: Yes, that's right. It took about 30 minutes to change to the prone position.

Doctor: And I see that this approach increased your oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%.

Patient: Yes, that's correct.

Doctor: Good. We also had to adapt your breathing exercises to avoid prolonged coughing and oxygen desaturation. Can you tell me more about that?

Patient: Yes, I was instructed to stop every deep breath before coughing and to hold my breath for better air distribution.

Doctor: I see that you performed the breathing exercises well and managed to increase your oxygen saturation.

Patient: Yes, I did my best.

Doctor: You also had difficulty maintaining sufficient oxygen saturation during physical activity, is that correct?

Patient: Yes, I did. But with close monitoring and frequent breaks, I was able to perform low-level strength and walking exercises without any significant deoxygenation.

Doctor: I see that your exercise progression was low on days 1 to 5, but then increased daily until your hospital discharge to a rehabilitation clinic on day 10.

Patient: Yes, that's correct.

Doctor: Great. I'd like to keep monitoring your progress and see how you're doing. Can you keep me updated on any changes in your symptoms?

Patient: Yes, of course, doctor.

Doctor: Alright, let's keep in touch. If you have any questions or concerns, don't hesitate to reach out to me.

Patient: Thank you, doctor.
'''

In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=1000, temperature=0.1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            temperature=temperature,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id
        )


    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me\n\n" + conversation
response = generate_response(prompt, model, tokenizer)
print("Generated Response:\n", response)


Generated Response:
 Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me


Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Pat

In [None]:
model = AutoModelForCausalLM.from_pretrained("rlhfmodel/")
model_path = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = df.iloc[2]["prompt"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)

In [47]:
data = [
    {
        'source': "Doctor: Hello, how are you feeling today?\nPatient: I've been feeling a bit tired and dizzy.\nDoctor: How long has this been happening?\nPatient: For about a week now. I also have trouble sleeping.\nDoctor: I see. Have you been under a lot of stress lately?\nPatient: Yes, work has been quite stressful.\nDoctor: That could be contributing. Let’s do some tests to rule out other issues.",
        'system_output': "Patient reports tiredness, dizziness, and difficulty sleeping for a week. Work-related stress may be a factor. Doctor will conduct tests to check for other problems."
    },
    {
        'source': "Doctor: What brings you in today?\nPatient: I’ve been having some chest pain and shortness of breath.\nDoctor: How severe is the pain?\nPatient: It’s sharp, and it comes and goes.\nDoctor: When did it start?\nPatient: It started two days ago.\nDoctor: Any history of heart problems?\nPatient: Yes, my father had heart disease.\nDoctor: We’ll need to do an ECG and some blood tests to check your heart health.",
        'system_output': "Patient has sharp chest pain and shortness of breath for two days. Family history of heart disease. Doctor will perform an ECG and blood tests to assess heart health."
    },
    {
        'source': "Doctor: How are you feeling today?\nPatient: I’ve had a sore throat and a cough for the past few days.\nDoctor: Any fever or difficulty swallowing?\nPatient: Yes, I’ve had a low fever, but swallowing is fine.\nDoctor: Any history of allergies or similar symptoms?\nPatient: Not really.\nDoctor: It could be a viral infection. I recommend rest, fluids, and maybe some over-the-counter medicine.",
        'system_output': "Patient reports sore throat, cough, and a low fever. Doctor advises rest, fluids, and over-the-counter medication as the symptoms suggest a viral infection."
    },
    {
        'source': "Doctor: What’s bothering you today?\nPatient: I’ve been experiencing frequent headaches and some nausea.\nDoctor: How often do you get the headaches?\nPatient: It’s been almost every day for the past week.\nDoctor: Any other symptoms like blurred vision or dizziness?\nPatient: No, just the headache and nausea.\nDoctor: We’ll schedule an MRI to get a better understanding of the issue.",
        'system_output': "Patient complains of daily headaches and nausea for the past week. No blurred vision or dizziness. Doctor will schedule an MRI for further evaluation."
    }
]
score = evaluate(data, print_result=True)
print(score)

weights = np.array([1, 2, 3]) #'coherence', 'consistency', 'fluency'
weighted_score = []

for array1 in score:
    result = np.where(
        array1 < 0.5,          # Condition
        -array1 * weights,     # If True: make product negative
        array1 * weights       # If False: normal multiplication
    )
    
    sum_products = np.sum(result)
    final_result = sum_products/3
    
    weighted_score.append(final_result)

        
print(weighted_score)

scores = [torch.tensor([x], dtype=torch.float64) for x in weighted_score]
print(scores)  # Output: torch.float64

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Evaluating coherence of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.67it/s]


Evaluating consistency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.26it/s]


Evaluating fluency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.79it/s]



Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.449252 |
| consistency | 0.64936  |
|   fluency   | 0.385504 |
|   overall   | 0.494705 |
+-------------+----------+
[[0.76403181 0.84311455 0.48589937]
 [0.45844862 0.71566303 0.44122299]
 [0.55973144 0.72223429 0.35135146]
 [0.0147952  0.31642992 0.26354302]]
[np.float64(0.3308542634136033), np.float64(-0.11693050848950663), np.float64(0.3167152151413462), np.float64(-0.4794280308898093)]
[tensor([0.3309], dtype=torch.float64), tensor([-0.1169], dtype=torch.float64), tensor([0.3167], dtype=torch.float64), tensor([-0.4794], dtype=torch.float64)]


In [238]:
sample_data = []

for q,r in zip(game_data["query"], game_data["response"]):
    temp = {}

    temp["source"] = q
    temp["system_output"] = r

    sample_data.append(temp)

    break

print(sample_data)

[{'source': '[positive] [doctor] karen is a 34 -year-old female with a history of chronic migraines and hypertension who is here today with abdominal pain so hi', 'system_output': " dr. Ochsner, nice to meet you.\n\nDr. Ochsner: good to meet you as well. It's great to see you back in"}]


In [264]:
score = evaluate(sample_data, print_result=True, overall=False)

Evaluating coherence of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.30it/s]


Evaluating consistency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.85it/s]


Evaluating fluency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.64it/s]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.702612 |
| consistency | 0.684789 |
|   fluency   | 0.566372 |
+-------------+----------+



