In [None]:
# !pip install trl==0.7.4
# !pip install datasets
# !pip install transformers==4.38.2
# !pip install peft==0.10.0
# !pip install accelerate==0.28.0

In [1]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [3]:
import random

import numpy as np
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

#Configuration options
train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 1
eval_steps = 500
max_input_length = 550
save_steps = 1000
num_train_epochs = 20
random.seed(42)




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

from sklearn.model_selection import train_test_split

## Creating the policy model for human Evaluation

In [7]:
df = pd.read_csv("combined_clinical_notes.csv")

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [9]:
for column in df.columns:
    print(column)  # Prints each column name

dataset
encounter_id
dialogue
note
source_file
id


In [60]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset

# 1) 4-bit quant config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2) Load base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)
base_model = prepare_model_for_kbit_training(base_model)

tokenizer = AutoTokenizer.from_pretrained("HPAI-BSC/Qwen2.5-Aloe-Beta-7B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 3) Apply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)
model = get_peft_model(base_model, lora_config)

# 4) Enable gradient checkpointing
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
base_model.config.use_cache = False

# 5) Prepare data with smaller sequence length
# notechat = load_dataset("akemiH/NoteChat")
# trainB = notechat["train"].select(range(3000))
# evalB = notechat["train"].select(range(3000, 3500))

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.23s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length=256):
        self.post_list = []
        dataset = train_df
        self.labels = []

        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["dialogue"])
            self.labels.append(sample[1]["note"])

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        txt = self.post_list[idx]
        # label = self.labels[idx]

        # encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        # encodings_dict_label = self.tokenizer(label,truncation=True, max_length=self.max_length, padding="max_length")
        # input_ids = torch.tensor(encodings_dict["input_ids"])
        # attn_masks = torch.tensor(encodings_dict["attention_mask"])
        # labels_ids = torch.tensor(encodings_dict_label["input_ids"])
        # return {
        #     "input_ids": input_ids,
        #     "attention_mask": attn_masks,
        #     "labels": labels_ids,
        # }

        # text = f"CONVERSATION:\n{conversation}\n\nSUMMARY: \n{summary}" IMPORTANT!!!!!!!!!!

        encodings = self.tokenizer(
            txt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encodings["input_ids"].squeeze()
        attention_mask = encodings["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids.clone(),
        }



In [12]:
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

# Disable caching (already done, but double-check)
model.config.use_cache = False

# Enable gradient checkpointing (already done, but confirm)
model.gradient_checkpointing_enable()

In [68]:
# Set up the datasets
data_path = "NA"
train_dataset = TLDRDataset(
    data_path,
    tokenizer,
    "train",
    max_length=256,
)


In [70]:
for i in train_dataset:
    print(i["input_ids"], i["labels"])
    break

tensor([   58, 36983,    60,   773, 13580,  1536,   374,   264,   220,    20,
           21,   481,  3157,  6284,  8593,   879, 18404,  3351, 41026,   315,
        16829, 90647,   323,   702,   264,  3267,  6457,  3840,   315, 19754,
          323,  1550,  6543,  7262,   773, 13580,  1536,  3291,   752,  1128,
          594,  2087,   389,   448,   697, 90647,  3491,   319,    58, 22722,
           60,  1632,   432,   594,  1101,  1012,  2494,   429,   600,   614,
        13686,   916,   279,  1537,   498,  1414,  3040,   311,  4236,  5555,
          432,  4977,  1075,  2494,   374,  2677, 15700,   304,   847, 27591,
          323,   498,  1414,  1101,   600, 33390,   728,   911,   847,  1899,
          323,   432, 33390, 81354,   752,   498,  1414,   264,  2632,  2699,
         7025,   264,  2632, 25118,   323,   979,   600,  1430,   311,  8180,
          432,  1101,  4977,  1075,   600,   614,   311,   600,   653,   308,
          944,  1414,   979,   432,   594,  2494,  6437,   600, 

In [50]:
torch.cuda.set_device(0)

In [20]:
output_dir = "./qwen-aloe-rl-main-4-4"

In [72]:
# Prepare the trainer and start training
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
#     per_device_eval_batch_size=eval_batch_size,
    fp16=True,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    gradient_checkpointing=True,
)

In [54]:
training_args.device.index

0

In [74]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=default_data_collator,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
)
trainer.train()
# trainer.save_model(output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)


Step,Training Loss


  return fn(*args, **kwargs)


TrainOutput(global_step=4, training_loss=2.320481061935425, metrics={'train_runtime': 109.1404, 'train_samples_per_second': 5.094, 'train_steps_per_second': 0.037, 'total_flos': 5564529698144256.0, 'train_loss': 2.320481061935425, 'epoch': 1.88})

In [None]:
trainer.save_model("qwen-aloe-rl-main-4-4/")   ##path to save policy model
tokenizer.save_pretrained("./qwen-aloe-rl-main-4-4/")
# model.save_pretrained("my_lora_adapter")  



In [11]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("qwen-aloe-rl-main-4-4/")
model_path = "HPAI-BSC/Qwen2.5-Aloe-Beta-7B"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = train_df.iloc[2]["dialogue"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)
tokenized_text

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:14<00:00,  3.54s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([[   58, 36983,    60, 15588,  1154, 23902,  1168,   659,  1246,   525,
           498,   937,   319,    58, 22722,    60,   600,  2776,  1661,   659,
          1246,   911,   498,   937,   319,    58, 36983,    60,   600,  2776,
          1661,   659,   773,   512,    12,   525,   498,  5527,   311,   633,
          3855,   937,   319,    58, 22722,    60,   600,  1079, 79141,    58,
         36983,    60, 16910,   659, 23902,  1168,   374,   264,   220,    20,
            21,  4666,  6284,  8593,  1588,   448, 34563, 10072, 14613,   659,
           773,  1154,   600,  3003,  6617,   498,  1033,   304,   279,  2714,
          1154, 23902,  1168,  1154,   323,   807,  1730,   429,   498,  1030,
           264,  3347, 17280, 93755, 79141,    58, 22722,    60, 75446, 79141,
            58, 36983,    60,  1033,   498,  3432,  1045,   294, 96645,   323,
          1045,   326,  1090, 11417,   291,  2090,   937,   319,    58, 22722,
            60,   600,   572,  1602,  

# Policy Model

In [1]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [3]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
DATA = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"
df = pd.read_csv(DATA)

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
train_df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
246,aci,,[doctor] so tyler is a 56 -year-old male who p...,SUBJECTIVE\r\n\r\nDifficulty swallowing. Tyler...,src_experiment_data\test1_aci_asrcorr.csv,ACI084


In [5]:
# Print column names
print("\nColumns in the combined DataFrame:")
print(df.columns.tolist())


Columns in the combined DataFrame:
['dataset', 'encounter_id', 'dialogue', 'note', 'source_file', 'id']


In [6]:
##model path
# MODEL_PATH = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model"
MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-rl-main-4-4"

In [7]:
# df = pd.read_parquet(DATA_PATH)
df = pd.read_csv(DATA)
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['dataset', 'encounter_id', 'dialogue', 'note', 'source_file', 'id'],
    num_rows: 464
})

In [None]:
sentiment_pipe_kwargs = {"top_k": None, "function_to_apply": "none"}
# project_kwargs={"logging_dir": PATH_TO_LOGS}

config = PPOConfig(
    model_name=MODEL_PATH, steps=51200, learning_rate=1.41e-5, remove_unused_columns=True, log_with='tensorboard', **project_kwargs
)

In [15]:
dataset = dataset.rename_columns({"dialogue": "review"})
# dataset = dataset.filter(lambda x: len(x["review"]) > 500, batched=False)
# dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)

In [17]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
txt_in_len = 5
txt_out_len = 32
seed = 1


dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.encode(x["review"], return_tensors="pt", truncation=True, padding="max_length", max_length=2693)[0]},
    batched=False,
)
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"], skip_special_tokens=True)}, batched=False)

from datasets import Dataset

# dataset = Dataset.from_dict(dataset)
dataset.set_format("pytorch")

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:02<00:00, 214.41 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:08<00:00, 51.82 examples/s]


In [97]:
def collator(data):
    # Create base batch with all existing fields
    
    batch = {
        key: [d[key] for d in data] 
        for key in data[0]  # Gets all keys from first item
    }
    
    # Add 'note' field (replace None with your default value if needed)
    # batch['note'] = [d.get('note', None) for d in data]
    
    return batch

In [121]:
def custom_collate_fn(batch):
    return {
        "input_ids": torch.stack([item["input_ids"] for item in batch]),
        "query": [item["query"] for item in batch],
    }

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# starcoder_model = AutoModelForCausalLMWithValueHead.from_pretrained("/content/drive/MyDrive/Medical Dialogue Summarization using PPO/summarization_policy_new")  ##policy model from step 1
starcoder_model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH).to(device)

# IMPORTANT!!!!
# model = AutoModelForCausalLMWithValueHead.from_pretrained(peft_model, peft_config=lora_config)

# starcoder_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(rf_model_path).to(device) ## reward model from step 2


starcoder_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) ## tokenizer of step 1 model., here since we are using same model for step 1 and 2 it doesnot matter
starcoder_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

cuda


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.64s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [107]:
dataset.column_names

['dataset',
 'encounter_id',
 'review',
 'note',
 'source_file',
 'id',
 'input_ids',
 'query']

In [None]:
import torch
# optimizer = torch.optim.SGD(starcoder_model.parameters(), lr=config.learning_rate)

# 2. Create optimizer
# optimizer = bnb.optim.Adam8bit(model.parameters(), lr=config.learning_rate)

ppo_trainer = PPOTrainer(config, starcoder_model, starcoder_model, starcoder_tokenizer, dataset=dataset, data_collator=custom_collate_fn, optimizer=optimizer)

In [29]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

In [125]:
for batch in tqdm(ppo_trainer.dataloader):
    print(type(batch))

    break

  0%|                                                                                                                                                                                     | 0/1 [00:00<?, ?it/s]

<class 'dict'>





In [127]:
batch.keys()

dict_keys(['input_ids', 'query'])

In [129]:
import pandas as pd
from datasets import Dataset
import torch

# Collect all batches from the dataloader
all_data = []
for batch in ppo_trainer.dataloader:
    # Convert tensors to text
    texts = [ppo_trainer.tokenizer.decode(input_ids, skip_special_tokens=True) 
             for input_ids in batch["input_ids"]]
    
    # Convert tensors to lists properly
    input_ids_list = [tensor.cpu().numpy().tolist() for tensor in batch["input_ids"]]
    # attention_mask_list = [tensor.cpu().numpy().tolist() for tensor in batch["attention_mask"]]
    
    # Store batch data
    batch_data = {
        "text": texts,
        "input_ids": input_ids_list,
        # "attention_mask": attention_mask_list,
    }
    all_data.extend(batch_data["text"])  # Only extend texts if you want flat structure

# Save as JSON (now works)
df = pd.DataFrame(all_data)
# df.to_json("ppo_data.json", orient="records")
df.to_csv("ppo_data.csv", index=False)  # CSV format

In [None]:
for i in ppo_trainer.dataloader:
  print(i)
  break

In [35]:
ctrl_str = ["[negative]", "[positive]"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # this should be handled by accelerate
ctrl_tokens = dict((s, starcoder_tokenizer.encode(s, return_tensors="pt").squeeze().to(device)) for s in ctrl_str)


In [131]:
#NOT NECESSARY RIGHT NOW

def pos_logit_to_reward(logit, task):
    """
    Take the positive sentiment logit and scale it for the task.
        task [negative]: reward = -logit
        task [neutral]: reward = -2*abs(logit)+4
        task [positive]: reward = logit
    """
    for i in range(len(logit)):
        if task[i] == "[negative]":
            pass
        elif task[i] == "[positive]":
            pass
        else:
            raise ValueError("task has to be in [0, 1, 2]!")
    return logit

In [39]:
pos_logit_to_reward(torch.Tensor([4, 4]), ctrl_str)

tensor([4., 4.])

In [133]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": starcoder_tokenizer.eos_token_id,
    "max_new_tokens": 32,
    "eos_token_id": -1,
}

In [143]:
from prettytable import PrettyTable

def convert_to_json(output_list, src_list=None, ref_list=None, context_list=None, \
            scores=None, doc_id=None, system_id=None):
    """
        Convert the data into the json format.

        output_list: a list of model output
        src_list: source input for different NLG tasks. For example, source document for summarization
                  and dialogue history for dialogue response generation
        ref_list: human-annotated groundtruth
        context_list: the context needed to evaluate several specific dimension. For example,
                      additional factual information when evaluating engagingness and groundedness in dialogues
        scores: human scores for evaluating the model output. They can be used to calculate the correlation
                between evaluators and human judgements. The scores should be stored in a dictionary. For example,
                {'fluency': 2.0, 'coherence': 3.0} could be the human score for a sample.
        doc_id: the index of the input source. It can be used to calculate summary-level correlation for summarzation
        system_id: the index of the generation system. It can be used to calculate system-level correlation.
    """
    json_data = []
    for i in range(len(output_list)):
        cur = {}
        cur['system_output'] = output_list[i]
        if src_list is not None:
            cur['source'] = src_list[i]
        if ref_list is not None:
            cur['reference'] = ref_list[i]
        if context_list is not None:
            cur['context'] = context_list[i]
        if scores is not None:
            cur['scores'] = scores[i]
        if doc_id is not None:
            cur['doc_id'] = doc_id[i]
        if system_id is not None:
            cur['system_id'] = system_id[i]
        json_data.append(cur)
    return json_data


def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
        Add questions to generate input in Bool-QA format for UniEval.

        dimension: specific dimension to be evaluated
        src: source input for different NLG tasks. For example, source document for summarization
             and dialogue history for dialogue response generation.
        output: output text generated by the models
        ref: human-annotataed groundtruth
        context: the context needed to evaluate several specific dimension. For example,
                 additional factual information when evaluating engagingness and groundedness in dialogues.
    """

    input_with_question = []
    for i in range(len(output)):
        # For summarization
        if task == 'summarization':
            if dimension == 'fluency':
                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'relevance':
                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For dialogues
        elif task == 'dialogue':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: '\
                            + output[i] + ' </s> dialogue history: ' + src[i]
            elif dimension == 'engagingness':
                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: '\
                            + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
            elif dimension == 'groundedness':
                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: '\
                            + output[i] + ' </s> fact: ' + context[i]
            elif dimension == 'understandability':
                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For data-to-text
        elif task == 'data2text':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
            elif dimension == 'informativeness':
                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: '\
                            + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For factual consistency detection
        elif task == 'fact':
            if dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[i] + ' </s> document: ' + src[i]
            else:
                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
        # For new customized tasks
        else:
            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
        input_with_question.append(cur_input)
    return input_with_question


def print_scores(scores):
    table = PrettyTable(['Dimensions','Score'])
    print('\nEvaluation scores are shown below:')
    dims = list(scores[0].keys())
    for dim in dims:
        cur_score = 0
        for i in range(len(scores)):
            cur_score += scores[i][dim]
        table.add_row([dim, round(cur_score / len(scores), 6)])
    print(table)

In [145]:
import numpy as np
from nltk import sent_tokenize
from scorer import UniEvaluator  # Make sure this import works after placing scorer.py in the same directory

def evaluate(data, dims=None, overall=True, print_result=False, model_name_or_path="t5-small", task='summarization', device='cuda:0', individual=True):
    """
    Get the scores of all the given dimensions (fluency, consistency, coherence, relevance)

    data: A list of dictionaries, where each dictionary contains:
          - 'source': The original text
          - 'system_output': The generated system output (summary)
          - 'reference' (optional): Reference summary for relevance evaluation

    dims: A list of dimensions to be evaluated. If dims is None, it evaluates four default dimensions:
          coherence, consistency, fluency, relevance.

    overall: Boolean to indicate whether the overall score is calculated as the average of all dimensions.

    print_result: Boolean to print the results on the screen.

    model_name_or_path: The model name or path to use for evaluation, e.g., 't5-small'

    task: The task type (used in scoring if needed, like summarization or other NLP tasks).

    device: The device to use for evaluation ('cpu' or 'cuda:0').
    """

    # Instantiate the scorer
    scorer = UniEvaluator(model_name_or_path=model_name_or_path, device=device)

    n_data = len(data)
    eval_scores = [{} for _ in range(n_data)]

    # Default dimensions if not provided
    if dims is None:
        dims = ['coherence', 'consistency', 'fluency']   #add relevance

    for dim in dims:
        print(f'Evaluating {dim} of {n_data} samples !!!')

        if dim == 'consistency' or dim == 'fluency':
            # Sentence-level scores for consistency and fluency
            src_list, output_list = [], []
            n_sents = []  # number of sentences in each summary

            for i in range(n_data):
                if dim == 'consistency':
                    source = data[i]['source']
                else:
                    source = ''
                system_outputs = sent_tokenize(data[i]['system_output'])
                n_sents.append(len(system_outputs))
                for j in range(len(system_outputs)):
                    src_list.append(source)
                    output_list.append(system_outputs[j])

            input_list = add_question(dimension=dim, output=output_list, src=src_list, task=task)
            sent_score = scorer.score(input_list)

            # Calculate average sentence-level scores for each sample
            start_idx = 0
            score = []
            for cur_n_sent in n_sents:
                score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
                start_idx += cur_n_sent

        elif dim == 'coherence' or dim == 'relevance':
            # Summary-level scores for coherence and relevance
            src_list, output_list, ref_list = [], [], []

            for i in range(n_data):
                src_list.append(data[i]['source'])
                output_list.append(data[i]['system_output'])
                if dim == 'relevance':
                    ref_list.append(data[i]['reference'])

            input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=task)
            score = scorer.score(input_list)

        else:
            raise NotImplementedError(f"The input format for the dimension '{dim}' is still undefined. Please customize it.")

        # Store the scores for the current dimension
        for i in range(n_data):
            eval_scores[i][dim] = score[i]

    # Calculate overall score (average of all evaluated dimensions)
    if overall:
        for i in range(n_data):
            eval_scores[i]['overall'] = np.mean([eval_scores[i][dim] for dim in dims])

    # Print the result if requested
    if print_result:
        print_scores(eval_scores)

    if individual:
        individual_scores = []
        for i in range(n_data):
            temp = [eval_scores[i][dim] for dim in dims]
            individual_scores.append(temp)

        return np.array(individual_scores)

    # Calculate average score across all the dimensions except 'overall'
    avg_score = []
    for i in range(n_data):
        # Exclude 'overall' from the averaging
        dimensions = [dim for dim in dims if dim != 'overall']
        avg_score.append(np.mean([eval_scores[i][dim] for dim in dimensions]))

    return avg_score


In [None]:
def get_score(game_data):
    # for i in responses:
    #     instructions = tokenizer.encode_plus(
    #                                        i,
    #                                        padding="max_length",
    #                                        max_length=32,
    #                                        return_tensors="pt")

    #     instructions = {key: value.to(device) for key, value in instructions.items()}

    #     with torch.no_grad():
    #         outputs = model(**instructions)

    #     logits = outputs[0].mean()
    #     positive_logist.append(logits)
    
    weights = np.array([0.1, 0.2, 0.3]) #'coherence', 'consistency', 'fluency'

    sample_data = []

    for q,r in zip(game_data["query"], game_data["response"]):
        temp = {}
    
        temp["source"] = q
        temp["system_output"] = r
    
        sample_data.append(temp)
    
    score = evaluate(sample_data, overall=False)

    weighted_score = []

    for array1 in score:
        result = np.where(
            array1 < 0.5,          # Condition
            -array1 * weights,     # If True: make product negative
            array1 * weights       # If False: normal multiplication
        )
        
        sum_products = np.sum(result)
        final_result = sum_products/3
        
        weighted_score.append(final_result)

    scores = [torch.tensor([x], dtype=torch.float64) for x in weighted_score]

    return scores

In [137]:
# Check if models are on GPU
print(next(starcoder_model.parameters()).device)  # Should print: cuda:0
# print(next(starcoder_model_ref.parameters()).device)  # Should print: cuda:0

cuda:0


In [None]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

for epoch in range(1):
    for batch in tqdm(ppo_trainer.dataloader):
        (logs, game_data,) = (
            dict(),
            dict(),
        )

        # task_list = choices(ctrl_str, k=config.batch_size)
        # game_data["query"] = [t + q for t, q in zip(task_list, batch["query"])]
        game_data["query"] = [q for q in batch["query"]]
        # query_tensors = [torch.cat((ctrl_tokens[t], input_ids)) for t, input_ids in zip(task_list, batch["input_ids"])]
        query_tensors = [input_ids for input_ids in batch["input_ids"]]
        
        response_tensors = []
        for query in query_tensors:
            response = ppo_trainer.generate(query, **generation_kwargs)
            response_tensors.append(response.squeeze())
#         print(response_tensors)
        game_data["response"] = [starcoder_tokenizer.decode(r) for r in response_tensors]

        print("check")

        texts = [q + r for q, r in zip(batch["query"], game_data["response"])]
        logits = get_score(game_data)
        # rewards = pos_logit_to_reward(logits, task_list)
        # rewards = [torch.tensor([1.0], device=query_tensors[0].device) for _ in range(len(texts))]

        #### Run PPO training
        t = time.time()
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

        # Log for mean reward and KL Divergence!!!!!!!!!!!!!!!
        # for cs in ctrl_str:
        #     key = "env/reward_" + cs.strip("[]")
        #     stats[key] = np.mean([r.cpu().numpy() for r, t in zip(rewards, task_list) if t == cs])
        # ppo_trainer.log_stats(stats, game_data, rewards)

  0%|                                                                                                                                                                                     | 0/1 [00:00<?, ?it/s]

In [None]:
###saving the model
# starcoder_model.save_pretrained("rhlfmodel/")
# starcoder_tokenizer.save_pretrained("rhlfmodel/")

ppo_trainer.model.pretrained_model.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-trained-4-4")
starcoder_tokenizer.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-trained-4-4")

In [None]:
# -- Path to your LoRA weights + tokenizer --
model_dir = "D:\kshitij-weights-folder\qwen-aloe-rl-trained-4-4"  

# -- 4-bit quantization config (same as training) --
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# -- 1) Load the *base* Qwen2.5 model in 4-bit --
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)

# model = AutoModelForCausalLM.from_pretrained(base_model_name)
# model = PeftModel.from_pretrained(model, adapter_model_name)

# -- 2) Load your fine-tuned LoRA adapters into the base model --
# The directory should contain adapter_model.bin, adapter_config.json, etc.
lora_model = PeftModel.from_pretrained(
    base_model, 
    model_dir,
    device_map="auto",
)

# -- 3) Load the tokenizer you saved to ./aloe_qwen --
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [None]:
def return_prompt(conv):
    prompt = f"""
    CONVERSATION:
    {conv}
    
    SUMMARY:
    Please generate a medical summary based on the following clinical notes. The summary should include the following sections: Chief Complaint, History of Present Illness, Vitals, Physical Exam, Results, Assessment, Plan, and Instructions.
    Please format the response as plain text, without using markdown or special formatting, and with clear headings for each section, like this:
    
    
    CHIEF COMPLAINT
    A concise statement of the patient's primary concern or reason for visiting the clinic.
    
    HISTORY OF PRESENT ILLNESS
    A detailed narrative about the patient's symptoms, their onset, duration, and any relevant medical history or previous treatments.
    
    VITALS
    Include any relevant vital signs (e.g., oxygen saturation, blood pressure) if available.
    
    PHYSICAL EXAM 
    Summarize the findings from the physical examination, including any notable abnormalities.
    
    RESULTS 
    Summarize the results of any diagnostic tests performed (e.g., lab work, imaging studies).
    
    ASSESSMENT
    The doctor's assessment of the patient's condition or diagnosis.
    
    PLAN
    The treatment plan, including prescribed medications, lifestyle recommendations, and follow-up instructions.
    
    INSTRUCTIONS
    Specific instructions for the patient regarding their treatment plan and follow-up care.
    
    Important Note: If any section lacks relevant information or if specific details are not provided (e.g., vitals are not mentioned, no abnormal findings in the physical exam), omit that section from the generated summary. Only include sections for which there is sufficient information available.
    """

    return prompt

In [None]:
from transformers import pipeline
summarizer = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    device_map="auto",  # Automatically place on GPU if available
)

# text_gen_pipeline = pipeline(
#     "text-generation",
#     model=lora_model,
#     tokenizer=tokenizer,
#     device_map="auto",   # place on GPU if available
# )

# Now generate text using the pipeline
# response = text_gen_pipeline(
#     prompt,
#     max_new_tokens=900,
#     do_sample=False,  # Deterministic for demonstration
# )

# notechat = load_dataset("akemiH/NoteChat")
# eval_df = notechat["train"].select(range(5000, 7001)).to_pandas()
eval_df = test_df

# 2) Clean up missing data if present
if eval_df.isnull().values.any():
    print("Found missing values in the evaluation set. Dropping them.")
    eval_df = eval_df.dropna()

# 3) Prepare batching parameters
batch_size = 4
num_samples = len(eval_df)
num_batches = (num_samples // batch_size) + int(num_samples % batch_size != 0)

# Lists to store predictions and references
predictions = []
references = []

# 4) Batching loop to generate summaries
for i in tqdm(range(num_batches), desc="Generating Summaries"):
    start = i * batch_size
    end = min(start + batch_size, num_samples)
    
    # Extract conversation and reference summary columns
    batch_conversations = eval_df["dialogue"][start:end].tolist()
    batch_refs = eval_df["note"][start:end].tolist()  # "data" column for references

    # Prepare prompts replicating training format
    prompts = [
        return_prompt(conv)
        for conv in batch_conversations
    ]
    
    # Generate summaries
    results = summarizer(
        prompts,
        max_new_tokens=900,
        do_sample=False,
        # truncation=True,
        # num_return_sequences=1  # Default is 1
    )
    
    # Parse results and extract summaries
    for item in results:
        # 'item' is a list with 1 dict => {"generated_text": "..."}
        output_dict = item[0]
        generated_text = output_dict["generated_text"]
        
        # Extract only the part after "SUMMARY:"
        # if "SUMMARY:" in full_text:
        #     extracted_summary = generated_text.split("SUMMARY:", 1)[-1].strip()
        # else:
        #     extracted_summary = generated_text  # Fallback if marker not found

        # first_occurrence = generated_text.lower().find("chief complaint")
        # second_occurrence = generated_text.lower().find("chief complaint", first_occurrence + 1)
        # third_occurrence = generated_text.lower().find("chief complaint", second_occurrence + 1)
        # fourth_occurrence = generated_text.lower().find("chief complaint", third_occurrence + 1)
        
        # if fourth_occurrence != -1:
        #     # Extract everything from the second occurrence of "CHIEF COMPLAINT" onward
        #     extracted_summary = generated_text[fourth_occurrence:].strip()
        # else:
        #     # If the second "CHIEF COMPLAINT" is not found, just use the original text
        #     extracted_summary = generated_text.strip()
        
        predictions.append(generated_text)
    
    # Append the reference texts
    references.extend(batch_refs)

In [None]:
src_list = eval_df["dialogue"].tolist()
ref_list = eval_df["note"].tolist()

In [None]:
output_list = []
for pred in predictions:
    # Ensure that "Summary:" exists in the string to avoid errors
    if len(pred) > 0:
        output_list.append(pred)
    else:
        # Handle cases where "Summary:" is missing (optional)
        output_list.append("")  # Or handle differently based on your needs

In [None]:
data = convert_to_json(
    src_list=src_list,
    ref_list=ref_list,
    output_list=output_list
)

filtered_data = [
    entry for entry in data
    if entry["system_output"].strip()  # Ensure non-empty system_output
]



In [None]:
from transformers import pipeline, set_seed
model_path = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"
set_seed(42)
pipe = pipeline("text-generation",model=model_path, tokenizer=model_path, max_length=40, num_return_sequences=1)

In [None]:
text = dataset["rejected"][0]
print(text)
pipe(text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic


[{'generated_text': 'TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic'}]

In [None]:
save_directory = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"

# Load the model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [None]:
conversation = '''
Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Patient: Yes, that's right. It took about 30 minutes to change to the prone position.

Doctor: And I see that this approach increased your oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%.

Patient: Yes, that's correct.

Doctor: Good. We also had to adapt your breathing exercises to avoid prolonged coughing and oxygen desaturation. Can you tell me more about that?

Patient: Yes, I was instructed to stop every deep breath before coughing and to hold my breath for better air distribution.

Doctor: I see that you performed the breathing exercises well and managed to increase your oxygen saturation.

Patient: Yes, I did my best.

Doctor: You also had difficulty maintaining sufficient oxygen saturation during physical activity, is that correct?

Patient: Yes, I did. But with close monitoring and frequent breaks, I was able to perform low-level strength and walking exercises without any significant deoxygenation.

Doctor: I see that your exercise progression was low on days 1 to 5, but then increased daily until your hospital discharge to a rehabilitation clinic on day 10.

Patient: Yes, that's correct.

Doctor: Great. I'd like to keep monitoring your progress and see how you're doing. Can you keep me updated on any changes in your symptoms?

Patient: Yes, of course, doctor.

Doctor: Alright, let's keep in touch. If you have any questions or concerns, don't hesitate to reach out to me.

Patient: Thank you, doctor.
'''

In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=1000, temperature=0.1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            temperature=temperature,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id
        )


    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me\n\n" + conversation
response = generate_response(prompt, model, tokenizer)
print("Generated Response:\n", response)


Generated Response:
 Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me


Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Pat

In [None]:
print(dataset["review"][0][:100])

SUBREDDIT: r/relationships
TITLE: My [21/M] girlfriend [19/F] broke up with me after she went throug


In [None]:
model = AutoModelForCausalLM.from_pretrained("rlhfmodel/")
model_path = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = df.iloc[2]["prompt"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)

In [47]:
data = [
    {
        'source': "Doctor: Hello, how are you feeling today?\nPatient: I've been feeling a bit tired and dizzy.\nDoctor: How long has this been happening?\nPatient: For about a week now. I also have trouble sleeping.\nDoctor: I see. Have you been under a lot of stress lately?\nPatient: Yes, work has been quite stressful.\nDoctor: That could be contributing. Let’s do some tests to rule out other issues.",
        'system_output': "Patient reports tiredness, dizziness, and difficulty sleeping for a week. Work-related stress may be a factor. Doctor will conduct tests to check for other problems."
    },
    {
        'source': "Doctor: What brings you in today?\nPatient: I’ve been having some chest pain and shortness of breath.\nDoctor: How severe is the pain?\nPatient: It’s sharp, and it comes and goes.\nDoctor: When did it start?\nPatient: It started two days ago.\nDoctor: Any history of heart problems?\nPatient: Yes, my father had heart disease.\nDoctor: We’ll need to do an ECG and some blood tests to check your heart health.",
        'system_output': "Patient has sharp chest pain and shortness of breath for two days. Family history of heart disease. Doctor will perform an ECG and blood tests to assess heart health."
    },
    {
        'source': "Doctor: How are you feeling today?\nPatient: I’ve had a sore throat and a cough for the past few days.\nDoctor: Any fever or difficulty swallowing?\nPatient: Yes, I’ve had a low fever, but swallowing is fine.\nDoctor: Any history of allergies or similar symptoms?\nPatient: Not really.\nDoctor: It could be a viral infection. I recommend rest, fluids, and maybe some over-the-counter medicine.",
        'system_output': "Patient reports sore throat, cough, and a low fever. Doctor advises rest, fluids, and over-the-counter medication as the symptoms suggest a viral infection."
    },
    {
        'source': "Doctor: What’s bothering you today?\nPatient: I’ve been experiencing frequent headaches and some nausea.\nDoctor: How often do you get the headaches?\nPatient: It’s been almost every day for the past week.\nDoctor: Any other symptoms like blurred vision or dizziness?\nPatient: No, just the headache and nausea.\nDoctor: We’ll schedule an MRI to get a better understanding of the issue.",
        'system_output': "Patient complains of daily headaches and nausea for the past week. No blurred vision or dizziness. Doctor will schedule an MRI for further evaluation."
    }
]
score = evaluate(data, print_result=True)
print(score)

weights = np.array([1, 2, 3]) #'coherence', 'consistency', 'fluency'
weighted_score = []

for array1 in score:
    result = np.where(
        array1 < 0.5,          # Condition
        -array1 * weights,     # If True: make product negative
        array1 * weights       # If False: normal multiplication
    )
    
    sum_products = np.sum(result)
    final_result = sum_products/3
    
    weighted_score.append(final_result)

        
print(weighted_score)

scores = [torch.tensor([x], dtype=torch.float64) for x in weighted_score]
print(scores)  # Output: torch.float64

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Evaluating coherence of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.67it/s]


Evaluating consistency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.26it/s]


Evaluating fluency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.79it/s]



Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.449252 |
| consistency | 0.64936  |
|   fluency   | 0.385504 |
|   overall   | 0.494705 |
+-------------+----------+
[[0.76403181 0.84311455 0.48589937]
 [0.45844862 0.71566303 0.44122299]
 [0.55973144 0.72223429 0.35135146]
 [0.0147952  0.31642992 0.26354302]]
[np.float64(0.3308542634136033), np.float64(-0.11693050848950663), np.float64(0.3167152151413462), np.float64(-0.4794280308898093)]
[tensor([0.3309], dtype=torch.float64), tensor([-0.1169], dtype=torch.float64), tensor([0.3167], dtype=torch.float64), tensor([-0.4794], dtype=torch.float64)]


In [238]:
sample_data = []

for q,r in zip(game_data["query"], game_data["response"]):
    temp = {}

    temp["source"] = q
    temp["system_output"] = r

    sample_data.append(temp)

    break

print(sample_data)

[{'source': '[positive] [doctor] karen is a 34 -year-old female with a history of chronic migraines and hypertension who is here today with abdominal pain so hi', 'system_output': " dr. Ochsner, nice to meet you.\n\nDr. Ochsner: good to meet you as well. It's great to see you back in"}]


In [264]:
score = evaluate(sample_data, print_result=True, overall=False)

Evaluating coherence of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.30it/s]


Evaluating consistency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.85it/s]


Evaluating fluency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.64it/s]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.702612 |
| consistency | 0.684789 |
|   fluency   | 0.566372 |
+-------------+----------+





In [23]:
print(type(dataset))

<class 'datasets.arrow_dataset.Dataset'>


In [25]:
row = dataset[1]

In [27]:
print(dataset.column_names)

['dataset', 'encounter_id', 'review', 'note', 'source_file', 'id', 'input_ids', 'query']


In [29]:
print(row['query'])

[doctor] hi alan , how are you ?
[patient] hi , good to see you .
[doctor] good to see you as well . are you ready to get started ?
[patient] yes i am .
[doctor] so , alan is a 61-year-old male , here with complaints of shortness of breath .
[doctor] so alan , what's going on ?
[patient] so over the weekend , uh , i took a very long bike ride , about three hours or so and i got back , i was very light headed , i was short of breath and it really surprised me because i pretty much bike ride every morning and it was never an issue before so i'm a little concerned about that .
[doctor] okay , and have you had any other symptoms ? have you had any fever-
[patient] n-
[doctor] . chest pain ?
[patient] no , but i felt very light headed and , uh , i- it was hard to catch my breath afterwards , so maybe i did have a little chest pain but , uh , it , it eventually went away after about an hour .
[doctor] okay , um , and anyone else sick in your , in your family ?
[patient] no .
[doctor] okay , 

In [31]:
print(row['review'])

[doctor] hi alan , how are you ?
[patient] hi , good to see you .
[doctor] good to see you as well . are you ready to get started ?
[patient] yes i am .
[doctor] so , alan is a 61-year-old male , here with complaints of shortness of breath .
[doctor] so alan , what's going on ?
[patient] so over the weekend , uh , i took a very long bike ride , about three hours or so and i got back , i was very light headed , i was short of breath and it really surprised me because i pretty much bike ride every morning and it was never an issue before so i'm a little concerned about that .
[doctor] okay , and have you had any other symptoms ? have you had any fever-
[patient] n-
[doctor] . chest pain ?
[patient] no , but i felt very light headed and , uh , i- it was hard to catch my breath afterwards , so maybe i did have a little chest pain but , uh , it , it eventually went away after about an hour .
[doctor] okay , um , and anyone else sick in your , in your family ?
[patient] no .
[doctor] okay , 

In [211]:
from datasets import Dataset
import numpy as np
from tqdm import tqdm  # For progress bar

def analyze_review_lengths(dataset: Dataset, tokenizer, sample_size=None) -> dict:
    """
    Analyzes tokenized lengths of reviews in a HuggingFace dataset.
    
    Args:
        dataset: HF Dataset containing 'review' column
        tokenizer: Pre-trained tokenizer
        sample_size: Optional number of samples to analyze (None for full dataset)
    
    Returns:
        Dictionary with length statistics (in tokens)
    """
    # Get subset if sampling
    if sample_size and len(dataset) > sample_size:
        dataset = dataset.select(np.random.choice(len(dataset), sample_size, replace=False))
    
    lengths = []
    
    # Process with progress bar
    for item in tqdm(dataset, desc="Analyzing lengths"):
        encoded = tokenizer.encode(item["review"])
        lengths.append(len(encoded))
    
    return {
        "max": max(lengths),
        "median": int(np.median(lengths)),
        "mean": float(np.mean(lengths)),
        "95th_percentile": int(np.percentile(lengths, 95)),
        "histogram": np.histogram(lengths, bins=20)
    }

# Usage example:
stats = analyze_review_lengths(dataset, tokenizer)
print(f"""
Length Statistics (in tokens):
- Maximum: {stats['max']}
- Median: {stats['median']}
- Mean: {stats['mean']}
- 95th percentile: {stats['95th_percentile']}
""")

# Suggested max_length based on 95th percentile
suggested_max_length = min(2048, stats['95th_percentile'] + 50)  # Don't exceed 2048
print(f"Suggested max_length: {suggested_max_length}")

Analyzing lengths: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:01<00:00, 307.17it/s]


Length Statistics (in tokens):
- Maximum: 3588
- Median: 1506
- Mean: 1589.3663793103449
- 95th percentile: 2642

Suggested max_length: 2048



