In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install trl==0.7.4
!pip install datasets
!pip install transformers==4.38.2
!pip install peft==0.10.0
!pip install accelerate==0.28.0

Collecting trl==0.7.4
  Downloading trl-0.7.4-py3-none-any.whl.metadata (10 kB)
Collecting datasets (from trl==0.7.4)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting tyro>=0.5.11 (from trl==0.7.4)
  Downloading tyro-0.9.17-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.4.0->trl==0.7.4)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.4.0->trl==0.7.4)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.4.0->trl==0.7.4)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.4.0->trl==0.7.4)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-c

In [None]:
import random

import numpy as np
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

#Configuration options
train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 1
eval_steps = 500
max_input_length = 550
save_steps = 1000
num_train_epochs = 20
random.seed(42)




## Creating the policy model for human Evaluation

In [None]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001-e8c59e5cf7bce1c0.parquet', 'test': 'data/test-00000-of-00001-59ffb27399371eac.parquet', 'valid': 'data/valid-00000-of-00001-0e33e6bd86e3edc9.parquet'}

In [None]:
df = pd.read_parquet("hf://datasets/CarperAI/openai_summarize_tldr/" + splits["test"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset


class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length=256):
        self.post_list = []
        dataset = (pd.read_parquet(train_path))[:6000]
        self.labels = []

        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["prompt"])
            self.labels.append(sample[1]["label"])

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        txt = self.post_list[idx]
        label = self.labels[idx]

        encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        encodings_dict_label = self.tokenizer(label,truncation=True, max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict["input_ids"])
        attn_masks = torch.tensor(encodings_dict["attention_mask"])
        labels_ids = torch.tensor(encodings_dict_label["input_ids"])
        return {
            "input_ids": input_ids,
            "attention_mask": attn_masks,
            "labels": labels_ids,
        }



In [None]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
model = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py", use_cache=False).to("cuda:0")
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.end_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id



tokenizer_config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/532 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/657M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Set up the datasets
data_path = "hf://datasets/CarperAI/openai_summarize_tldr/" + splits["train"]
train_dataset = TLDRDataset(
    data_path,
    tokenizer,
    "train",
    max_length=256,
)


In [None]:
for i in train_dataset:
    print(i["input_ids"], i["labels"])
    break

tensor([ 7100,   613,  2918,   780,    44,   540,    33, 40186,   203, 13777,
           44,   439,   308,    88,    33,    36,    36,    27,  1159,   372,
        10986,   963,   415,   439,  2637,   372,  4204,  3702,  3282, 38233,
         3246,   556,   646,   461,  2268,   420,   332,   372,  9934,  7889,
          455,   299,   203,  3705,    44,  3182,  3654,   415,   458, 17732,
         2442,  1273,   561,  1182, 16587,   312,  1596,    32,  1659,   203,
         2495, 13519,    44,   203,  5786,   439,   308,    88,    33,    36,
           36,    27, 14236,  3919,  1672,  1932,  4011,  1626,   417,   225,
           36, 11274, 14818,  3301,   938,  5349,  5122,  2685,   312,  4947,
          432,   343,  1741,  1118,   363,   225,   561, 14064,  1558,   597,
         1829,  2784,   439, 10889,    32,  2030,  1597,   312, 24893, 46409,
         1133,   328,  1672, 12535,  7254,   372, 33091,   623,  1672, 35814,
          461,  8295,  7696,   322, 16554,   372, 11909,  7791, 

In [None]:
torch.cuda.set_device(0)

In [None]:
output_dir = "./Output"

In [None]:
# Prepare the trainer and start training
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
#     per_device_eval_batch_size=eval_batch_size,
    fp16=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=2,
    warmup_steps=100,
    logging_steps=10,
)

In [None]:
training_args.device.index

0

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=default_data_collator,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics
)
trainer.train()
# trainer.save_model(output_dir)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvikrant21082003[0m ([33mvikrant21082003-bmsce[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,10.275
20,6.5608
30,2.5265
40,1.516
50,1.2139
60,1.0709
70,1.0871
80,1.0125
90,1.0238
100,1.0072


TrainOutput(global_step=750, training_loss=1.1941501706441244, metrics={'train_runtime': 1006.2822, 'train_samples_per_second': 11.925, 'train_steps_per_second': 0.745, 'total_flos': 2213755748352000.0, 'train_loss': 1.1941501706441244, 'epoch': 2.0})

In [None]:
trainer.save_model("summarization_policy_new/")   ##path to save policy model

In [None]:
import shutil
import os

source_dirs = ["/content/summarization_policy_new"]
destination = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO"

os.makedirs(destination, exist_ok=True)

# Copy each directory to the destination
for src in source_dirs:
    if os.path.exists(src):
        dest_path = os.path.join(destination, os.path.basename(src))
        shutil.copytree(src, dest_path, dirs_exist_ok=True)  # Copy with merging existing directories
        print(f"Copied {src} to {dest_path}")
    else:
        print(f"Skipping {src}, does not exist.")

print("Copy operation completed.")


Copied /content/summarization_policy_new to /content/drive/MyDrive/Medical Dialogue Summarization using PPO/summarization_policy_new
Copy operation completed.


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("summarization_policy_new/")
model_path = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = df.iloc[2]["prompt"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Traning the reward function

In [None]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer, SFTTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments




In [None]:
##model path
MODEL_PATH = "bigcode/tiny_starcoder_py"


splits = {'train': 'data/train-00000-of-00001-3cbd295cedeecf91.parquet', 'test': 'data/test-00000-of-00001-0845e2eec675b16a.parquet', 'valid1': 'data/valid1-00000-of-00001-b647616a2be5f333.parquet', 'valid2': 'data/valid2-00000-of-00001-2655c5b3621b6116.parquet'}
DATA_PATH = "hf://datasets/CarperAI/openai_summarize_comparisons/" + splits["test"]

In [None]:
df = pd.read_parquet(DATA_PATH)
df = df[:10]
raw_dataset = Dataset.from_pandas(df)
raw_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 10
})

In [None]:
##defininig the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)



In [None]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
def formatting_func(examples):
    kwargs = {"padding": "max_length",
              "truncation": True,
              "max_length": 256,
              "return_tensors": "pt"
              }

    # Prepend the prompt and a line break to the original_response and response-1 fields.
    prompt_plus_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_plus_rejected_response = examples["prompt"] + "\n" + examples["rejected"]

    # Then tokenize these modified fields.
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }

In [None]:
formatted_dataset = raw_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()
raw_dataset

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 10
})

In [None]:
model.config

GPTBigCodeConfig {
  "_name_or_path": "bigcode/tiny_starcoder_py",
  "activation_function": "gelu_pytorch_tanh",
  "architectures": [
    "GPTBigCodeForCausalLM"
  ],
  "attention_softmax_in_fp32": true,
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "inference_runner": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_batch_size": null,
  "max_sequence_length": null,
  "model_type": "gpt_bigcode",
  "multi_query": true,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 20,
  "n_positions": 8192,
  "pad_key_length": true,
  "pre_allocate_kv_cache": false,
  "resid_pdrop": 0.1,
  "scale_attention_softmax_in_fp32": true,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.38.2",
  "use_cache": true,
  "validate_runner_input": tr

In [None]:
### Loading the TRL reward trainer and training the trainer
training_args = TrainingArguments(
        output_dir="rm_checkpoint/",
        num_train_epochs=1,
        logging_steps=10,
        gradient_accumulation_steps=1,
        save_strategy="steps",
        evaluation_strategy="steps",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=1,
        eval_accumulation_steps=1,
        eval_steps=500,
        save_steps=500,
        warmup_steps=100,
        logging_dir="./logs",
        learning_rate=1e-5,
        save_total_limit=1,
        no_cuda=True,
    )



In [None]:
trainer = RewardTrainer(model=model,
                        tokenizer=tokenizer,
                        train_dataset=formatted_dataset['train'],
                        eval_dataset=formatted_dataset['test'],
                        args= training_args,
                        )
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=0.7631033062934875, metrics={'train_runtime': 65.6396, 'train_samples_per_second': 0.107, 'train_steps_per_second': 0.061, 'total_flos': 0.0, 'train_loss': 0.7631033062934875, 'epoch': 1.0})

In [None]:
trainer.save_model("rm_model/")

In [None]:
import shutil
import os

source_dirs = ["/content/rm_model", "/content/rm_checkpoint"]
destination = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO"

os.makedirs(destination, exist_ok=True)

# Copy each directory to the destination
for src in source_dirs:
    if os.path.exists(src):
        dest_path = os.path.join(destination, os.path.basename(src))
        shutil.copytree(src, dest_path, dirs_exist_ok=True)  # Copy with merging existing directories
        print(f"Copied {src} to {dest_path}")
    else:
        print(f"Skipping {src}, does not exist.")

print("Copy operation completed.")


Copied /content/rm_model to /content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model
Copied /content/rm_checkpoint to /content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_checkpoint
Copy operation completed.


In [None]:
## inference the model
rm_model = AutoModelForCausalLM.from_pretrained("rm_model/")
tokenizer = AutoTokenizer.from_pretrained("rm_model/")

In [None]:
def get_score(model, tokenizer, prompt, response):

    instructions = tokenizer.encode_plus(prompt,
                                       response,
                                       padding="max_length",
                                       max_length=256,
                                       return_tensors="pt",
                                        truncation=True)
    with torch.no_grad():
        outputs = model(**instructions)

    logits = outputs[0]

    return logits


In [None]:
# usage with prompt
prompt = df.iloc[0]["prompt"]
example_prefered_response = df.iloc[0]["chosen"]
example_unprefered_response = df.iloc[0]["rejected"]

In [None]:
loss1 = get_score(model, tokenizer, prompt, example_prefered_response)
loss2= get_score(model, tokenizer, prompt, example_unprefered_response)

In [None]:
from torch import nn
loss = -nn.functional.logsigmoid(loss1 - loss2).mean()

# Policy Model

In [1]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer, SFTTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model

  from .autonotebook import tqdm as notebook_tqdm


In [59]:
DATA = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"

df = pd.read_csv(DATA)

df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
0,virtassist,D2N168,"[doctor] hi diane , how are you ?\r\n[patient]...",CHIEF COMPLAINT\r\n\r\nEmergency room follow-u...,challenge_data\clef_taskC_test3.csv,


In [61]:
# Print column names
print("\nColumns in the combined DataFrame:")
print(df.columns.tolist())


Columns in the combined DataFrame:
['dataset', 'encounter_id', 'dialogue', 'note', 'source_file', 'id']


In [97]:
##model path
# MODEL_PATH = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model"
MODEL_PATH = r"D:\kshitij-weights-folder\aloe-qwen-rl-trial-run"

# splits = {'train': 'data/train-00000-of-00001-3cbd295cedeecf91.parquet', 'test': 'data/test-00000-of-00001-0845e2eec675b16a.parquet', 'valid1': 'data/valid1-00000-of-00001-b647616a2be5f333.parquet', 'valid2': 'data/valid2-00000-of-00001-2655c5b3621b6116.parquet'}
# DATA_PATH = "hf://datasets/CarperAI/openai_summarize_comparisons/" + splits["test"]

DATA = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"

In [99]:
# df = pd.read_parquet(DATA_PATH)
df = pd.read_csv(DATA)
df = df[:1000]
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['dataset', 'encounter_id', 'dialogue', 'note', 'source_file', 'id'],
    num_rows: 464
})

In [101]:
sentiment_pipe_kwargs = {"top_k": None, "function_to_apply": "none"}

config = PPOConfig(
    model_name=MODEL_PATH, steps=51200, learning_rate=1.41e-5, remove_unused_columns=True
)

txt_in_len = 5
txt_out_len = 20
seed = 1

In [103]:
from transformers import AutoTokenizer, pipeline

In [105]:
dataset = dataset.rename_columns({"dialogue": "review"})
dataset = dataset.filter(lambda x: len(x["review"]) > 500, batched=False)
dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)

Filter: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:00<00:00, 17195.09 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:00<00:00, 9376.90 examples/s]


In [107]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [183]:
txt_in_len = 5
txt_out_len = 32
seed = 1

dataset = dataset.map(
    lambda x: {"input_ids": tokenizer.encode(" " + x["review"], return_tensors="pt", truncation=True, padding="max_length", max_length=200)[0]},
    batched=False,
)
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
dataset = dataset[:20480]
from datasets import Dataset

dataset = Dataset.from_dict(dataset)
dataset.set_format("pytorch")

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:00<00:00, 671.68 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464/464 [00:00<00:00, 1691.05 examples/s]


In [111]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# rf_model_path = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model"
# rf_model_path = "/content/rm_model"
# starcoder_model = AutoModelForCausalLMWithValueHead.from_pretrained("/content/drive/MyDrive/Medical Dialogue Summarization using PPO/summarization_policy_new")  ##policy model from step 1
starcoder_model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH)
starcoder_model = starcoder_model.to(device)
# starcoder_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(rf_model_path) ## reward model from step 2
# starcoder_model_ref = starcoder_model_ref.to(device)
starcoder_tokenizer = AutoTokenizer.from_pretrained("HPAI-BSC/Qwen2.5-Aloe-Beta-7B") ## tokenizer of step 1 model., here since we are using same model for step 1 and 2 it doesnot matter
starcoder_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

cuda


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.81s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [113]:
dataset

Dataset({
    features: ['dataset', 'encounter_id', 'review', 'note', 'source_file', 'id', 'input_ids', 'query'],
    num_rows: 464
})

In [115]:
import torch
optimizer = torch.optim.SGD(starcoder_model.parameters(), lr=config.learning_rate)
ppo_trainer = PPOTrainer(config, starcoder_model, starcoder_model, starcoder_tokenizer, dataset=dataset, data_collator=collator, optimizer=optimizer)

In [45]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

In [187]:
for batch in tqdm(ppo_trainer.dataloader):
    (logs, game_data,) = (
        dict(),
        dict(),
    )

    print(ctrl_str)
    #### prepend a random control token
    task_list = choices(ctrl_str, k=config.batch_size)
    game_data["query"] = [t + q for t, q in zip(task_list, batch["query"])]
    query_tensors = [torch.cat((ctrl_tokens[t], input_ids)) for t, input_ids in zip(task_list, batch["input_ids"])]

    for query in query_tensors:
        print(query)

  0%|                                                                                                                                                                                     | 0/1 [00:00<?, ?it/s]

['[negative]', '[positive]']
tensor([ 7669, 15060,    60,   508, 36983,    60, 16910,  1154,   773,  1039,
         1790,  8720,   374, 25769, 16940, 29025,   930,  1154,  2400,   315,
         7194,   220,    19,    14,    16,    17,    14,    16,    24,    18,
           19,   659, 17317,   659, 29025], device='cuda:0')
tensor([   58, 30487,    60,   508, 36983,    60, 50117,   319,    58, 22722,
           60,   498,  2299,  5527,  1101,   319,    58, 36983,    60,  5527,
          319,    58, 22722,    60, 15588,   595,   967,  1246,   525,   498,
         3351,   319,    58, 36983,    60], device='cuda:0')
tensor([ 7669, 15060,    60,   508, 36983,    60, 16910,   659,  1790,  8720,
          374, 23828,  3179, 84946,  1154,  2400,   315,  7194,   220,    16,
           17,    14,    17,    22,    14,    17,    15,    15,    21,   659,
        23828,  3179,   374,   264,   220], device='cuda:0')
tensor([ 7669, 15060,    60,   508, 36983,    60, 34209, 18701,   685,  1246,
        

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.62it/s]

tensor([   58, 30487,    60,   508, 22722,    60,  3116,  1578,  4014,   374,
         1588,   369, 16460,   315, 27800,  6646,   419,   374,   264,   220,
           20,    19,   481,  3157,  6284,  8593,   319,    58, 36983,    60,
         1246,  2299,   498,  3730, 10668], device='cuda:0')
tensor([ 7669, 15060,    60,   508, 36983,    60, 15588,   293, 14385,  1246,
          525,   498,  3730,   279,  6457, 17847,  3229,   752,   429,   498,
          614,   264, 39600,  3351,   319,    58, 22722,    60,   379,   747,
          600,  2776,  3730,  5020,  1661], device='cuda:0')
tensor([   58, 30487,    60,   508, 36983,    60, 34209,   326, 17416,  1661,
          311,  1490,   498,  3351,   773,  3330,  1588,   304,   847,  8388,
         5868,  1075,   498,   498,  1744,   498,   614,   264, 37942,  9798,
         1744,   498,  3003,  1030,  1105], device='cuda:0')
tensor([   58, 30487,    60,   508, 36983,    60,  1790,  8720,   374, 18701,
          685,   502, 19387,  1154, 1




In [189]:
print(game_data)

{'query': ['[negative] [doctor] okay , so our next patient is christopher watson , date of birth 4/12/1934 . mr . wat', "[positive] [doctor] alright\r\n[patient] you're ready just\r\n[doctor] ready\r\n[patient] hi kyle how are you today\r\n[doctor]", '[negative] [doctor] okay . next patient is ashley james , date of birth 12/27/2006 . ashley is a ', "[negative] [doctor] hey sophia how are you doing today\r\n[patient] i've been better my primary care doctor wanted me to see you because of this knee", '[negative] [doctor] so beverly is a 53 -year-old female with a recent diagnosis of stage three non-small cell lung cancer who presents for follow', '[negative] [doctor] so gloria is a 46 -year-old female today with past medical history of diabetes and back pain and today here for shortness of', '[negative] [doctor] mister thompson is a 67 -year-old male with a history of essential hypertension hyperlipidemia and osteoarthritis who', '[negative] [doctor] hi alan , how are you ?\r\n[patient]

In [179]:
import pandas as pd
from datasets import Dataset
import torch

# Collect all batches from the dataloader
all_data = []
for batch in ppo_trainer.dataloader:
    # Convert tensors to text
    texts = [ppo_trainer.tokenizer.decode(input_ids, skip_special_tokens=True) 
             for input_ids in batch["input_ids"]]
    
    # Convert tensors to lists properly
    input_ids_list = [tensor.cpu().numpy().tolist() for tensor in batch["input_ids"]]
    # attention_mask_list = [tensor.cpu().numpy().tolist() for tensor in batch["attention_mask"]]
    
    # Store batch data
    batch_data = {
        "text": texts,
        "input_ids": input_ids_list,
        # "attention_mask": attention_mask_list,
    }
    all_data.extend(batch_data["text"])  # Only extend texts if you want flat structure

# Save as JSON (now works)
df = pd.DataFrame(all_data)
# df.to_json("ppo_data.json", orient="records")
df.to_csv("ppo_data.csv", index=False)  # CSV format

In [None]:
for i in ppo_trainer.dataloader:
  print(i)
  break

In [119]:
ctrl_str = ["[negative]", "[positive]"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # this should be handled by accelerate
ctrl_tokens = dict((s, starcoder_tokenizer.encode(s, return_tensors="pt").squeeze().to(device)) for s in ctrl_str)


In [290]:
def pos_logit_to_reward(logit, task):
    """
    Take the positive sentiment logit and scale it for the task.
        task [negative]: reward = -logit
        task [neutral]: reward = -2*abs(logit)+4
        task [positive]: reward = logit
    """
    for i in range(len(logit)):
        if task[i] == "[negative]":
            pass
        elif task[i] == "[positive]":
            pass
        else:
            raise ValueError("task has to be in [0, 1, 2]!")
    return logit

In [292]:
pos_logit_to_reward(torch.Tensor([4, 4]), ctrl_str)

tensor([4., 4.])

In [125]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": starcoder_tokenizer.eos_token_id,
    "max_new_tokens": 32,
    "eos_token_id": -1,
}

In [304]:
def get_score(model, tokenizer, responses, game_data):
    # for i in responses:
    #     instructions = tokenizer.encode_plus(
    #                                        i,
    #                                        padding="max_length",
    #                                        max_length=32,
    #                                        return_tensors="pt")

    #     instructions = {key: value.to(device) for key, value in instructions.items()}

    #     with torch.no_grad():
    #         outputs = model(**instructions)

    #     logits = outputs[0].mean()
    #     positive_logist.append(logits)
    
    weights_array = np.array([0.1, 0.2, 0.3]) #'coherence', 'consistency', 'fluency'

    sample_data = []

    for q,r in zip(game_data["query"], game_data["response"]):
        temp = {}
    
        temp["source"] = q
        temp["system_output"] = r
    
        sample_data.append(temp)
    
    score = evaluate(sample_data, overall=False)

    weighted_score = []

    for array1 in score:
        result = np.where(
            array1 < 0.5,          # Condition
            -array1 * weights,     # If True: make product negative
            array1 * weights       # If False: normal multiplication
        )
        
        sum_products = np.sum(result)
        final_result = sum_products/3
        
        weighted_score.append(final_result)

    scores = [torch.tensor([x], dtype=torch.float64) for x in weighted_score]

    return scores

In [None]:
# responses =["ashish is a goo", "heelow how are you", "__IT_\nr/\n: r RelationshipRelationship]]0]\nlsriend\n2//M]\n [ [ a\n the was to the [. a friends to\n\n:\n [lfriend [ me have a aried in his19 minutes.\n\nWhat Modified:** girlfriend was through the Facebook.. I my my friends.**** my  of lf**\n\n** was d1ing for my few personirl** I had for findoolpping my my the future** but I was that in\n\n** have ali  of to she  tolirt my me girl. and she found my about my.. me few of gir.1viously). was\'t find her was).\n\n** was it about my twoirl and the had  Facebook. the  and she gand historyirl) was in April,\n to, find, were flirted. I a messages.. f.ing on her.\n girlM\n; I1 girirllfriend and the19 months. to my Facebook.. my permission. she her messages. my.lirty with my fewirl.\n found her with me. I through more with\n"]
# get_score(starcoder_model, tokenizer, responses)

In [129]:
# Check if models are on GPU
print(next(starcoder_model.parameters()).device)  # Should print: cuda:0
# print(next(starcoder_model_ref.parameters()).device)  # Should print: cuda:0

cuda:0


In [306]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

for epoch in range(1):
    for batch in tqdm(ppo_trainer.dataloader):
        (logs, game_data,) = (
            dict(),
            dict(),
        )

        print(ctrl_str)
        #### prepend a random control token
        task_list = choices(ctrl_str, k=config.batch_size)
        game_data["query"] = [t + q for t, q in zip(task_list, batch["query"])]
        query_tensors = [torch.cat((ctrl_tokens[t], input_ids)) for t, input_ids in zip(task_list, batch["input_ids"])]

        #### get response from gpt2
        response_tensors = []
        for query in query_tensors:
            response = ppo_trainer.generate(query, **generation_kwargs)
            response_tensors.append(response.squeeze()[-txt_out_len:])
#         print(response_tensors)
        game_data["response"] = [starcoder_tokenizer.decode(r.squeeze()) for r in response_tensors]

        print("check")

        #### sentiment analysis
        texts = [q + r for q, r in zip(batch["query"], game_data["response"])]
        logits = get_score(starcoder_model,starcoder_tokenizer, texts, game_data)
        rewards = pos_logit_to_reward(logits, task_list)
        # rewards = [torch.tensor([1.0], device=query_tensors[0].device) for _ in range(len(texts))]

        #### Run PPO training
        t = time.time()
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

        for cs in ctrl_str:
            key = "env/reward_" + cs.strip("[]")
            stats[key] = np.mean([r.cpu().numpy() for r, t in zip(rewards, task_list) if t == cs])
        ppo_trainer.log_stats(stats, game_data, rewards)

  0%|                                                                                                                                                                                     | 0/1 [00:00<?, ?it/s]

['[negative]', '[positive]']
check
Evaluating coherence of 256 samples !!!



  0%|                                                                                                                                                                                    | 0/32 [00:00<?, ?it/s][A
  9%|████████████████▏                                                                                                                                                           | 3/32 [00:00<00:01, 24.77it/s][A
 19%|████████████████████████████████▎                                                                                                                                           | 6/32 [00:00<00:01, 21.10it/s][A
 28%|████████████████████████████████████████████████▍                                                                                                                           | 9/32 [00:00<00:01, 21.58it/s][A
 38%|████████████████████████████████████████████████████████████████▏                                                                                 

Evaluating consistency of 256 samples !!!



  0%|                                                                                                                                                                                    | 0/72 [00:00<?, ?it/s][A
  4%|███████▏                                                                                                                                                                    | 3/72 [00:00<00:02, 25.65it/s][A
  8%|██████████████▎                                                                                                                                                             | 6/72 [00:00<00:02, 22.05it/s][A
 12%|█████████████████████▌                                                                                                                                                      | 9/72 [00:00<00:02, 23.56it/s][A
 17%|████████████████████████████▌                                                                                                                     

Evaluating fluency of 256 samples !!!



  0%|                                                                                                                                                                                    | 0/72 [00:00<?, ?it/s][A
  6%|█████████▌                                                                                                                                                                  | 4/72 [00:00<00:01, 39.13it/s][A
 11%|███████████████████                                                                                                                                                         | 8/72 [00:00<00:01, 33.20it/s][A
 17%|████████████████████████████▌                                                                                                                                              | 12/72 [00:00<00:01, 30.25it/s][A
 22%|██████████████████████████████████████                                                                                                            

In [141]:
###saving the model
# starcoder_model.save_pretrained("rhlfmodel/")
# starcoder_tokenizer.save_pretrained("rhlfmodel/")

ppo_trainer.model.pretrained_model.save_pretrained("/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/")
starcoder_tokenizer.save_pretrained("/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/")



('/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/tokenizer_config.json',
 '/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/special_tokens_map.json',
 '/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/vocab.json',
 '/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/merges.txt',
 '/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/added_tokens.json',
 '/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel/tokenizer.json')

In [None]:
import shutil
import os

source_dirs = ["/content/rhlfmodel"]
destination = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO"

os.makedirs(destination, exist_ok=True)

# Copy each directory to the destination
for src in source_dirs:
    if os.path.exists(src):
        dest_path = os.path.join(destination, os.path.basename(src))
        shutil.copytree(src, dest_path, dirs_exist_ok=True)  # Copy with merging existing directories
        print(f"Copied {src} to {dest_path}")
    else:
        print(f"Skipping {src}, does not exist.")

print("Copy operation completed.")


Copied /content/rhlfmodel to /content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel
Copy operation completed.


In [None]:
from transformers import pipeline, set_seed
model_path = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"
set_seed(42)
pipe = pipeline("text-generation",model=model_path, tokenizer=model_path, max_length=40, num_return_sequences=1)

In [None]:
text = dataset["rejected"][0]
print(text)
pipe(text)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic


[{'generated_text': 'TL;DR:  My girlfriend and I broke up after she went through my Facebook account without my permission.<|endoftext|>Citizens for the Republic'}]

In [None]:
save_directory = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rhlfmodel"

# Load the model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [None]:
conversation = '''
Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Patient: Yes, that's right. It took about 30 minutes to change to the prone position.

Doctor: And I see that this approach increased your oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%.

Patient: Yes, that's correct.

Doctor: Good. We also had to adapt your breathing exercises to avoid prolonged coughing and oxygen desaturation. Can you tell me more about that?

Patient: Yes, I was instructed to stop every deep breath before coughing and to hold my breath for better air distribution.

Doctor: I see that you performed the breathing exercises well and managed to increase your oxygen saturation.

Patient: Yes, I did my best.

Doctor: You also had difficulty maintaining sufficient oxygen saturation during physical activity, is that correct?

Patient: Yes, I did. But with close monitoring and frequent breaks, I was able to perform low-level strength and walking exercises without any significant deoxygenation.

Doctor: I see that your exercise progression was low on days 1 to 5, but then increased daily until your hospital discharge to a rehabilitation clinic on day 10.

Patient: Yes, that's correct.

Doctor: Great. I'd like to keep monitoring your progress and see how you're doing. Can you keep me updated on any changes in your symptoms?

Patient: Yes, of course, doctor.

Doctor: Alright, let's keep in touch. If you have any questions or concerns, don't hesitate to reach out to me.

Patient: Thank you, doctor.
'''

In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=1000, temperature=0.1):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            temperature=temperature,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id
        )


    return tokenizer.decode(output[0], skip_special_tokens=True)

prompt = "Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me\n\n" + conversation
response = generate_response(prompt, model, tokenizer)
print("Generated Response:\n", response)


Generated Response:
 Generate a summary for the below conversation. Dont give me the prompt back. I just want the summary to be returned to me


Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?

Patient: Not too good, doctor. I've been feeling really sick lately.

Doctor: I understand. Can you tell me what symptoms you're experiencing?

Patient: Yes, I've been having a fever, a dry cough, and dyspnea.

Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?

Patient: Yes, that's correct.

Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?

Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.

Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?

Pat

In [None]:
print(dataset["review"][0][:100])

SUBREDDIT: r/relationships
TITLE: My [21/M] girlfriend [19/F] broke up with me after she went throug


In [None]:
model = AutoModelForCausalLM.from_pretrained("rlhfmodel/")
model_path = "bigcode/tiny_starcoder_py"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = df.iloc[2]["prompt"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)

In [163]:
from prettytable import PrettyTable

def convert_to_json(output_list, src_list=None, ref_list=None, context_list=None, \
            scores=None, doc_id=None, system_id=None):
    """
        Convert the data into the json format.

        output_list: a list of model output
        src_list: source input for different NLG tasks. For example, source document for summarization
                  and dialogue history for dialogue response generation
        ref_list: human-annotated groundtruth
        context_list: the context needed to evaluate several specific dimension. For example,
                      additional factual information when evaluating engagingness and groundedness in dialogues
        scores: human scores for evaluating the model output. They can be used to calculate the correlation
                between evaluators and human judgements. The scores should be stored in a dictionary. For example,
                {'fluency': 2.0, 'coherence': 3.0} could be the human score for a sample.
        doc_id: the index of the input source. It can be used to calculate summary-level correlation for summarzation
        system_id: the index of the generation system. It can be used to calculate system-level correlation.
    """
    json_data = []
    for i in range(len(output_list)):
        cur = {}
        cur['system_output'] = output_list[i]
        if src_list is not None:
            cur['source'] = src_list[i]
        if ref_list is not None:
            cur['reference'] = ref_list[i]
        if context_list is not None:
            cur['context'] = context_list[i]
        if scores is not None:
            cur['scores'] = scores[i]
        if doc_id is not None:
            cur['doc_id'] = doc_id[i]
        if system_id is not None:
            cur['system_id'] = system_id[i]
        json_data.append(cur)
    return json_data


def add_question(dimension, output, src=None, ref=None, context=None, task=None):
    """
        Add questions to generate input in Bool-QA format for UniEval.

        dimension: specific dimension to be evaluated
        src: source input for different NLG tasks. For example, source document for summarization
             and dialogue history for dialogue response generation.
        output: output text generated by the models
        ref: human-annotataed groundtruth
        context: the context needed to evaluate several specific dimension. For example,
                 additional factual information when evaluating engagingness and groundedness in dialogues.
    """

    input_with_question = []
    for i in range(len(output)):
        # For summarization
        if task == 'summarization':
            if dimension == 'fluency':
                cur_input = 'question: Is this a fluent paragraph? </s> paragraph: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent summary to the document? </s> summary: ' + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[i] + ' </s> document: ' + src[i]
            elif dimension == 'relevance':
                cur_input = 'question: Is this summary relevant to the reference? </s> summary: ' + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For dialogues
        elif task == 'dialogue':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a natural response in the dialogue? </s> response: ' + output[i]
            elif dimension == 'coherence':
                cur_input = 'question: Is this a coherent response given the dialogue history? </s> response: '\
                            + output[i] + ' </s> dialogue history: ' + src[i]
            elif dimension == 'engagingness':
                cur_input = 'question: Is this an engaging and informative response according to the dialogue history and fact? </s> response: '\
                            + output[i] + ' </s> dialogue history: ' + src[i] + ' </s> fact: ' + context[i]
            elif dimension == 'groundedness':
                cur_input = 'question: Is this response consistent with knowledge in the fact? </s> response: '\
                            + output[i] + ' </s> fact: ' + context[i]
            elif dimension == 'understandability':
                cur_input = 'question: Is this an understandable response in the dialogue? </s> response: ' + output[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For data-to-text
        elif task == 'data2text':
            if dimension == 'naturalness':
                cur_input = 'question: Is this a fluent utterance? </s> utterance: ' + output[i]
            elif dimension == 'informativeness':
                cur_input = 'question: Is this sentence informative according to the reference? </s> sentence: '\
                            + output[i] + ' </s> reference: ' + ref[i]
            else:
                raise NotImplementedError('The input format for this dimension is still undefined. Please customize it first.')
        # For factual consistency detection
        elif task == 'fact':
            if dimension == 'consistency':
                cur_input = 'question: Is this claim consistent with the document? </s> claim: ' + output[i] + ' </s> document: ' + src[i]
            else:
                raise NotImplementedError('No other dimensions for the factual consistency detection task.')
        # For new customized tasks
        else:
            raise NotImplementedError('Other tasks are not implemented, please customize specific tasks here.')
        input_with_question.append(cur_input)
    return input_with_question


def print_scores(scores):
    table = PrettyTable(['Dimensions','Score'])
    print('\nEvaluation scores are shown below:')
    dims = list(scores[0].keys())
    for dim in dims:
        cur_score = 0
        for i in range(len(scores)):
            cur_score += scores[i][dim]
        table.add_row([dim, round(cur_score / len(scores), 6)])
    print(table)

In [262]:
import numpy as np
from nltk import sent_tokenize
from scorer import UniEvaluator  # Make sure this import works after placing scorer.py in the same directory

def evaluate(data, dims=None, overall=True, print_result=False, model_name_or_path="t5-small", task='summarization', device='cuda:0', individual=True):
    """
    Get the scores of all the given dimensions (fluency, consistency, coherence, relevance)

    data: A list of dictionaries, where each dictionary contains:
          - 'source': The original text
          - 'system_output': The generated system output (summary)
          - 'reference' (optional): Reference summary for relevance evaluation

    dims: A list of dimensions to be evaluated. If dims is None, it evaluates four default dimensions:
          coherence, consistency, fluency, relevance.

    overall: Boolean to indicate whether the overall score is calculated as the average of all dimensions.

    print_result: Boolean to print the results on the screen.

    model_name_or_path: The model name or path to use for evaluation, e.g., 't5-small'

    task: The task type (used in scoring if needed, like summarization or other NLP tasks).

    device: The device to use for evaluation ('cpu' or 'cuda:0').
    """

    # Instantiate the scorer
    scorer = UniEvaluator(model_name_or_path=model_name_or_path, device=device)

    n_data = len(data)
    eval_scores = [{} for _ in range(n_data)]

    # Default dimensions if not provided
    if dims is None:
        dims = ['coherence', 'consistency', 'fluency']   #add relevance

    for dim in dims:
        print(f'Evaluating {dim} of {n_data} samples !!!')

        if dim == 'consistency' or dim == 'fluency':
            # Sentence-level scores for consistency and fluency
            src_list, output_list = [], []
            n_sents = []  # number of sentences in each summary

            for i in range(n_data):
                if dim == 'consistency':
                    source = data[i]['source']
                else:
                    source = ''
                system_outputs = sent_tokenize(data[i]['system_output'])
                n_sents.append(len(system_outputs))
                for j in range(len(system_outputs)):
                    src_list.append(source)
                    output_list.append(system_outputs[j])

            input_list = add_question(dimension=dim, output=output_list, src=src_list, task=task)
            sent_score = scorer.score(input_list)

            # Calculate average sentence-level scores for each sample
            start_idx = 0
            score = []
            for cur_n_sent in n_sents:
                score.append(sum(sent_score[start_idx:start_idx + cur_n_sent]) / cur_n_sent)
                start_idx += cur_n_sent

        elif dim == 'coherence' or dim == 'relevance':
            # Summary-level scores for coherence and relevance
            src_list, output_list, ref_list = [], [], []

            for i in range(n_data):
                src_list.append(data[i]['source'])
                output_list.append(data[i]['system_output'])
                if dim == 'relevance':
                    ref_list.append(data[i]['reference'])

            input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=task)
            score = scorer.score(input_list)

        else:
            raise NotImplementedError(f"The input format for the dimension '{dim}' is still undefined. Please customize it.")

        # Store the scores for the current dimension
        for i in range(n_data):
            eval_scores[i][dim] = score[i]

    # Calculate overall score (average of all evaluated dimensions)
    if overall:
        for i in range(n_data):
            eval_scores[i]['overall'] = np.mean([eval_scores[i][dim] for dim in dims])

    # Print the result if requested
    if print_result:
        print_scores(eval_scores)

    if individual:
        individual_scores = []
        for i in range(n_data):
            temp = [eval_scores[i][dim] for dim in dims]
            individual_scores.append(temp)

        return np.array(individual_scores)

    # Calculate average score across all the dimensions except 'overall'
    avg_score = []
    for i in range(n_data):
        # Exclude 'overall' from the averaging
        dimensions = [dim for dim in dims if dim != 'overall']
        avg_score.append(np.mean([eval_scores[i][dim] for dim in dimensions]))

    return avg_score


In [302]:
data = [
    {
        'source': "Doctor: Hello, how are you feeling today?\nPatient: I've been feeling a bit tired and dizzy.\nDoctor: How long has this been happening?\nPatient: For about a week now. I also have trouble sleeping.\nDoctor: I see. Have you been under a lot of stress lately?\nPatient: Yes, work has been quite stressful.\nDoctor: That could be contributing. Let’s do some tests to rule out other issues.",
        'system_output': "Patient reports tiredness, dizziness, and difficulty sleeping for a week. Work-related stress may be a factor. Doctor will conduct tests to check for other problems."
    },
    {
        'source': "Doctor: What brings you in today?\nPatient: I’ve been having some chest pain and shortness of breath.\nDoctor: How severe is the pain?\nPatient: It’s sharp, and it comes and goes.\nDoctor: When did it start?\nPatient: It started two days ago.\nDoctor: Any history of heart problems?\nPatient: Yes, my father had heart disease.\nDoctor: We’ll need to do an ECG and some blood tests to check your heart health.",
        'system_output': "Patient has sharp chest pain and shortness of breath for two days. Family history of heart disease. Doctor will perform an ECG and blood tests to assess heart health."
    },
    {
        'source': "Doctor: How are you feeling today?\nPatient: I’ve had a sore throat and a cough for the past few days.\nDoctor: Any fever or difficulty swallowing?\nPatient: Yes, I’ve had a low fever, but swallowing is fine.\nDoctor: Any history of allergies or similar symptoms?\nPatient: Not really.\nDoctor: It could be a viral infection. I recommend rest, fluids, and maybe some over-the-counter medicine.",
        'system_output': "Patient reports sore throat, cough, and a low fever. Doctor advises rest, fluids, and over-the-counter medication as the symptoms suggest a viral infection."
    },
    {
        'source': "Doctor: What’s bothering you today?\nPatient: I’ve been experiencing frequent headaches and some nausea.\nDoctor: How often do you get the headaches?\nPatient: It’s been almost every day for the past week.\nDoctor: Any other symptoms like blurred vision or dizziness?\nPatient: No, just the headache and nausea.\nDoctor: We’ll schedule an MRI to get a better understanding of the issue.",
        'system_output': "Patient complains of daily headaches and nausea for the past week. No blurred vision or dizziness. Doctor will schedule an MRI for further evaluation."
    }
]
score = evaluate(data, print_result=True)
print(score)

weights = np.array([1, 2, 3]) #'coherence', 'consistency', 'fluency'
weighted_score = []

for array1 in score:
    result = np.where(
        array1 < 0.5,          # Condition
        -array1 * weights,     # If True: make product negative
        array1 * weights       # If False: normal multiplication
    )
    
    sum_products = np.sum(result)
    final_result = sum_products/3
    
    weighted_score.append(final_result)

        
print(weighted_score)

scores = [torch.tensor([x], dtype=torch.float64) for x in weighted_score]
print(scores)  # Output: torch.float64

Evaluating coherence of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.15it/s]


Evaluating consistency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 13.93it/s]


Evaluating fluency of 4 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 19.84it/s]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.449252 |
| consistency | 0.64936  |
|   fluency   | 0.385504 |
|   overall   | 0.494705 |
+-------------+----------+
[[0.76403181 0.84311455 0.48589937]
 [0.45844862 0.71566303 0.44122299]
 [0.55973144 0.72223429 0.35135146]
 [0.0147952  0.31642992 0.26354302]]
[np.float64(0.3308542634136033), np.float64(-0.11693050848950663), np.float64(0.3167152151413462), np.float64(-0.4794280308898093)]
[tensor([0.3309], dtype=torch.float64), tensor([-0.1169], dtype=torch.float64), tensor([0.3167], dtype=torch.float64), tensor([-0.4794], dtype=torch.float64)]





In [238]:
sample_data = []

for q,r in zip(game_data["query"], game_data["response"]):
    temp = {}

    temp["source"] = q
    temp["system_output"] = r

    sample_data.append(temp)

    break

print(sample_data)

[{'source': '[positive] [doctor] karen is a 34 -year-old female with a history of chronic migraines and hypertension who is here today with abdominal pain so hi', 'system_output': " dr. Ochsner, nice to meet you.\n\nDr. Ochsner: good to meet you as well. It's great to see you back in"}]


In [264]:
score = evaluate(sample_data, print_result=True, overall=False)

Evaluating coherence of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.30it/s]


Evaluating consistency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.85it/s]


Evaluating fluency of 1 samples !!!


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.64it/s]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.702612 |
| consistency | 0.684789 |
|   fluency   | 0.566372 |
+-------------+----------+





In [270]:
score

array([[0.70261155, 0.68478878, 0.56637162]])

In [268]:
print(type(score[0][0]))

<class 'numpy.float64'>


In [280]:
weights = np.array([1, 2, 3]) #'coherence', 'consistency', 'fluency'
weighted_score = []

for array1 in score:
    result = np.where(
        array1 < 0.5,          # Condition
        -array1 * weights,     # If True: make product negative
        array1 * weights       # If False: normal multiplication
    )
    
    sum_products = np.sum(result)
    final_result = sum_products/3
    
    weighted_score.append(final_result)

        
print(weighted_score)

[np.float64(1.2571013210531137)]
