In [6]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118


In [1]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is available
print(torch.version.cuda)         # Should print the installed CUDA version
print(torch.backends.cudnn.enabled)  # Should be True if cuDNN is enabled

True
11.8
True


In [3]:
!nvidia-smi

Tue Mar 18 17:44:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.61                 Driver Version: 572.61         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000             WDDM  |   00000000:47:00.0 Off |                  Off |
| 30%   35C    P8             21W /  300W |     499MiB /  49140MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import torch
import bitsandbytes as bnb
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:42:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0


In [9]:
import torch
import bitsandbytes as bnb
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset


# 1) 4-bit quant config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2) Load base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("HPAI-BSC/Qwen2.5-Aloe-Beta-7B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 3) Apply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)
model = get_peft_model(base_model, lora_config)

# 4) Enable gradient checkpointing
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
base_model.config.use_cache = False

# 5) Prepare data with smaller sequence length
notechat = load_dataset("akemiH/NoteChat")
trainB = notechat["train"].select(range(3000))
evalB = notechat["train"].select(range(3000, 3500))

MAX_LENGTH = 256  # smaller sequence length

class ConversationSummaryDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, tokenizer, max_length=MAX_LENGTH):
        self.data = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        conversation = str(self.data[idx]["conversation"])
        summary = str(self.data[idx]["data"])
        text = f"CONVERSATION:\n{conversation}\n\nSUMMARY: {summary}"

        encodings = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        input_ids = encodings["input_ids"].squeeze()
        attention_mask = encodings["attention_mask"].squeeze()

        labels = input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

train_dataset = ConversationSummaryDataset(trainB, tokenizer)
eval_dataset = ConversationSummaryDataset(evalB, tokenizer)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|█████████████████████████████████████████████████████| 4/4 [08:45<00:00, 131.43s/it]
Loading checkpoint shards: 100%|███████████████████████████████████████████████| 4/4 [00:11<00:00,  2.77s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Repo card metadata block was not found. Setting CardData to empty.
Generating train split: 100%|██████████████████████████████| 207001/207001 [00:13<00:00, 14997.44 examples/s]


In [11]:
# 6) Training arguments
training_args = TrainingArguments(
    output_dir="./aloe_qwen",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,      # accumulate to compensate for small batch
    evaluation_strategy="epoch",       # you can skip or reduce evaluation
    fp16=True,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    gradient_checkpointing=True,       # from the Trainer side as well
)

# 7) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# 8) Train
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.1633,1.149429
2,1.1226,1.14094


TrainOutput(global_step=750, training_loss=1.163391393025716, metrics={'train_runtime': 3049.7956, 'train_samples_per_second': 1.967, 'train_steps_per_second': 0.246, 'total_flos': 6.5209332400128e+16, 'train_loss': 1.163391393025716, 'epoch': 2.0})

In [13]:
model.save_pretrained("./aloe_qwen")
tokenizer.save_pretrained("./aloe_qwen")

('./aloe_qwen\\tokenizer_config.json',
 './aloe_qwen\\special_tokens_map.json',
 './aloe_qwen\\vocab.json',
 './aloe_qwen\\merges.txt',
 './aloe_qwen\\added_tokens.json',
 './aloe_qwen\\tokenizer.json')

In [3]:
import torch
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel

# -- Path to your LoRA weights + tokenizer --
model_dir = "./aloe_qwen"  

# -- 4-bit quantization config (same as training) --
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# -- 1) Load the *base* Qwen2.5 model in 4-bit --
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)

# -- 2) Load your fine-tuned LoRA adapters into the base model --
# The directory should contain adapter_model.bin, adapter_config.json, etc.
lora_model = PeftModel.from_pretrained(
    base_model, 
    model_dir,
    device_map="auto",
)

# -- 3) Load the tokenizer you saved to ./aloe_qwen --
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


Loading checkpoint shards: 100%|███████████████████████████████████████████████| 4/4 [00:11<00:00,  2.93s/it]


In [5]:
from transformers import pipeline

# 1) Create a conversation + summary prompt
conversation_text = """
Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?
Patient: Not too good, doctor. I've been feeling really sick lately.
Doctor: I understand. Can you tell me what symptoms you're experiencing?
Patient: Yes, I've been having a fever, a dry cough, and dyspnea.
Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?
Patient: Yes, that's correct.
Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?
Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.
Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?
Patient: Yes, that's right.
Doctor: And I see that this approach increased your oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%.
Patient: Yes, that's correct.
Doctor: Good. We also had to adapt your breathing exercises to avoid prolonged coughing and oxygen desaturation. Can you tell me more about that?
Patient: Yes, I was instructed to stop every deep breath before coughing and to hold my breath for better air distribution.
Doctor: I see that you performed the breathing exercises well and managed to increase your oxygen saturation.
Patient: Yes, I did my best.
Doctor: You also had difficulty maintaining sufficient oxygen saturation during physical activity, is that correct?
Patient: Yes, I did. But with close monitoring and frequent breaks, I was able to perform low-level strength and walking exercises without any significant deoxygenation.
Doctor: I see that your exercise progression was low on days 1 to 5, but then increased daily until your hospital discharge to a rehabilitation clinic on day 10.
Patient: Yes, that's correct.
Doctor: Great. I'd like to keep monitoring your progress and see how you're doing. Can you keep me updated on any changes in your symptoms?
Patient: Yes, of course, doctor.
Doctor: Alright, let's keep in touch. If you have any questions or concerns, don't hesitate to reach out to me.
Patient: Thank you, doctor.
"""
prompt = f"""
CONVERSATION:
{conversation_text}

SUMMARY:
- **Symptoms**:
  (Include symptoms mentioned in the conversation, if any.)

- **Physical Activities**:
  (Include details about physical activities or excercise habits mentioned in the conversation, if any.)

- **Medical History**:
  (Include any relavent medical history mentioned in the conversation, if any.)

- **Surgical History**
  (Include any details about past surgeries mentioned in the conversation, if any.)

- **Treatment Plan History**
  (Include any details about past treatment plans or suggestions made by healthcare professionals, including medications, surgeries, therapies, or lifestyle changes, if any.)

Provide the summary in plain text with the above attributes, and include information only if it is mentioned in the conversation. If any attribute is not discussed, omit it. Only include the summary in the generated response.Format the summary properly.\n\n
"""

# Create a text-generation pipeline
# (You can name it 'summarizer' if you wish, but it's using text-generation.)
text_gen_pipeline = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    device_map="auto",   # place on GPU if available
)
# Now generate text using the pipeline
response = text_gen_pipeline(
    prompt,
    max_new_tokens=400,
    do_sample=False,  # Deterministic for demonstration
)

# The output is a list of dict(s); each has "generated_text"
generated_text = response[0]["generated_text"]
# print("FULL OUTPUT:\n", generated_text)

# If the model replicates your training format, you'll see something like:
# "CONVERSATION: ... SUMMARY: <the actual summary text>"
# You can strip off everything before "SUMMARY:" if you only want the summary.
if "Summary:" in generated_text:
    final_summary = generated_text.split("Summary:", 1)[-1].strip()
else:
    final_summary = generated_text

print("\nEXTRACTED SUMMARY:\n", final_summary)


Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausa


EXTRACTED SUMMARY:
 - **Symptoms**: Fever, dry cough, dyspnea
- **Physical Activities**: Position changes, deep breathing, breathing exercises, strength and walking exercises
- **Medical History**: No relevant information provided
- **Surgical History**: No relevant information provided
- **Treatment Plan History**: Physical therapy, gradual position changes, breathing exercises adapted to avoid coughing and oxygen desaturation, low-level strength and walking exercises with frequent breaks to prevent deoxygenation. Exercise progression increased daily until hospital discharge to a rehabilitation clinic. Doctor will continue to monitor patient's progress and update on any changes in symptoms. 1. Patient presented with symptoms of fever, dry cough, and dyspnea.
2. He was diagnosed with moderate ARDS from COVID-19 and hospitalized.
3. During physical therapy, he experienced difficulties with position changes and deep breathing, leading to coughing and shortness of breath.
4. To avoid rap

In [55]:
conversation_text = """
Doctor: Hello, I am Dr. Smith. Can you tell me what brings you to the hospital today?
Patient: Yes, I have been feeling very weak and sick for the past two weeks. I have a persistent fever and dry cough.
Doctor: I see. And how is your breathing?
Patient: It's been shallow and rapid, especially when I am at rest. And I get severely breathless even with minor physical activities.
Doctor: Okay. I understand. You were given physical therapy, right?
Patient: Yes, they focused on educating me about dyspnea-relieving positions and the importance of regular mobilization and deep-breathing exercises.
Doctor: That's good. And how did it go?
Patient: It became evident that my anxiety from fear of dying and worries about my future was making my dyspnea worse. I was so dyspneic, anxious, and weak that I could barely walk to the toilet.
Doctor: I see. But your physical therapist helped you with that, right?
Patient: Yes, they listened to me, explained why I was experiencing breathlessness, and tested suitable positions to relieve my dyspnea. I felt better after the education and relaxing breathing exercises.
Doctor: That's great to hear. Can you tell me more about the improvement?
Patient: On day 2, my respiratory rate reduced from 30 breaths/min to 22 breaths/min and my oxygen saturation increased from 92% to 96% on 4 L/min oxygen after some deep-breathing exercises.
Doctor: That's impressive. And how did it go after that?
Patient: My dyspnea and anxiety started to alleviate and I regained my self-confidence. The therapy was shifted to walking and strength training, and I was able to walk 350 m without a walking aid or supplemental oxygen before my discharge home.
Doctor: That's fantastic. You have made a great progress. I am glad to hear that.
"""
prompt = f"""
CONVERSATION:
{conversation_text}

SUMMARY:
- **Symptoms**:
  (Include symptoms mentioned in the conversation, if any.)

- **Physical Activities**:
  (Include details about physical activities or excercise habits mentioned in the conversation, if any.)

- **Medical History**:
  (Include any relavent medical history mentioned in the conversation, if any.)

- **Surgical History**
  (Include any details about past surgeries mentioned in the conversation, if any.)

- **Treatment Plan History**
  (Include any details about past treatment plans or suggestions made by healthcare professionals, including medications, surgeries, therapies, or lifestyle changes, if any.)

Provide the summary in plain text with the above attributes, and include information only if it is mentioned in the conversation. If any attribute is not discussed, omit it. Only include the summary in the generated response.Format the summary properly.\n\n
"""

response = text_gen_pipeline(
    prompt,
    max_new_tokens=400,
    do_sample=False,  # Deterministic for demonstration
)

# The output is a list of dict(s); each has "generated_text"
generated_text = response[0]["generated_text"]
# print("FULL OUTPUT:\n", generated_text)

# If the model replicates your training format, you'll see something like:
# "CONVERSATION: ... SUMMARY: <the actual summary text>"
# You can strip off everything before "SUMMARY:" if you only want the summary.
if "SUMMARY:" in generated_text:
    final_summary = generated_text.split("SUMMARY:", 1)[-1].strip()
else:
    final_summary = generated_text

print("\nEXTRACTED SUMMARY:\n", final_summary)


EXTRACTED SUMMARY:
 - **Symptoms**:
  (Include symptoms mentioned in the conversation, if any.)

- **Physical Activities**:
  (Include details about physical activities or excercise habits mentioned in the conversation, if any.)

- **Medical History**:
  (Include any relavent medical history mentioned in the conversation, if any.)

- **Surgical History**
  (Include any details about past surgeries mentioned in the conversation, if any.)

- **Treatment Plan History**
  (Include any details about past treatment plans or suggestions made by healthcare professionals, including medications, surgeries, therapies, or lifestyle changes, if any.)

Provide the summary in plain text with the above attributes, and include information only if it is mentioned in the conversation. If any attribute is not discussed, omit it. Only include the summary in the generated response.Format the summary properly.


Summary: The patient presented with a persistent fever, dry cough, and severe breathlessness. Sh

In [7]:
import evaluate
from tqdm import tqdm

# rouge = evaluate.load("rouge")

In [2]:
import torch
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from peft import PeftModel

# -- Path to your LoRA weights + tokenizer --
model_dir = "./aloe_qwen"  

# -- 4-bit quantization config (same as training) --
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# -- 1) Load the *base* Qwen2.5 model in 4-bit --
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)

# -- 2) Load your fine-tuned LoRA adapters into the base model --
# The directory should contain adapter_model.bin, adapter_config.json, etc.
lora_model = PeftModel.from_pretrained(
    base_model, 
    model_dir,
    device_map="auto",
)

# -- 3) Load the tokenizer you saved to ./aloe_qwen --
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.71s/it]


In [None]:
from transformers import pipeline
summarizer = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    device_map="auto",  # Automatically place on GPU if available
)

notechat = load_dataset("akemiH/NoteChat")
eval_df = notechat["train"].select(range(5000, 7001)).to_pandas()

# 2) Clean up missing data if present
if eval_df.isnull().values.any():
    print("Found missing values in the evaluation set. Dropping them.")
    eval_df = eval_df.dropna()

# 3) Prepare batching parameters
batch_size = 4
num_samples = len(eval_df)
num_batches = (num_samples // batch_size) + int(num_samples % batch_size != 0)

# Lists to store predictions and references
predictions = []
references = []

# 4) Batching loop to generate summaries
for i in tqdm(range(num_batches), desc="Generating Summaries"):
    start = i * batch_size
    end = min(start + batch_size, num_samples)
    
    # Extract conversation and reference summary columns
    batch_conversations = eval_df["conversation"][start:end].tolist()
    batch_refs = eval_df["data"][start:end].tolist()  # "data" column for references
    
    # Prepare prompts replicating training format
    prompts = [
        f"CONVERSATION:\n{conv}\n\nSUMMARY:"
        for conv in batch_conversations
    ]
    
    # Generate summaries
    results = summarizer(
        prompts,
        max_new_tokens=100,
        do_sample=False,
        truncation=True,
        num_return_sequences=1  # Default is 1
    )
    
    # Parse results and extract summaries
    for item in results:
        # 'item' is a list with 1 dict => {"generated_text": "..."}
        output_dict = item[0]
        full_text = output_dict["generated_text"]
        
        # Extract only the part after "SUMMARY:"
        if "SUMMARY:" in full_text:
            extracted_summary = full_text.split("SUMMARY:", 1)[-1].strip()
        else:
            extracted_summary = full_text  # Fallback if marker not found
        
        predictions.append(extracted_summary)
    
    # Append the reference texts
    references.extend(batch_refs)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'Jam

In [None]:
# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

print("ROUGE scores:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")

###### predictions[0]

In [12]:
predictions[1021]

'The patient presented with a 12-month history of a purple nodular rash on her nasolabial folds. She reported no itching, burning, pain, or bleeding from the area, but did report drainage of clear fluid when applying pressure to the area. She has a history of cirrhosis, iron deficiency anemia, type II diabetes mellitus, hypertension, asthma, and endometrioid carcinoma of the ovary. A 3-mm punch biopsy of the right nasolab'

In [15]:
src_list = eval_df["conversation"].tolist()
ref_list = eval_df["data"].tolist()

In [16]:
import sys
sys.path.append("/home/jovyan/UniEval") 
from utils import convert_to_json
from metric.evaluator import get_evaluator
data = convert_to_json(
    src_list=src_list,
    ref_list=ref_list,
    output_list=output_list
)


In [13]:
output_list = []
for pred in predictions:
    # Ensure that "Summary:" exists in the string to avoid errors
    if len(pred) > 0:
        output_list.append(pred)
    else:
        # Handle cases where "Summary:" is missing (optional)
        output_list.append("")  # Or handle differently based on your needs

In [17]:
filtered_data = [
    entry for entry in data
    if entry["system_output"].strip()  # Ensure non-empty system_output
]

In [None]:
filtered_data

In [18]:
import sys
sys.path.append("/home/jovyan/UniEval") 
from utils import convert_to_json
from metric.evaluator import get_evaluator
task = "summarization"
evaluator = get_evaluator(task)


In [20]:
eval_scores = evaluator.evaluate(filtered_data, print_result=True)

Evaluating coherence of 2001 samples !!!


  0%|          | 0/251 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
100%|██████████| 251/251 [05:44<00:00,  1.37s/it]


Evaluating consistency of 2001 samples !!!


100%|██████████| 1171/1171 [21:27<00:00,  1.10s/it]


Evaluating fluency of 2001 samples !!!


100%|██████████| 1171/1171 [01:30<00:00, 12.97it/s]


Evaluating relevance of 2001 samples !!!


100%|██████████| 251/251 [03:41<00:00,  1.14it/s]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.966174 |
| consistency | 0.884436 |
|   fluency   | 0.886275 |
|  relevance  | 0.820143 |
|   overall   | 0.889257 |
+-------------+----------+





In [19]:
import json

# Suppose `filtered_data` is the final list of dicts you want to store:
# filtered_data = [
#   {"source": "...", "reference": "...", "system_output": "..."},
#   ...
# ]

with open("evaluation_data1.json", "w", encoding="utf-8") as f:
    # Write list of dictionaries to JSON file
    json.dump(filtered_data, f, indent=2, ensure_ascii=False)

print("Data saved to evaluation_data1.json")


Data saved to evaluation_data1.json


In [None]:
import json

with open("evaluation_data.json", "r", encoding="utf-8") as f:
    filtered_data = json.load(f)

print("Data loaded. Number of samples:", len(filtered_data))
