In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4324fed7874291726d7a4d177540fb271a78e3ef036c3e96e91a2c04a139c218
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import numpy as np
import torch
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft import PeftModel, PeftConfig
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Loading the fine-tuned model parameters:

In [None]:
# Hugging Face Hub repository id. The tokenizer, the PEFT config, and the
# adapter weights in the cells below are all loaded from this one repository.
hf_repo_id = "PavansaiGundaram/fine_tuned_qwen_medical_qa_updated_version"

In [None]:
# Load tokenizer
# NOTE(review): trust_remote_code=True lets code shipped in the Hub repository
# run locally — confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(hf_repo_id, trust_remote_code=True)
# Fall back to the EOS token for padding when the tokenizer defines no pad
# token; generate_response passes tokenizer.pad_token_id to model.generate.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initializing the model:

In [None]:
# Load the fine-tuned LoRA model
# The adapter config names the base checkpoint the adapter must be attached to.
peft_config = PeftConfig.from_pretrained(hf_repo_id)

# Half precision when a CUDA device is available, full precision otherwise.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# NOTE(review): trust_remote_code=True lets code shipped in the Hub repository
# run locally — confirm it is actually required for this base model.
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    device_map="auto",
    torch_dtype=model_dtype,
    trust_remote_code=True,
)

# Attach the adapter weights without making them trainable, then switch the
# module to eval mode for inference. model.eval() stays the last expression so
# the cell still displays the module structure.
model = PeftModel.from_pretrained(base_model, hf_repo_id, is_trainable=False)
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=896, out_features=896, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
     

# Loading the test dataset:

In [None]:
# Load the test dataset
ds = load_dataset("eswardivi/medical_qa")

# Drop the existing "input" column, then expose "instruction" under that name
# so prompt_instruction_format can read sample['input'].
ds = ds.remove_columns(["input"])
ds = ds.rename_column("instruction", "input")

# Fixed-seed shuffled split (6000 train rows / 307 test rows); only the test
# rows are used in this notebook.
# NOTE(review): presumably these sizes and the seed mirror the fine-tuning
# notebook so the test rows were never trained on — confirm.
train_test_split = ds["train"].train_test_split(
    train_size=6000,
    test_size=307,
    shuffle=True,
    seed=42,
)
test_dataset = train_test_split["test"]

In [None]:
# Function to format the prompt for evaluation
def prompt_instruction_format(sample):
    """Build the evaluation prompt for one dataset sample.

    Args:
        sample: Mapping with an 'input' entry holding the question text.

    Returns:
        A string made of three sections separated by blank lines:
        "### Instruction:" with the fixed task description, "### Input:" with
        the sample's input, and an empty "### Response:" section ending in a
        newline for the model to complete.
    """
    task_description = (
        "Provide a concise and accurate medical answer in one sentence based on the input below. "
        "If the information is unknown, respond with 'I don’t know.'"
    )
    sections = (
        f"### Instruction:\n{task_description}",
        f"### Input:\n{sample['input']}",
        "### Response:\n",
    )
    return "\n\n".join(sections)

# Defining the function that generates a response:

In [None]:
# Function to generate model response
# The recorded sample predictions show the model writing a literal
# "<|END_OF_TEXT|}" marker (and sometimes repeating it) after its answer.
# Matching on the shared prefix catches both the "}" and ">" spellings.
_END_MARKER_PREFIX = "<|END_OF_TEXT|"


def generate_response(input_text, max_new_tokens=50):
    """Generate the model's answer for a fully formatted prompt.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        input_text: Prompt string, e.g. from prompt_instruction_format.
        max_new_tokens: Maximum number of tokens to generate (default 50).

    Returns:
        The generated answer only: the prompt is removed, anything from the
        first literal end marker onwards is dropped, and surrounding
        whitespace is stripped.
    """
    # Prompts longer than 256 tokens are truncated.
    # NOTE(review): presumably truncation is from the right (tokenizer
    # default), which would cut the trailing "### Response:" header — confirm.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            num_beams=3,
            early_stopping=True
        )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "### Response:", which
    # fails when truncation removed that header from the prompt.
    generated_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Keep only the text before the first end marker so the marker and any
    # repeated continuation do not distort BLEU / ROUGE / exact match.
    response = response.split(_END_MARKER_PREFIX, 1)[0]
    return response.strip()

# Evaluation metrics:

In [None]:
# Evaluation metrics
def compute_metrics(predictions, references):
    """Score generated answers against reference answers.

    Args:
        predictions: Iterable of generated answer strings.
        references: Iterable of reference answer strings, aligned
            index-for-index with `predictions`.

    Returns:
        Dict with keys "avg_bleu", "avg_rouge1", "avg_rougeL" and
        "exact_match_accuracy", each the mean over all pairs. Every value is
        NaN when there are no pairs to score.

    Raises:
        ValueError: If `predictions` and `references` differ in length.
    """
    predictions = list(predictions)
    references = list(references)

    # zip() would silently drop the unmatched tail and report metrics over a
    # subset of the data, so reject misaligned inputs up front.
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    # Means over zero pairs are undefined; return NaN explicitly instead of
    # letting np.mean([]) emit a RuntimeWarning.
    if not predictions:
        nan = float("nan")
        return {
            "avg_bleu": nan,
            "avg_rouge1": nan,
            "avg_rougeL": nan,
            "exact_match_accuracy": nan
        }

    bleu_scores = []
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rougeL_scores = []
    exact_matches = []
    smoothing = SmoothingFunction().method1  # Smoothing to handle zero BLEU scores

    for pred, ref in zip(predictions, references):
        # BLEU score with smoothing, on whitespace-separated tokens
        bleu = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing)
        bleu_scores.append(bleu)

        # ROUGE score (F-measure of ROUGE-1 and ROUGE-L)
        rouge_scores = rouge_scorer_instance.score(ref, pred)
        rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
        rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

        # Exact match, ignoring leading/trailing whitespace
        exact_matches.append(1 if pred.strip() == ref.strip() else 0)

    return {
        "avg_bleu": float(np.mean(bleu_scores)),
        "avg_rouge1": float(np.mean(rouge1_scores)),
        "avg_rougeL": float(np.mean(rougeL_scores)),
        "exact_match_accuracy": float(np.mean(exact_matches))
    }

# Performing predictions:

In [None]:
# Evaluate on test dataset
# Runs one generate_response call per test sample. The recorded run took about
# 1h40m for 307 samples (~20 s each), so avoid re-running this cell needlessly.
predictions = []  # generated answers, in test_dataset order
references = []  # ground-truth 'output' strings, aligned index-for-index with predictions

print("Generating predictions for test dataset...")
for sample in tqdm(test_dataset, desc="Evaluating"):
    prompt = prompt_instruction_format(sample)
    pred = generate_response(prompt)
    predictions.append(pred)
    references.append(sample['output'])

Generating predictions for test dataset...


Evaluating: 100%|██████████| 307/307 [1:40:53<00:00, 19.72s/it]


In [None]:

# Compute metrics
metrics = compute_metrics(predictions, references)

# Print evaluation results, one line per metric, to four decimal places
print("\nEvaluation Results:")
for label, key in (
    ("Average BLEU Score", "avg_bleu"),
    ("Average ROUGE-1 F1 Score", "avg_rouge1"),
    ("Average ROUGE-L F1 Score", "avg_rougeL"),
    ("Exact Match Accuracy", "exact_match_accuracy"),
):
    print(f"{label}: {metrics[key]:.4f}")

# Print sample predictions for debugging: the first five (or fewer) test rows,
# each with its input, prediction, reference, and tokenized lengths
print("\nSample Predictions vs References (First 5):")
for sample_number, prediction in enumerate(predictions[:5], start=1):
    row = test_dataset[sample_number - 1]
    reference = row['output']
    print(f"\nSample {sample_number}:")
    print(f"Input: {row['input']}")
    print(f"Prediction: {prediction}")
    print(f"Reference: {reference}")
    print(f"Tokenized Prediction Length: {len(tokenizer.encode(prediction))}")
    print(f"Tokenized Reference Length: {len(tokenizer.encode(reference))}")


Evaluation Results:
Average BLEU Score: 0.0424
Average ROUGE-1 F1 Score: 0.2351
Average ROUGE-L F1 Score: 0.1889
Exact Match Accuracy: 0.0000

Sample Predictions vs References (First 5):

Sample 1:
Input: Hello, sir, your jaw is in pain, especially when you chew or yawn.
Prediction: The symptoms seem to indicate that you're suffering from jaw pain. <|END_OF_TEXT|}
Reference: In order to confirm the diagnosis, you need to undergo a diagnostic procedure such as an ophthalmology, evaluation, diagnosis, and ophthalmology, as well as a diagnostic procedure for ophthalmology, treatment, nose, mouth, and pharynx, which may require a central line of veins to be inserted to help diagnose and treat the disorder.
Tokenized Prediction Length: 19
Tokenized Reference Length: 71

Sample 2:
Input: I will try anything but respectfully how do you account for the desat cycle to 70s for an hr and a half after thr bipap is off?
Prediction: I'm not sure what you mean by "desat cycle" <|END_OF_TEXT|} <|END_