In [1]:
# Evaluation samples, written as (ground-truth text, generated text) pairs.
# Unzipping the pairs produces two index-aligned lists: gt_texts[i] is the
# reference for gen_texts[i].
gt_texts, gen_texts = map(list, zip(
    ("The quick brown fox jumps over the lazy dog.",
     "The quick brown fox jumps over the lazy dog."),
    ("The quick brown fox jumps over the lazy dog.",
     "The fast brown fox leaped over the lazy dog."),
    ("To be or not to be, that is the question.",
     "Being or not being, that's what's being questioned."),
    ("All that glitters is not gold.",
     "Everything that sparkles isn't necessarily gold."),
    ("The patient has cardiomegaly",
     "The patient has cardiomegaly"),
    ("The patient has cardiomegaly",
     "The patient has no cardiomegaly"),
    ("The patient",
     "The subject"),
))

### CIDEr

In [2]:
from pycocoevalcap.cider import cider_scorer

In [3]:
# Feed every (generated, reference) pair to one scorer, then ask it for the
# mean score and the per-pair scores in a single call.
scorer = cider_scorer.CiderScorer(n=4)
for hypothesis, reference in zip(gen_texts, gt_texts):
    # Each hypothesis is registered with a one-element list of references.
    scorer += (hypothesis, [reference])
mean_score, scores = scorer.compute_score()
print('mean_score:', mean_score)
print('scores:', scores)

mean_score: 4.050422474728564
scores: [10.          3.30879368  0.74854162  0.73773126 10.          3.40067913
  0.15721163]


### ROUGE

In [4]:
from pycocoevalcap.rouge import rouge

In [5]:
# Score each pair independently (one candidate against one reference), then
# average the per-pair scores by hand.
scorer = rouge.Rouge()
scores = [
    scorer.calc_score([hypothesis], [reference])
    for hypothesis, reference in zip(gen_texts, gt_texts)
]
mean_score = sum(scores) / len(scores)
print('mean_score:', mean_score)
print('scores:', scores)

mean_score: 0.6765759215706109
scores: [1.0, 0.7777777777777778, 0.21785714285714283, 0.3333333333333333, 1.0, 0.9070631970260222, 0.5]


### BLEU

In [6]:
from pycocoevalcap.bleu import bleu_scorer

In [7]:
# Accumulate all pairs in one scorer; compute_score() returns the aggregate
# scores plus per-sample lists indexed as scores[order - 1][sample].
scorer = bleu_scorer.BleuScorer(n=4)
for hypothesis, reference in zip(gen_texts, gt_texts):
    scorer += (hypothesis, [reference])
mean_score, scores = scorer.compute_score()
print('mean_score:', mean_score)
print('scores:')
for sample_idx in range(len(gen_texts)):
    # Pull this sample's four per-order values out of the per-order lists.
    b1, b2, b3, b4 = (scores[order][sample_idx] for order in range(4))
    # Plain arithmetic mean of the four values for this sample.
    b = (b1+b2+b3+b4)/4
    print(f'b1: {b1:.4f}, b2: {b2:.4f}, b3: {b3:.4f}, b4: {b4:.4f}, b: {b:.4f}')

mean_score: [0.6589154193334064, 0.5673484877926362, 0.506750741961736, 0.45857604536602137]
scores:
b1: 1.0000, b2: 1.0000, b3: 1.0000, b4: 1.0000, b: 1.0000
b1: 0.7778, b2: 0.6236, b3: 0.4807, b4: 0.3689, b: 0.5628
b1: 0.1947, b2: 0.1472, b3: 0.0000, b4: 0.0000, b: 0.0855
b1: 0.3333, b2: 0.0000, b3: 0.0000, b4: 0.0000, b: 0.0833
b1: 1.0000, b2: 1.0000, b3: 1.0000, b4: 1.0000, b: 1.0000
b1: 0.8000, b2: 0.6325, b3: 0.5109, b4: 0.0001, b: 0.4859
b1: 0.5000, b2: 0.0000, b3: 0.0000, b4: 0.0000, b: 0.1250


### METEOR

In [8]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

In [9]:
# meteor_score is called on token lists here, so tokenize both corpora first.
tokenized_gt_texts = list(map(word_tokenize, gt_texts))
tokenized_gen_texts = list(map(word_tokenize, gen_texts))
# One score per pair: a single tokenized reference against the tokenized hypothesis.
scores = [
    meteor_score([reference_tokens], hypothesis_tokens)
    for hypothesis_tokens, reference_tokens in zip(tokenized_gen_texts, tokenized_gt_texts)
]
mean_score = sum(scores) / len(scores)
print('mean_score:', mean_score)
print('scores:', scores)

mean_score: 0.7408071940279891
scores: [0.9995, 0.9995, 0.5852864583333333, 0.4445422535211268, 0.9921875, 0.9146341463414633, 0.25]


### BERTScore

In [15]:
from bert_score import BERTScorer
import torch
import torch.nn as nn

In [16]:
class BertScore(nn.Module):
    """nn.Module wrapper around a preconfigured ``BERTScorer``.

    The scorer is built once in the constructor with a fixed configuration
    (distilbert-base-uncased, baseline rescaling enabled) and reused for every
    call to ``forward``.
    """

    def __init__(self):
        super().__init__()
        # All scorer settings in one place; passed through unchanged.
        scorer_options = dict(
            model_type='distilbert-base-uncased',
            num_layers=5,
            batch_size=64,
            nthreads=4,
            all_layers=False,
            idf=False,
            device=None,
            lang='en',
            rescale_with_baseline=True,
            baseline_path=None,
        )
        with torch.no_grad():
            self.bert_scorer = BERTScorer(**scorer_options)

    def forward(self, refs, hyps):
        """Score hypotheses against references.

        Args:
            refs: reference texts, forwarded to the scorer as ``refs``.
            hyps: candidate texts, forwarded to the scorer as ``cands``.

        Returns:
            A ``(mean_f, f_list)`` tuple: the mean of the F tensor as a Python
            float, and the F tensor converted to a list.
        """
        # The scorer returns three tensors; only the third (F) is used.
        _precision, _recall, f_scores = self.bert_scorer.score(
            cands=hyps,
            refs=refs,
            verbose=False,
            batch_size=64,
        )
        return f_scores.mean().item(), f_scores.tolist()

In [17]:
# Build the scorer once and reuse it; the captured output of this cell shows the
# model files being downloaded during construction.
bert_score = BertScore()

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# BertScore.forward is declared as forward(refs, hyps). Passing the lists by
# keyword makes the ground truth the reference and the generated text the
# candidate; the previous positional call had the two swapped.
x, y = bert_score(refs=gt_texts, hyps=gen_texts)
print(x) # average f1
print(y) # instance f1s

0.802782416343689
[0.9999998211860657, 0.9180418848991394, 0.5307456851005554, 0.7448107004165649, 1.0, 0.9339528679847717, 0.491926372051239]


### CheXbert

In [28]:
from f1chexbert import F1CheXbert
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import numpy as np
import os
from pprint import pprint

In [20]:
# Parameters of the two-modulus polynomial string hash computed by hash_string.
_B = 1777771  # polynomial base
_M = [999727999, 1070777777]  # one modulus per hash component
_MASK32 = (1 << 32) - 1  # NOTE(review): not referenced by any visible code in this notebook
_MAXLEN = 1000000  # number of powers precomputed per modulus
_POW = [[None] * _MAXLEN, [None] * _MAXLEN] # _POW[k][i] = _B^i mod _M[k]
# Fill both tables with the recurrence _POW[k][i] = _POW[k][i-1] * _B (mod _M[k]).
# NOTE(review): hash_string does not read _POW, and nothing else visible here
# does either; this loop runs 2 * _MAXLEN iterations on every execution of the
# cell. Confirm whether the table is still needed.
for k in range(2):
    _POW[k][0] = 1
    for i in range(1, _MAXLEN):
        _POW[k][i] = (_POW[k][i - 1] * _B) % _M[k]

def hash_string(s):
    """Return a ``(length, packed_hash)`` key for the string ``s``.

    Two polynomial hashes of ``s`` are computed with base ``_B``, one per
    modulus in ``_M``, and packed into a single integer: the first residue is
    shifted left by 32 bits and OR-ed with the second. The string length is
    returned alongside it.
    """
    packed = 0
    for slot in (0, 1):
        modulus = _M[slot]
        residue = 0
        for ch in s:
            # Horner step: fold the next code point into the running residue.
            residue = (residue * _B + ord(ch)) % modulus
        packed = (packed << 32) | residue
    return (len(s), packed)

In [21]:
# Label names in the order used by the label vectors; "No Finding" is last.
CHEXBERT_LABELS = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
    "No Finding",
]

def merge_labels(labels_list):
    """Combine several per-sentence label vectors into one vector.

    Args:
        labels_list: iterable of label vectors whose last position is the
            "No Finding" flag.

    Returns:
        An ``np.int8`` array with one entry per name in ``CHEXBERT_LABELS``.
        Every position except the last is 1 if any input vector has a 1 there.
        The last position starts at 1 and is cleared as soon as any input
        vector has 0 in its last position.
    """
    no_finding = len(CHEXBERT_LABELS) - 1
    merged = np.zeros((len(CHEXBERT_LABELS),), np.int8)
    merged[no_finding] = 1  # stays 1 unless some vector reports a finding
    for sentence_labels in labels_list:
        if sentence_labels[-1] == 0:
            merged[no_finding] = 0
        # OR together every label except the trailing "No Finding" flag.
        for position, flag in enumerate(sentence_labels[:-1]):
            if flag == 1:
                merged[position] = 1
    return merged

In [31]:
class CheXbertLabeler(F1CheXbert):
    """F1CheXbert subclass that labels whole texts and memoizes the results.

    A text is split into sentences, each sentence is labeled with the inherited
    ``get_label``, and the per-sentence vectors are combined with
    ``merge_labels``. Both per-sentence and per-text results are stored in
    ``self.cache`` under ``hash_string`` keys.
    """

    def __init__(self, device=None):
        super().__init__(device=device)
        # hash_string(text or sentence) -> label vector
        self.cache = dict()

    def get_labels(self, texts):
        """Return the label vectors for ``texts`` as one NumPy array.

        Args:
            texts: sequence of strings; each may contain several sentences.

        Returns:
            ``np.array`` built from one merged label vector per input text, in
            input order.
        """
        output_labels = [None] * len(texts)

        for i, text in tqdm(enumerate(texts), total=len(texts), mininterval=2):
            text_key = hash_string(text)
            if text_key in self.cache:
                # Whole text seen before: reuse its merged labels.
                output_labels[i] = self.cache[text_key]
                continue
            sentence_labels = []
            for sentence in sent_tokenize(text):
                sentence_key = hash_string(sentence)
                labels = self.cache.get(sentence_key, None)
                if labels is None:
                    labels = self.get_label(sentence)
                    self.cache[sentence_key] = labels
                sentence_labels.append(labels)
            output_labels[i] = merge_labels(sentence_labels)
            # NOTE(review): texts and sentences share one key space, so when a
            # text is a single sentence equal to itself this overwrites the
            # per-sentence entry with the merged vector — confirm that is fine.
            self.cache[text_key] = output_labels[i]

        return np.array(output_labels)

In [32]:
def f1(gt_binary_labels, gen_binary_labels):
    """F1 of generated binary labels against ground-truth binary labels.

    Both arguments are NumPy arrays compared elementwise with 0 and 1.
    Precision, recall and the final score each fall back to 0 when their
    denominator is not positive.
    """
    gt_positive = gt_binary_labels == 1
    gen_positive = gen_binary_labels == 1

    tp = np.sum(gen_positive & gt_positive)
    fp = np.sum(gen_positive & (gt_binary_labels == 0))
    fn = np.sum((gen_binary_labels == 0) & gt_positive)

    precision = 0
    if tp + fp > 0:
        precision = tp / (tp + fp)

    recall = 0
    if tp + fn > 0:
        recall = tp / (tp + fn)

    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0

def compute_chexbert_f1_scores(labeler, gen_texts, gt_texts):
    """Label both corpora and compute F1 at several granularities.

    Args:
        labeler: object whose ``get_labels(texts)`` returns a 2-D label array
            (one row per text, one column per entry in ``CHEXBERT_LABELS``).
        gen_texts: generated texts.
        gt_texts: ground-truth texts, index-aligned with ``gen_texts``.

    Returns:
        Dict with keys ``instance_f1s`` (per-text array), ``sample_avg_f1``,
        ``micro_avg_f1`` (F1 over the flattened label arrays), ``class_f1s``
        (label name -> F1) and ``macro_avg_f1`` (mean of the per-class F1s).
    """
    gen_labels = labeler.get_labels(gen_texts)
    gt_labels = labeler.get_labels(gt_texts)
    for label_matrix in (gen_labels, gt_labels):
        print(label_matrix.shape)

    # One F1 per text, comparing row to row.
    instance_f1s = np.array(
        [f1(gt_labels[row], gen_labels[row]) for row in range(len(gen_texts))],
        dtype=np.float64,
    )

    # One F1 over all (text, label) decisions pooled together.
    micro_f1 = f1(gt_labels.flatten(), gen_labels.flatten())

    # One F1 per label, comparing column to column.
    class_f1s = np.array(
        [f1(gt_labels.T[col], gen_labels.T[col]) for col in range(len(CHEXBERT_LABELS))],
        dtype=np.float64,
    )

    return {
        'instance_f1s': instance_f1s,
        'sample_avg_f1': instance_f1s.mean(),
        'micro_avg_f1': micro_f1,
        'class_f1s': {name: class_f1s[col] for col, name in enumerate(CHEXBERT_LABELS)},
        'macro_avg_f1': class_f1s.mean(),
    }

In [33]:
# Label both corpora and pretty-print the resulting metrics.
# NOTE(review): despite its name, f1_scores holds the whole metrics dict
# returned by compute_chexbert_f1_scores; the RadGraph section later rebinds
# the same name to a list.
labeler = CheXbertLabeler()
f1_scores = compute_chexbert_f1_scores(labeler, gen_texts, gt_texts)
pprint(f1_scores)

100%|█████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 85.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 193.21it/s]

(7, 14)
(7, 14)
{'class_f1s': {'Atelectasis': 0.0,
               'Cardiomegaly': 0.6666666666666666,
               'Consolidation': 0.0,
               'Edema': 0.0,
               'Enlarged Cardiomediastinum': 0.0,
               'Fracture': 0.0,
               'Lung Lesion': 0.0,
               'Lung Opacity': 0.0,
               'No Finding': 1.0,
               'Pleural Effusion': 0.0,
               'Pleural Other': 0.0,
               'Pneumonia': 0.0,
               'Pneumothorax': 0.0,
               'Support Devices': 0.0},
 'instance_f1s': array([1., 1., 0., 1., 1., 0., 1.]),
 'macro_avg_f1': 0.11904761904761904,
 'micro_avg_f1': 0.9090909090909091,
 'sample_avg_f1': 0.7142857142857143}





### RadGraph

In [34]:
from radgraph import RadGraph

In [35]:
def extract_entities_and_relations(data):
    """Turn one annotation into a multiset of hashed entities and relations.

    ``data['entities']`` maps 1-based string ids to dicts with ``tokens``,
    ``label`` and ``relations`` (each relation is indexed as
    ``(relation_name, target_id)``).

    Returns:
        Dict mapping ``hash_string`` keys to counts. Each entity contributes
        ``tokens|label``; each relation contributes both ``e1|rel|e2`` and
        ``e1|e2``.
    """
    entities = data['entities']
    e_strings = [None] * len(entities)
    hash2count = dict()

    def _count(text):
        # Increment the occurrence count of the hashed string.
        key = hash_string(text)
        hash2count[key] = hash2count.get(key, 0) + 1

    # First pass: build and count the string form of every entity.
    for entity_id, entity in entities.items():
        slot = int(entity_id) - 1
        e_strings[slot] = f"{entity['tokens']}|{entity['label']}" # tokens|label
        _count(e_strings[slot])

    # Second pass: count every relation, with and without the relation name.
    for entity_id, entity in entities.items():
        source = e_strings[int(entity_id) - 1]
        for relation in entity['relations']:
            target = e_strings[int(relation[1]) - 1]
            _count(f"{source}|{relation[0]}|{target}") # e1|rel|e2
            _count(f"{source}|{target}") # e1|e2
    return hash2count

def jaccard_between_dicts(d1, d2):
    """Jaccard similarity between two multisets stored as ``{key: count}`` dicts.

    The intersection size is the sum over keys of the smaller count; the union
    size is the total of both dicts minus the intersection.

    Returns:
        A float in [0, 1]. Two empty dicts are treated as a perfect match
        (``1.0``). The result is always a float, so callers do not get a mix of
        ``1`` and ``1.0`` in their score lists.
    """
    if len(d1) == 0 and len(d2) == 0:
        return 1.0 # both dicts are empty -> perfect match
    inters_size = sum(min(count, d2.get(key, 0)) for key, count in d1.items())
    union_size = sum(d1.values()) + sum(d2.values()) - inters_size
    # Holds whenever at least one dict has a positive count.
    assert union_size > 0
    return inters_size / union_size

def f1_between_dicts(gt_dict, pred_dict):
    """F1 between two multisets stored as ``{key: count}`` dicts.

    The overlap is the sum over keys of the smaller count. Precision divides it
    by the total predicted count, recall by the total ground-truth count.

    Returns:
        A float in [0, 1]. Two empty dicts are treated as a perfect match
        (``1.0``); no overlap gives ``0.0``. The result is always a float, so
        callers do not get a mix of ``0``/``1`` and ``0.0``/``1.0``.
    """
    if len(gt_dict) == 0 and len(pred_dict) == 0:
        return 1.0 # both dicts are empty -> perfect match
    inters_size = sum(min(count, pred_dict.get(key, 0)) for key, count in gt_dict.items())
    if inters_size == 0:
        # No overlap: precision and recall are both zero.
        return 0.0
    # A positive overlap implies both totals are positive.
    p = inters_size / sum(pred_dict.values())
    r = inters_size / sum(gt_dict.values())
    return 2 * p * r / (p + r)

def eval_radgraph(gen_texts, gt_texts):
    """Compare generated and ground-truth texts via their RadGraph annotations.

    Every distinct text is annotated once; each (ground truth, generated) pair
    is then scored on the entity/relation multisets of its two texts.

    Returns:
        ``(f1_scores, jaccard_scores)``: two lists with one score per pair, in
        input order.
    """
    model = RadGraph()

    # Deduplicate so each distinct text is annotated only once.
    unique_texts = set()
    for corpus in (gen_texts, gt_texts):
        unique_texts.update(corpus)
    sentences = list(unique_texts)
    position_of = {text: pos for pos, text in enumerate(sentences)}

    # Annotations are looked up by the stringified position in `sentences`.
    annotations = model(sentences)
    ent_rel_dicts = [
        extract_entities_and_relations(annotations[str(pos)])
        for pos in range(len(sentences))
    ]

    f1_scores, jaccard_scores = [], []
    for reference, hypothesis in zip(gt_texts, gen_texts):
        reference_items = ent_rel_dicts[position_of[reference]]
        hypothesis_items = ent_rel_dicts[position_of[hypothesis]]
        f1_scores.append(f1_between_dicts(reference_items, hypothesis_items))
        jaccard_scores.append(jaccard_between_dicts(reference_items, hypothesis_items))

    return f1_scores, jaccard_scores

In [36]:
# Score every (ground truth, generated) pair with RadGraph.
# NOTE(review): this rebinds f1_scores, which the CheXbert section used for a
# metrics dict; from here on it is a list with one F1 per pair.
f1_scores, jaccard_scores = eval_radgraph(gen_texts, gt_texts)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
# Show the per-pair RadGraph scores, in the same order as gt_texts / gen_texts.
print('f1_scores:', f1_scores)
print('jaccard_scores:', jaccard_scores)

f1_scores: [1.0, 0.4444444444444444, 1, 0, 1.0, 0, 1]
jaccard_scores: [1.0, 0.2857142857142857, 1, 0.0, 1.0, 0.0, 1]


### RadCliQ

In [2]:
from medvqa.metrics.medical.radcliq import invoke_radcliq_process

In [5]:
# Run RadCliQ on the same pairs. The captured log shows the helper shelling out
# to run_metric.py in a separate conda environment and taking about 116 seconds.
# NOTE(review): device_id is hard-coded to "1" — presumably a GPU index; confirm
# it exists on the machine running this notebook.
out = invoke_radcliq_process(gt_reports=gt_texts, gen_reports=gen_texts, device_id="1", verbose=True)

Running RadCliQ over 7 reports ...
	Command = cd /home/pamessina/CXR-Report-Metric/ && conda run -n radcliqenv python run_metric.py --gt_reports_path "/mnt/data/pamessina/workspaces/medvqa-workspace/tmp/radcliq/radcliq-gt_20240613_213301.csv" --gen_reports_path "/mnt/data/pamessina/workspaces/medvqa-workspace/tmp/radcliq/radcliq-gen_20240613_213301.csv" --output_path "/mnt/data/pamessina/workspaces/medvqa-workspace/tmp/radcliq/radcliq-output_20240613_213301.csv"
\RadCliq process done. Elapsed seconds = 116.10060834884644
\RadCliq process done. Elapsed seconds = 116.10072159767151
stdout: Number of shared study IDs: 7

Tokenizing report impressions. All reports are cut off at 512 tokens.
Using 1 GPUs!

Begin report impression labeling. The progress bar counts the # of batches completed:
The batch size is 18

Tokenizing report impressions. All reports are cut off at 512 tokens.
Using 1 GPUs!

Begin report impression labeling. The progress bar counts the # of batches completed:
The batch 

In [6]:
# Display the returned dict: one array per metric, with one value per report pair.
out

{'bleu_score': array([1.        , 0.66666667, 0.16344984, 0.26726124, 1.        ,
        0.63245553, 0.2236068 ]),
 'bertscore': array([1.        , 0.8820054 , 0.51888853, 0.69265145, 1.        ,
        0.9269224 , 0.5359155 ]),
 'semb_score': array([1.00000012, 0.97885698, 0.77980494, 0.88076377, 1.00000012,
        0.69918996, 0.95818412]),
 'radgraph_combined': array([1.  , 0.45, 0.  , 0.  , 0.5 , 0.  , 0.  ]),
 'RadCliQ-v0': array([-0.24213365,  0.78860724,  2.61022362,  1.97220865,  0.55238347,
         1.86797567,  2.26775342]),
 'RadCliQ-v1': array([-1.4406904 , -0.53999798,  0.81781048,  0.43052379, -0.81808224,
         0.22440321,  0.61225504])}