In [71]:
# Reference/candidate pairs used to sanity-check every metric below.
# Each tuple is (ground-truth text, generated text).
text_pairs = [
    # identical sentences
    ("The quick brown fox jumps over the lazy dog.",
     "The quick brown fox jumps over the lazy dog."),
    # same sentence with two words swapped for synonyms
    ("The quick brown fox jumps over the lazy dog.",
     "The fast brown fox leaped over the lazy dog."),
    # loose paraphrases
    ("To be or not to be, that is the question.",
     "Being or not being, that's what's being questioned."),
    ("All that glitters is not gold.",
     "Everything that sparkles isn't necessarily gold."),
    # identical clinical statement
    ("The patient has cardiomegaly",
     "The patient has cardiomegaly"),
    # same clinical statement with the finding negated
    ("The patient has cardiomegaly",
     "The patient has no cardiomegaly"),
]

# Index-aligned lists: gen_texts[i] is always compared against gt_texts[i].
gt_texts = [reference for reference, _ in text_pairs]

gen_texts = [candidate for _, candidate in text_pairs]

### CIDEr

In [72]:
from pycocoevalcap.cider import cider_scorer

In [73]:
# Feed every (candidate, [reference]) pair to the scorer first, then score
# all pairs in a single compute_score() call.
scorer = cider_scorer.CiderScorer(n=4)
for gen, gt in zip(gen_texts, gt_texts):
    # each candidate has exactly one reference here
    scorer += (gen, [gt])

mean_score, scores = scorer.compute_score()
print(f'mean_score: {mean_score}')
print(f'scores: {scores}')

mean_score: 4.7064514948768945
scores: [10.          3.20705125  0.74208407  0.73021884 10.          3.5593548 ]


### ROUGE

In [74]:
from pycocoevalcap.rouge import rouge

In [75]:
# Score each pair separately (one candidate, one reference), then average
# the per-pair values by hand.
scorer = rouge.Rouge()
scores = []
for gen, gt in zip(gen_texts, gt_texts):
    score = scorer.calc_score([gen], [gt])
    scores.append(score)

mean_score = sum(scores) / len(scores)
print(f'mean_score: {mean_score}')
print(f'scores: {scores}')

mean_score: 0.7060052418323793
scores: [1.0, 0.7777777777777778, 0.21785714285714283, 0.3333333333333333, 1.0, 0.9070631970260222]


### BLEU

In [76]:
from pycocoevalcap.bleu import bleu_scorer

In [87]:
# Accumulate every (candidate, [reference]) pair, then score them together.
scorer = bleu_scorer.BleuScorer(n=4)
for gen, gt in zip(gen_texts, gt_texts):
    scorer += (gen, [gt])

# NOTE(review): despite its name, mean_score holds four values (one per n-gram
# order) and does not equal the average of the per-pair columns printed below
# (the recorded b1 column averages to ~0.684, not 0.666) -- presumably these
# are corpus-level BLEU values; confirm against the scorer's documentation.
mean_score, scores = scorer.compute_score()
print(f'mean_score: {mean_score}')
print('scores:')
for i in range(len(gen_texts)):
    # scores is indexed by n-gram order first (0 -> BLEU-1), then by pair
    b1, b2, b3, b4 = (scores[order][i] for order in range(4))
    b = (b1+b2+b3+b4)/4  # plain average of the four per-pair values
    print(f'b1: {b1:.4f}, b2: {b2:.4f}, b3: {b3:.4f}, b4: {b4:.4f}, b: {b:.4f}')

mean_score: [0.6664715669720498, 0.578358029419633, 0.5130914126445008, 0.46274151089507487]
scores:
b1: 1.0000, b2: 1.0000, b3: 1.0000, b4: 1.0000, b: 1.0000
b1: 0.7778, b2: 0.6236, b3: 0.4807, b4: 0.3689, b: 0.5628
b1: 0.1947, b2: 0.1472, b3: 0.0000, b4: 0.0000, b: 0.0855
b1: 0.3333, b2: 0.0000, b3: 0.0000, b4: 0.0000, b: 0.0833
b1: 1.0000, b2: 1.0000, b3: 1.0000, b4: 1.0000, b: 1.0000
b1: 0.8000, b2: 0.6325, b3: 0.5109, b4: 0.0001, b: 0.4859


### METEOR

In [88]:
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

In [89]:
# meteor_score is called with token lists, so tokenize both sides up front.
tokenized_gt_texts = list(map(word_tokenize, gt_texts))
tokenized_gen_texts = list(map(word_tokenize, gen_texts))

scores = []
for gen, gt in zip(tokenized_gen_texts, tokenized_gt_texts):
    # first argument: list of references (a single one here); second: hypothesis
    score = meteor_score([gt], gen)
    scores.append(score)

mean_score = sum(scores) / len(scores)
print(f'mean_score: {mean_score}')
print(f'scores: {scores}')

mean_score: 0.8226083930326539
scores: [0.9995, 0.9995, 0.5852864583333333, 0.4445422535211268, 0.9921875, 0.9146341463414633]


### BERTScore

In [90]:
from bert_score import score as bert_score

In [91]:
# Candidates go first, references second. The three returned values
# (precision, recall, F1) are moved to the CPU and converted to NumPy arrays
# in one pass.
P, R, F1 = (
    values.cpu().numpy()
    for values in bert_score(
        gen_texts,
        gt_texts,
        lang='en',
        verbose=True,
        use_fast_tokenizer=True,
    )
)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.05 seconds, 112.79 sentences/sec


In [92]:
# Per-pair BERTScore precision, recall and F1, in the order of gen_texts.
print(f'P: {P}')
print(f'R: {R}')
print(f'F1: {F1}')

P: [1.         0.9806174  0.93713343 0.9434316  1.         0.9791807 ]
R: [1.         0.98459214 0.9486927  0.95515096 1.         0.98743534]
F1: [1.         0.98260075 0.9428776  0.9492551  1.         0.98329073]


### CheXbert

In [93]:
from f1chexbert import F1CheXbert
from nltk.tokenize import sent_tokenize
from medvqa.utils.hashing import hash_string
from medvqa.utils.common import CACHE_DIR
from medvqa.utils.files import get_cached_pickle_file
from tqdm import tqdm
import numpy as np
import os

In [94]:
# Label names, one per position of a label vector. The last position
# ("No Finding") is handled separately by merge_labels.
CHEXBERT_LABELS = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
    "No Finding",
]


def merge_labels(labels_list):
    """Combine several label vectors into a single ``np.int8`` vector.

    Every position except the last is set to 1 if any input vector has a 1
    there (values other than 1 are ignored). The last position starts at 1
    and is cleared as soon as one input vector has 0 in its own last position.

    Args:
        labels_list: iterable of indexable label vectors.

    Returns:
        ``np.ndarray`` of dtype ``int8`` with ``len(CHEXBERT_LABELS)`` entries.
    """
    merged = np.zeros((len(CHEXBERT_LABELS),), np.int8)
    merged[-1] = 1  # "No Finding" holds until some vector says otherwise
    for sentence_labels in labels_list:
        if sentence_labels[-1] == 0:
            # this vector reports a finding
            merged[-1] = 0
        # union of the positive entries, skipping the last position
        for position, value in enumerate(sentence_labels[:-1]):
            if value == 1:
                merged[position] = 1
    return merged

In [95]:
class CheXbertLabeler(F1CheXbert):
    """F1CheXbert wrapper that caches labels per text and per sentence.

    Labels are looked up in ``self.cache`` (a dict keyed by ``hash_string`` of
    the text or sentence) before the underlying labeler is invoked. The cache
    is read from ``CACHE_DIR/chexbert_labeler_cache.pkl`` on construction and
    is only written back when ``get_labels`` is called with
    ``update_cache_on_disk=True``.
    """

    def __init__(self, device=None, verbose=False):
        """Build the labeler and load the label cache.

        Args:
            device: forwarded unchanged to ``F1CheXbert.__init__``.
            verbose: when True, print progress and cache messages.
        """
        super().__init__(device=device)
        self.cache_path = os.path.join(CACHE_DIR, 'chexbert_labeler_cache.pkl')
        self.cache = get_cached_pickle_file(self.cache_path)
        self.verbose = verbose
        if self.cache is None:
            # get_cached_pickle_file returned nothing (presumably no cache file
            # exists yet): start with an empty in-memory cache.
            self.cache = dict()
        elif verbose:
            print(f'Cache successfully loaded from {self.cache_path}')

    def get_labels(self, texts, update_cache_on_disk=False):
        """Return one label vector per text, using the cache when possible.

        A text whose hash is already cached is returned as is. Otherwise the
        text is split with ``sent_tokenize``, each sentence is labeled with
        ``self.get_label`` (or fetched from the cache), the sentence labels
        are combined with ``merge_labels`` and the result is cached under the
        text's hash.

        Args:
            texts: sized sequence of strings (``len(texts)`` is used).
            update_cache_on_disk: when True and at least one sentence had to
                be labeled, the cache is saved to ``self.cache_path``.

        Returns:
            ``np.array`` built from the per-text label vectors, in input order.
        """
        if self.verbose:
            print(f'(*) Chexbert: labeling {len(texts)} texts ...')

        output_labels = [None] * len(texts)
        # Counts sentences that had to be labeled; text-level entries that are
        # assembled purely from cached sentences are not counted.
        dirty_count = 0

        # enumerate() has no length, so pass the total explicitly to get a
        # real progress bar instead of a bare iteration counter.
        for i, text in tqdm(enumerate(texts), total=len(texts), mininterval=2):
            text_hash = hash_string(text)
            if text_hash in self.cache:
                output_labels[i] = self.cache[text_hash]
                continue
            sentence_labels = []
            for sentence in sent_tokenize(text):
                sentence_hash = hash_string(sentence)
                labels = self.cache.get(sentence_hash, None)
                if labels is None:
                    labels = self.get_label(sentence)
                    self.cache[sentence_hash] = labels
                    dirty_count += 1
                sentence_labels.append(labels)
            output_labels[i] = merge_labels(sentence_labels)
            self.cache[text_hash] = output_labels[i]

        if dirty_count > 0:
            if self.verbose:
                print(f'Done labeling: {dirty_count} new labels found and cached')
            if update_cache_on_disk:
                # Imported here because the notebook's import cell only brings
                # in get_cached_pickle_file; the bare name raised NameError.
                # NOTE(review): assumes medvqa.utils.files exposes save_pickle
                # next to get_cached_pickle_file -- confirm.
                from medvqa.utils.files import save_pickle
                save_pickle(self.cache, self.cache_path)
                if self.verbose:
                    print(f'Cache successfully updated and saved to {self.cache_path}')
        elif self.verbose:
            print('All labels found in cache, no need to invoke chexbert labeler')

        return np.array(output_labels)

    def get_embedding(self, text):
        """Return the hidden state at sequence position 0 for one text.

        Args:
            text: a single string (``str`` subclasses such as ``np.str_`` are
                accepted, so elements of a NumPy string array work too).

        Returns:
            NumPy array obtained by squeezing the position-0 slice of the
            first output of ``self.model.bert``.
        """
        # Function-scope imports: none of these names are brought in by the
        # notebook's import cell, so the method failed with NameError.
        # NOTE(review): tokenize / generate_attention_masks are assumed to live
        # in f1chexbert.f1chexbert (the module defining F1CheXbert) -- confirm.
        import pandas as pd
        import torch
        from f1chexbert.f1chexbert import generate_attention_masks, tokenize

        assert isinstance(text, str)
        series = pd.Series([text])
        out = tokenize(series, self.tokenizer)
        batch = torch.LongTensor([o for o in out])
        src_len = [b.shape[0] for b in batch]
        attn_mask = generate_attention_masks(batch, src_len, self.device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            final_hidden = self.model.bert(batch.to(self.device), attention_mask=attn_mask)[0]
        # Position 0 of the sequence axis (presumably the [CLS] token).
        cls_hidden = final_hidden[:, 0, :].squeeze(dim=1)
        return cls_hidden.cpu().detach().numpy().squeeze()

    def get_embeddings(self, texts):
        """Return ``get_embedding`` of every text, stacked into one array.

        Args:
            texts: ``list`` or ``np.ndarray`` of strings.

        Returns:
            ``np.array`` with one embedding per input text, in input order.
        """
        assert isinstance(texts, (list, np.ndarray))
        return np.array([self.get_embedding(text) for text in tqdm(texts, mininterval=2)])

In [96]:
def compute_chexbert_f1_scores(labeler, gen_texts, gt_texts):
    """Compute one F1 score per (generated, ground-truth) text pair.

    For each pair, the label vectors returned by ``labeler.get_labels`` are
    compared position by position: a position counts as a true positive when
    both are 1, a false positive when only the generated one is 1 (ground
    truth 0) and a false negative when only the ground truth is 1 (generated
    0). A pair without any true positive scores 0.

    Args:
        labeler: object exposing ``get_labels(texts)``.
        gen_texts: generated texts.
        gt_texts: ground-truth texts, index-aligned with ``gen_texts``.

    Returns:
        ``np.ndarray`` of ``len(gen_texts)`` floats.
    """
    predicted = labeler.get_labels(gen_texts)
    reference = labeler.get_labels(gt_texts)
    n_pairs = len(gen_texts)
    f1_scores = np.zeros(n_pairs)
    for idx in range(n_pairs):
        pred_row = predicted[idx]
        ref_row = reference[idx]
        tp = np.sum((pred_row == 1) & (ref_row == 1))
        fp = np.sum((pred_row == 1) & (ref_row == 0))
        fn = np.sum((pred_row == 0) & (ref_row == 1))

        if tp + fp > 0:
            precision = tp / (tp + fp)
        else:
            precision = 0
        if tp + fn > 0:
            recall = tp / (tp + fn)
        else:
            recall = 0

        if precision + recall > 0:
            f1_scores[idx] = 2 * precision * recall / (precision + recall)
        else:
            f1_scores[idx] = 0
    return f1_scores

In [97]:
# Label both sides with the cached CheXbert labeler and compare per pair.
# Keyword arguments make the generated/ground-truth roles explicit.
labeler = CheXbertLabeler(verbose=True)
f1_scores = compute_chexbert_f1_scores(
    labeler=labeler,
    gen_texts=gen_texts,
    gt_texts=gt_texts,
)
print(f'f1: {f1_scores}')

Cache successfully loaded from /mnt/data/pamessina/workspaces/medvqa-workspace/cache/chexbert_labeler_cache.pkl
(*) Chexbert: labeling 6 texts ...


6it [00:00, 222.71it/s]


Done labeling: 2 new labels found and cached
(*) Chexbert: labeling 6 texts ...


6it [00:00, 20712.61it/s]

All labels found in cache, no need to invoke chexbert labeler
f1: [1. 1. 0. 1. 1. 0.]





### RadGraph

In [98]:
from radgraph import RadGraph

In [99]:
def extract_entities_and_relations(data):
    """Count the entities and relations of one annotation as hashed strings.

    Each entity contributes the string ``"tokens|label"``. Each relation
    ``r`` of an entity contributes two strings: ``"e1|rel|e2"`` (with the
    relation name ``r[0]``) and ``"e1|e2"`` (without it), where ``e2`` is the
    entity whose id is ``r[1]``. Every string is passed through
    ``hash_string`` and the occurrences of each hash are counted.

    Entity strings are stored by integer id rather than in a positional list,
    so ids do not have to be contiguous or start at 1.

    Args:
        data: mapping with an ``'entities'`` dict of
            ``id -> {'tokens', 'label', 'relations'}``.

    Returns:
        dict mapping hash -> number of occurrences.

    Raises:
        KeyError: if a relation points to an id that is not in ``entities``.
    """
    entities = data['entities']

    # id -> "tokens|label"; ids are normalised with int() so "2" and 2 match.
    entity_strings = {
        int(key): f"{entity['tokens']}|{entity['label']}"
        for key, entity in entities.items()
    }

    hash2count = dict()

    def _count(text):
        # Register one more occurrence of the hashed string.
        h = hash_string(text)
        hash2count[h] = hash2count.get(h, 0) + 1

    # One count per entity (duplicates across ids accumulate).
    for key in entities:
        _count(entity_strings[int(key)])

    # Two counts per relation: with and without the relation name.
    for key, entity in entities.items():
        source = entity_strings[int(key)]
        for relation in entity['relations']:
            target = entity_strings[int(relation[1])]
            _count(f"{source}|{relation[0]}|{target}")  # e1|rel|e2
            _count(f"{source}|{target}")  # e1|e2
    return hash2count

def jaccard_between_dicts(d1, d2):
    """Jaccard similarity between two count dicts treated as multisets.

    The intersection size is the sum over the keys of ``d1`` of
    ``min(d1[k], d2.get(k, 0))``; the union size is the total of both dicts
    minus that intersection.

    Args:
        d1: dict mapping key -> count.
        d2: dict mapping key -> count.

    Returns:
        float: intersection / union, or 1.0 when both dicts are empty.
        Every path returns a float (the empty case used to return the int 1).
    """
    if len(d1) == 0 and len(d2) == 0:
        return 1.0  # both dicts are empty -> perfect match
    inters_size = sum(min(count, d2.get(key, 0)) for key, count in d1.items())
    union_size = sum(d1.values()) + sum(d2.values()) - inters_size
    assert union_size > 0, 'non-empty dicts must have a positive total count'
    return float(inters_size / union_size)

def f1_between_dicts(gt_dict, pred_dict):
    """F1 score between two count dicts treated as multisets.

    The overlap is the sum over the keys of ``gt_dict`` of
    ``min(gt_dict[k], pred_dict.get(k, 0))``. Precision divides it by the
    total count of ``pred_dict``, recall by the total count of ``gt_dict``.

    Args:
        gt_dict: dict mapping key -> count for the ground truth.
        pred_dict: dict mapping key -> count for the prediction.

    Returns:
        float: harmonic mean of precision and recall; 1.0 when both dicts are
        empty and 0.0 when there is no overlap. Every path returns a float
        (these two cases used to return the ints 1 and 0).
    """
    if len(gt_dict) == 0 and len(pred_dict) == 0:
        return 1.0  # both dicts are empty -> perfect match
    inters_size = sum(min(count, pred_dict.get(key, 0)) for key, count in gt_dict.items())
    pred_sum = sum(pred_dict.values())
    gt_sum = sum(gt_dict.values())
    p = inters_size / pred_sum if pred_sum > 0 else 0.0
    r = inters_size / gt_sum if gt_sum > 0 else 0.0
    if p + r > 0:
        return float(2 * p * r / (p + r))
    return 0.0

def eval_radgraph(gen_texts, gt_texts, radgraph=None):
    """Score generated texts against ground truths using RadGraph annotations.

    Every distinct text is annotated once, its entities and relations are
    turned into a count dict by ``extract_entities_and_relations``, and each
    (ground truth, generated) pair is scored with ``f1_between_dicts`` and
    ``jaccard_between_dicts``.

    Args:
        gen_texts: list of generated texts.
        gt_texts: list of ground-truth texts, index-aligned with
            ``gen_texts``. Pairs are formed with ``zip``, so extra items in
            the longer list are ignored.
        radgraph: optional callable that takes a list of texts and returns a
            mapping from ``str(index)`` to that text's annotation. Defaults to
            a fresh ``RadGraph()``; pass an existing instance to avoid
            reloading the model on every call.

    Returns:
        ``(f1_scores, jaccard_scores)``: two lists with one score per pair.
    """
    # Unique texts in first-seen order. Unlike a set, this ordering is the
    # same on every run, so the batch sent to the model is reproducible.
    sentences = list(dict.fromkeys([*gen_texts, *gt_texts]))
    if not sentences:
        # Nothing to score: skip loading and invoking the model.
        return [], []
    s2idx = {s: i for i, s in enumerate(sentences)}  # text -> position in batch

    # Create the model only when the caller did not supply one.
    if radgraph is None:
        radgraph = RadGraph()

    # Compute RadGraph annotations (looked up below by stringified index).
    annotations = radgraph(sentences)

    # Extract entities and relations for every unique text.
    ent_rel_dicts = [
        extract_entities_and_relations(annotations[str(i)])
        for i in range(len(sentences))
    ]

    # Compute F1 score and Jaccard score for each pair.
    f1_scores = []
    jaccard_scores = []
    for gt, gen in zip(gt_texts, gen_texts):
        gt_dict = ent_rel_dicts[s2idx[gt]]
        gen_dict = ent_rel_dicts[s2idx[gen]]
        f1_scores.append(f1_between_dicts(gt_dict, gen_dict))
        jaccard_scores.append(jaccard_between_dicts(gt_dict, gen_dict))

    return f1_scores, jaccard_scores

In [100]:
# Run the RadGraph comparison on the example pairs. Keyword arguments make the
# generated/ground-truth roles explicit.
# Note: f1_scores is rebound here, replacing the CheXbert scores stored under
# the same name by an earlier cell.
f1_scores, jaccard_scores = eval_radgraph(
    gen_texts=gen_texts,
    gt_texts=gt_texts,
)

In [101]:
# Per-pair RadGraph scores, in the order of gen_texts.
print(f'f1_scores: {f1_scores}')
print(f'jaccard_scores: {jaccard_scores}')

f1_scores: [1.0, 0.4444444444444444, 1, 0, 1.0, 0]
jaccard_scores: [1.0, 0.2857142857142857, 1, 0.0, 1.0, 0.0]
