In [259]:
from medvqa.utils.files import load_pickle, load_json_file, save_to_pickle
from medvqa.datasets.iuxray import IUXRAY_CACHE_DIR
from medvqa.datasets.mimiccxr import MIMICCXR_CACHE_DIR
from medvqa.utils.common import CACHE_DIR
from medvqa.datasets.medical_tags_extractor import MedicalTagsExtractor
import medvqa
from PIL import Image
import random
import os
from importlib import reload

In [59]:
# Re-execute the tags-extractor module so that edits to its source file are
# picked up without restarting the kernel.
# NOTE: reload() does not rebind names that were imported earlier with
# `from ... import ...`; `MedicalTagsExtractor` has to be re-imported after
# this call to obtain the reloaded class.
reload(medvqa.datasets.medical_tags_extractor)

<module 'medvqa.datasets.medical_tags_extractor' from '/home/pamessina/medvqa/medvqa/datasets/medical_tags_extractor.py'>

In [4]:
# Load the QA-adapted IU X-ray reports (JSON) from the IU X-ray cache directory.
# Later cells index the loaded object by its 'reports' and 'questions' keys.
iuxray_qa_adapted_reports_path = os.path.join(IUXRAY_CACHE_DIR, "qa_adapted_reports__20220324_143133.json")
iuxray_qa_adapted_reports = load_json_file(iuxray_qa_adapted_reports_path)

In [260]:
# Load the QA-adapted MIMIC-CXR reports (JSON) from the MIMIC-CXR cache directory.
# Later cells index the loaded object by its 'reports' and 'questions' keys.
mimiccxr_qa_adapted_reports_path = os.path.join(MIMICCXR_CACHE_DIR, "qa_adapted_reports__20220324_151809.json")
mimiccxr_qa_adapted_reports = load_json_file(mimiccxr_qa_adapted_reports_path)

In [13]:
# Load the pickled CheXpert label vectors for IU X-ray; a later cell asserts
# that there is exactly one entry per report.
iuxray_chexpert_labels = load_pickle(os.path.join(IUXRAY_CACHE_DIR, "chexpert_labels_per_report__20220405_160451.pkl"))

In [261]:
# Load the pickled CheXpert label vectors for MIMIC-CXR; a later cell asserts
# that there is exactly one entry per report.
mimiccxr_chexpert_labels = load_pickle(os.path.join(MIMICCXR_CACHE_DIR, "chexpert_labels_per_report__20220405_160542.pkl"))

In [262]:
# Sanity check: the labels and the reports must line up one-to-one, because
# later cells use the same integer index for both.
assert len(iuxray_chexpert_labels) == len(iuxray_qa_adapted_reports['reports'])

In [263]:
# Sanity check: the labels and the reports must line up one-to-one, because
# later cells use the same integer index for both.
assert len(mimiccxr_chexpert_labels) == len(mimiccxr_qa_adapted_reports['reports'])

In [15]:
# Consistency check: whenever the first label is 1, every other label in the
# same vector must be 0 (presumably index 0 is the "No Finding" class — confirm).
# On failure the assertion message carries the offending labels and report.
for i, labels in enumerate(iuxray_chexpert_labels):
    if labels[0] == 1:
        assert all(x == 0 for x in labels[1:]), (labels, iuxray_qa_adapted_reports['reports'][i])

In [264]:
# Consistency check: whenever the first label is 1, every other label in the
# same vector must be 0 (presumably index 0 is the "No Finding" class — confirm).
# On failure the assertion message carries the offending labels and report.
for i, labels in enumerate(mimiccxr_chexpert_labels):
    if labels[0] == 1:
        assert all(x == 0 for x in labels[1:]), (labels, mimiccxr_qa_adapted_reports['reports'][i])

In [35]:
def healthy_and_unhealthy_reports(chexpert_labels):
    """Split report indices by the value of the first CheXpert label.

    Args:
        chexpert_labels: iterable of label vectors, one per report. Only the
            first entry of each vector is inspected (presumably the
            "No Finding" class — confirm against the labeler's class order).

    Returns:
        A pair ``(healthy_ids, unhealthy_ids)`` of index lists, each in
        ascending order. An index is "healthy" when its first label equals 1
        and "unhealthy" otherwise.
    """
    # Evaluate the flag once per report, then partition the positions.
    first_label_is_one = [row[0] == 1 for row in chexpert_labels]
    healthy_ids = [pos for pos, flag in enumerate(first_label_is_one) if flag]
    unhealthy_ids = [pos for pos, flag in enumerate(first_label_is_one) if not flag]
    return healthy_ids, unhealthy_ids

In [265]:
# Split the IU X-ray report indices into healthy / unhealthy and display the
# size of each group.
iu_h_ids, iu_unh_ids = healthy_and_unhealthy_reports(iuxray_chexpert_labels)
len(iu_h_ids), len(iu_unh_ids)

(1416, 2511)

In [266]:
# Split the MIMIC-CXR report indices into healthy / unhealthy and display the
# size of each group.
mi_h_ids, mi_unh_ids = healthy_and_unhealthy_reports(mimiccxr_chexpert_labels)
len(mi_h_ids), len(mi_unh_ids)

(37178, 190657)

In [267]:
# Spot-check: label vector of a randomly chosen healthy IU X-ray report.
# No seed is set in the visible cells, so the pick differs between runs.
iuxray_chexpert_labels[random.choice(iu_h_ids)]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [268]:
# Spot-check: label vector of a randomly chosen unhealthy IU X-ray report.
# No seed is set in the visible cells, so the pick differs between runs.
iuxray_chexpert_labels[random.choice(iu_unh_ids)]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [272]:
# Spot-check: label vector of a randomly chosen healthy MIMIC-CXR report.
# No seed is set in the visible cells, so the pick differs between runs.
mimiccxr_chexpert_labels[random.choice(mi_h_ids)]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

In [277]:
# Spot-check: label vector of a randomly chosen unhealthy MIMIC-CXR report.
# No seed is set in the visible cells, so the pick differs between runs.
mimiccxr_chexpert_labels[random.choice(mi_unh_ids)]

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1], dtype=int8)

In [61]:
# Build the tags extractor that the helper functions in this notebook read as
# a global (`med_tags_extractor`).
# NOTE(review): the argument looks like a medical-terms-frequency file name; the
# precompute script invoked later in this notebook is given a different
# timestamped file (…20220324_165005.pkl) — confirm which one is intended.
med_tags_extractor = MedicalTagsExtractor('medical_terms_frequency__20220317_145545.pkl')

In [278]:
def show_iuxray_example(idx, reports):
    """Print the matched sentences of one report and the tags extracted from them.

    Despite the name, nothing here is specific to IU X-ray: the notebook also
    calls it with the MIMIC-CXR report list.

    Args:
        idx: position of the report inside ``reports``.
        reports: sequence of report dicts providing a 'sentences' list and a
            'matched' list of sentence indices.

    Returns:
        None; the report text and its tag sequence are written to stdout.
        Relies on the notebook-level ``med_tags_extractor``.
    """
    entry = reports[idx]
    # Only the sentences listed under 'matched' make up the displayed text.
    matched_sentences = [entry['sentences'][pos] for pos in entry['matched']]
    text = '. '.join(matched_sentences)
    print('Report:\n')
    print(text)
    print('\nTags:\n')
    print(med_tags_extractor.extract_tags_sequence(text))

In [289]:
# Display the list of questions of the IU X-ray QA dataset; a question's
# position in this list is what later cells use as its id.
iuxray_qa_adapted_reports['questions']

['ARDS?',
 'COPD?',
 'abscess and cavitation?',
 'adenopathy?',
 'air collections?',
 'air space disease?',
 'air-fluid level?',
 'airways?',
 'apical zone?',
 'ascites?',
 'aspiration?',
 'atelectasis?',
 'azygos lobe?',
 'azygos vein?',
 'bleeding?',
 'blurring?',
 'bones?',
 'bowel obstruction and loops?',
 'bowel?',
 'bronchiectasis?',
 'bronchogram?',
 'bronchovascular crowding?',
 'bronchus?',
 'bullae and blebs?',
 'calcification?',
 'cancer or tumor?',
 'cardiac silhouette?',
 'cardiomegaly?',
 'carina?',
 'cholelithiasis?',
 'colon?',
 'congestive heart failure CHF?',
 'consolidation?',
 'contrast?',
 'densities?',
 'diaphragm?',
 'edema?',
 'emphysema?',
 'esophagus?',
 'fibrosis?',
 'fissures?',
 'fluid overload?',
 'fluid?',
 'fractures?',
 'free air?',
 'gas distension?',
 'granumola?',
 'haze?',
 'heart?',
 'hernias?',
 'infection?',
 'infiltrate?',
 'interstitial lung disease?',
 'kerley lines?',
 'lesions?',
 'liver?',
 'loculation?',
 'lucencies?',
 'lung parenchyma?',

In [279]:
# Show a randomly chosen healthy IU X-ray report together with its index.
# show_iuxray_example only prints and returns None, which is why the displayed
# tuple ends in None.
x = random.choice(iu_h_ids)
x, show_iuxray_example(x, iuxray_qa_adapted_reports['reports'])

Report:

Heart size is normal. No focal infiltrates. No pneumothorax. No large pleural effusion. Mildly tortuous aorta. No acute cardiopulmonary findings

Tags:

['heart', 'size', 'normal', 'no', 'focal', 'infiltrate', 'no', 'pneumothorax', 'no', 'large', 'pleural', 'effusion', 'mild', 'tortuous', 'aorta', 'no', 'cardiopulmonary']


(1995, None)

In [280]:
# Show a randomly chosen unhealthy IU X-ray report together with its index.
# show_iuxray_example only prints and returns None, which is why the displayed
# tuple ends in None.
x = random.choice(iu_unh_ids)
x, show_iuxray_example(x, iuxray_qa_adapted_reports['reports'])

Report:

The cardiomediastinal silhouette and pulmonary vasculature are within normal limits in size. The lungs are clear of focal airspace disease , pneumothorax , or pleural effusion. XXXX closure device demonstrated projecting over the right heart. There are no acute bony findings. No acute cardiopulmonary findings

Tags:

['cardiomediastinal', 'silhouette', 'pulmonary', 'vasculature', 'normal', 'limits', 'size', 'lungs', 'clear', 'focal', 'airspace', 'disease', 'pneumothorax', 'pleural', 'effusion', 'closure', 'devices', 'projecting', 'right', 'heart', 'no', 'osseous', 'no', 'cardiopulmonary']


(2029, None)

In [283]:
# Show a randomly chosen healthy MIMIC-CXR report together with its index
# (the IU X-ray-named helper is reused on the MIMIC-CXR report list).
# The helper only prints and returns None, hence the trailing None.
x = random.choice(mi_h_ids)
x, show_iuxray_example(x, mimiccxr_qa_adapted_reports['reports'])

Report:

Normal cardiomediastinal contours. Tortuous thoracic aorta. Clear lungs with no evidence of pneumothorax or pleural effusion

Tags:

['normal', 'cardiomediastinal', 'contours', 'tortuous', 'thoracic', 'aorta', 'clear', 'lungs', 'no', 'pneumothorax', 'pleural', 'effusion']


(176265, None)

In [287]:
# Show a randomly chosen unhealthy MIMIC-CXR report together with its index
# (the IU X-ray-named helper is reused on the MIMIC-CXR report list).
# The helper only prints and returns None, hence the trailing None.
x = random.choice(mi_unh_ids)
x, show_iuxray_example(x, mimiccxr_qa_adapted_reports['reports'])

Report:

The lungs are clear. No focal pulmonary consolidation , pulmonary edema , pleural effusion , or pneumothorax. Normal cardiomediastinal silhouette , pleura , hila. Elevated right hemidiaphragm with associated incidental note of the colon interposed between the liver and diaphragm in the sub-diaphragmatic right upper quadrant of the abdomen. Stable appearance of the thoracic spinal fixation devices and prior kyphoplasty. Slight pectus excavatum. No acute osseous abnormality. No acute cardiopulmonary process including no focal consolidation to suggest pneumonia. Chilaiditi syndrome with associated elevation of the right hemidiaphragm

Tags:

['lungs', 'clear', 'no', 'focal', 'pulmonary', 'consolidation', 'pulmonary', 'edema', 'pleural', 'effusion', 'pneumothorax', 'normal', 'cardiomediastinal', 'silhouette', 'pleural', 'hilar', 'elevation', 'right', 'hemidiaphragm', 'colon', 'interposed', 'liver', 'diaphragm', 'sub', 'diaphragm', 'right', 'upper', 'quadrant', 'abdomen', 'thoracic

(160915, None)

In [225]:
def _get_top_k_ngrams(reports, q_id, used_vocab, k=10, n=3):
    """Pick the most frequent tag n-grams among the answers to one question.

    Every sentence that a report lists under ``report['qa'][q_id]`` is turned
    into a tag sequence with the notebook-level ``med_tags_extractor``; all
    contiguous n-grams of ``n`` tags are counted across reports.

    Args:
        reports: iterable of report dicts with 'qa' and 'sentences' entries.
        q_id: question key as used in ``report['qa']``.
        used_vocab: set of tags that are already taken. An n-gram containing
            any of them is skipped. The set is UPDATED IN PLACE with the tags
            of every n-gram that gets selected.
        k: selection stops as soon as this many n-grams have been picked.
        n: number of tags per n-gram.

    Returns:
        List of ``(count, ngram)`` pairs in descending ``(count, ngram)``
        order, where ``ngram`` is a tuple of tags.
    """
    counts = {}
    for report in reports:
        if q_id in report['qa']:
            for sent_idx in report['qa'][q_id]:
                tags = med_tags_extractor.extract_tags_sequence(report['sentences'][sent_idx])
                # Slide a window of n tags over the sentence.
                for start in range(len(tags) - n + 1):
                    ngram = tuple(tags[start:start + n])
                    counts[ngram] = counts.get(ngram, 0) + 1

    # Most frequent first; ties are broken by the n-gram itself (descending).
    ranked = sorted(((count, ngram) for ngram, count in counts.items()), reverse=True)

    selected = []
    for candidate in ranked:
        # Greedy pass: reject anything that reuses an already-claimed tag.
        if any(word in used_vocab for word in candidate[1]):
            continue
        selected.append(candidate)
        used_vocab.update(candidate[1])
        if len(selected) == k:
            break
    return selected

def get_top_k_ngrams(reports, q_id, ks, ns):
    """Collect top n-grams for a question over several n-gram sizes.

    The ``(k, n)`` pairs obtained by zipping ``ks`` and ``ns`` are processed
    in order, sharing a single set of already-used tags, so an n-gram picked
    in a later round never contains a tag picked in an earlier one.

    Args:
        reports: iterable of report dicts (see ``_get_top_k_ngrams``).
        q_id: question key as used in ``report['qa']``.
        ks: per-round limit on the number of n-grams to pick.
        ns: per-round n-gram size.

    Returns:
        All selected ``(count, ngram)`` pairs, sorted in ascending order.
    """
    claimed_tags = set()
    collected = []
    for limit, size in zip(ks, ns):
        collected += _get_top_k_ngrams(reports, q_id, claimed_tags, limit, size)
    return sorted(collected)

In [297]:
# Scratch: a generator expression over 0..9. Note that this rebinds `x`, which
# earlier cells use for a report index.
x = (y for y in range(10))

In [306]:
# NOTE(review): leftover scratch cell. The right-hand operand is missing, so
# the expression as saved is incomplete and does not correspond to the stored
# output below; complete it or delete the cell.
True == 

True

In [299]:
# Scratch: iter() on a generator returns the generator itself, so this prints
# whatever items `x` has left and leaves it exhausted afterwards.
for y in iter(x): print(y)

In [226]:
# Inspect the structure of a single report entry (sentence list, index lists
# and the question-id -> sentence-indices mapping under 'qa').
iuxray_qa_adapted_reports['reports'][0]

{'sentences': ['The cardiac silhouette and mediastinum size are within normal limits',
  'There is no pulmonary edema',
  'There is no focal consolidation',
  'There are no XXXX of a pleural effusion.',
  'There is no evidence of pneumothorax',
  'Normal chest x-XXXX.'],
 'invalid': [3, 5],
 'unmatched': [],
 'matched': [0, 1, 2, 4],
 'qa': {'26': [0], '63': [0], '36': [1], '32': [2], '73': [4]},
 'filename': '1.xml'}

In [206]:
# iuxray_qa_adapted_reports['questions']

In [252]:
# Resolve the question's position in the question list and turn it into a
# string, because the keys of each report's 'qa' dict are strings.
# Then pick up to 50 bigrams followed by up to 50 unigrams for that question.
q_id = str(iuxray_qa_adapted_reports['questions'].index('support devices and foreign bodies?'))
top_ngrams = get_top_k_ngrams(iuxray_qa_adapted_reports['reports'], q_id, ks=[50, 50], ns=[2, 1])

In [253]:
# Display the selected (count, ngram) pairs, sorted in ascending order.
top_ngrams

[(2, ('focal', 'air')),
 (2, ('hilar', 'contours')),
 (2, ('humeral', 'prosthetic')),
 (2, ('lung', 'volumes')),
 (2, ('osseous', 'anchors')),
 (2, ('radiopaque', 'density')),
 (2, ('screws', 'fixation')),
 (2, ('space', 'consolidation')),
 (2, ('terminates', 'lower')),
 (2, ('thoracic', 'spine')),
 (2, ('tunneled', 'dialysis')),
 (3, ('base', 'atelectasis')),
 (3, ('bullet', 'fragment')),
 (3, ('generator', 'projecting')),
 (3, ('icd', 'position')),
 (3, ('limits', 'mediastinal')),
 (3, ('vp', 'shunt')),
 (4, ('bilateral', 'breast')),
 (4, ('closure', 'devices')),
 (4, ('nerve', 'stimulator')),
 (4, ('post',)),
 (4, ('small',)),
 (4, ('vertebral',)),
 (5, ('abdominal',)),
 (5, ('aicd',)),
 (5, ('below', 'diaphragm')),
 (5, ('cardiomegaly',)),
 (5, ('cava',)),
 (5, ('configuration',)),
 (5, ('course',)),
 (5, ('disease',)),
 (5, ('fracture',)),
 (5, ('hemithorax',)),
 (5, ('midline',)),
 (5, ('pneumothorax',)),
 (5, ('pulmonary',)),
 (5, ('rib',)),
 (5, ('stool',)),
 (5, ('subclavian',

In [254]:
def classify_sentence(s, top_ngrams, max_n):
    """Return the position of the first entry of ``top_ngrams`` found in a sentence.

    The sentence is converted to a tag sequence with the notebook-level
    ``med_tags_extractor`` and all contiguous tag n-grams of sizes
    ``1..max_n`` are collected.

    Args:
        s: sentence text.
        top_ngrams: sequence of ``(count, ngram)`` pairs; only the ``ngram``
            tuple (element 1) is used for matching.
        max_n: largest n-gram size extracted from the sentence. An entry of
            ``top_ngrams`` whose n-gram has more than ``max_n`` tags raises
            ``IndexError`` when it is reached.

    Returns:
        Index of the first matching entry, or ``len(top_ngrams)`` when no
        entry matches.
    """
    tags = med_tags_extractor.extract_tags_sequence(s)
    # ngrams_by_size[size - 1] holds every n-gram of that size in the sentence.
    ngrams_by_size = [
        {tuple(tags[start:start + size]) for start in range(len(tags) - size + 1)}
        for size in range(1, max_n + 1)
    ]
    for rank, entry in enumerate(top_ngrams):
        candidate = entry[1]
        if candidate in ngrams_by_size[len(candidate) - 1]:
            return rank
    # Sentinel class for "none of the top n-grams occurs".
    return len(top_ngrams)

In [255]:
def classify_sentences(reports, q_id, top_ngrams, max_n):
    """Histogram of ``classify_sentence`` results over the answers to a question.

    Args:
        reports: iterable of report dicts with 'qa' and 'sentences' entries.
        q_id: question key as used in ``report['qa']``.
        top_ngrams: ``(count, ngram)`` pairs forwarded to ``classify_sentence``.
        max_n: largest n-gram size, forwarded to ``classify_sentence``.

    Returns:
        Dict mapping each class index returned by ``classify_sentence`` to the
        number of answer sentences that received it.
    """
    histogram = {}
    for report in reports:
        if q_id in report['qa']:
            for sent_idx in report['qa'][q_id]:
                label = classify_sentence(report['sentences'][sent_idx], top_ngrams, max_n)
                histogram[label] = histogram.get(label, 0) + 1
    return histogram

In [256]:
# Try the classifier on a single sentence, considering n-grams of up to 2 tags.
classify_sentence('The cardiac silhouette and mediastinum size are within normal limits', top_ngrams, 2)

61

In [258]:
# Class histogram over every IU X-ray answer sentence of the selected question;
# the key equal to len(top_ngrams) counts sentences that matched no n-gram.
classify_sentences(iuxray_qa_adapted_reports['reports'], q_id, top_ngrams, 2)

{100: 71,
 4: 2,
 83: 8,
 93: 10,
 40: 2,
 99: 14,
 96: 8,
 98: 37,
 35: 4,
 79: 10,
 84: 14,
 92: 17,
 19: 4,
 60: 5,
 28: 5,
 91: 10,
 95: 14,
 90: 7,
 44: 3,
 29: 1,
 17: 4,
 53: 4,
 80: 6,
 69: 6,
 64: 9,
 57: 4,
 65: 6,
 76: 8,
 50: 4,
 16: 3,
 55: 4,
 25: 5,
 34: 5,
 87: 4,
 89: 6,
 72: 5,
 18: 4,
 77: 4,
 82: 6,
 1: 2,
 62: 5,
 33: 3,
 27: 5,
 63: 7,
 36: 4,
 78: 7,
 43: 5,
 66: 7,
 67: 5,
 68: 2,
 41: 6,
 85: 11,
 26: 5,
 8: 2,
 86: 13,
 20: 4,
 11: 2,
 32: 2,
 0: 2,
 21: 3,
 37: 5,
 38: 4,
 47: 4,
 24: 5,
 88: 2,
 22: 3,
 94: 7,
 31: 5,
 59: 5,
 23: 5,
 73: 2,
 52: 5,
 15: 2,
 12: 3,
 56: 5,
 39: 5,
 13: 3,
 61: 6,
 45: 4,
 2: 2,
 49: 2,
 81: 8,
 42: 3,
 46: 5,
 3: 1,
 5: 2,
 51: 7,
 54: 3,
 6: 2,
 58: 1,
 9: 1,
 30: 5,
 10: 2,
 14: 3,
 70: 3,
 97: 2}

In [399]:
# Run the project script that precomputes the balanced-dataloading metadata for
# both datasets. According to its log output it writes one timestamped pickle
# per dataset; those files are loaded by later cells.
# NOTE(review): the CheXpert and medical-terms file names passed here differ
# from the ones loaded at the top of this notebook — confirm that is intended.
!python ../../scripts/precompute_balanced_dataloading_metadata.py \
        --iuxray-qa-dataset-filename "qa_adapted_reports__20220324_143133.json" \
        --mimiccxr-qa-dataset-filename "qa_adapted_reports__20220324_151809.json" \
        --chexpert-labels-cache-filename "precomputed_chexpert_labels_20220331_065722.pkl" \
        --medical-terms-frequency-filename "medical_terms_frequency__20220324_165005.pkl"

Loading files ...
Loading /home/pamessina/medvqa-workspace/cache/vocab__min_freq=5__from(qa_adapted_reports__20220324_143133.json;qa_adapted_reports__20220324_151809.json).pkl ...
Precomputing metadata ...
3927it [00:01, 2784.22it/s]
3927it [00:00, 8726.93it/s]
Balanced dataloading metadata saved to /home/pamessina/medvqa-workspace/cache/iuxray/balanced_dataloading_metadata__20220407_150040.pkl
227835it [01:59, 1909.64it/s]
227835it [00:42, 5385.59it/s]
Balanced dataloading metadata saved to /home/pamessina/medvqa-workspace/cache/mimiccxr/balanced_dataloading_metadata__20220407_150400.pkl


In [593]:
from medvqa.utils.files import load_pickle
from collections import Counter

In [594]:
# Load the IU X-ray metadata pickle reported as saved by the precompute script.
# NOTE(review): absolute, user-specific path; consider building it from
# IUXRAY_CACHE_DIR (presumably the same directory — confirm).
iuxray_balanced_metadata = load_pickle('/home/pamessina/medvqa-workspace/cache/iuxray/balanced_dataloading_metadata__20220407_150040.pkl')

In [595]:
# Load the MIMIC-CXR metadata pickle reported as saved by the precompute script.
# NOTE(review): absolute, user-specific path; consider building it from
# MIMICCXR_CACHE_DIR (presumably the same directory — confirm).
mimiccxr_balanced_metadata = load_pickle('/home/pamessina/medvqa-workspace/cache/mimiccxr/balanced_dataloading_metadata__20220407_150400.pkl')

In [596]:
def get_answers(q_id, metadata, reports):
    """Gather the answer text and metadata of every report that answers a question.

    Args:
        q_id: question key as used in ``report['qa']``.
        metadata: mapping with 'healthy' and 'tags_based_class' entries, each
            indexable first by report position and then by ``q_id``.
        reports: sequence of report dicts with 'qa' and 'sentences' entries.

    Returns:
        List of ``(healthy, tags_based_class, answer)`` triples in report
        order, one per report whose 'qa' dict contains ``q_id``. ``answer`` is
        the report's answer sentences joined with '. '.
    """
    rows = []
    for report_idx, report in enumerate(reports):
        if q_id not in report['qa']:
            continue
        answer_sentences = [report['sentences'][pos] for pos in report['qa'][q_id]]
        answer_text = '. '.join(answer_sentences)
        healthy_flag = metadata['healthy'][report_idx][q_id]
        tags_class = metadata['tags_based_class'][report_idx][q_id]
        rows.append((healthy_flag, tags_class, answer_text))
    return rows

In [597]:
# For every IU X-ray question, collect its (healthy, tags_based_class, answer)
# triples. The result is keyed by the integer question id, while get_answers
# is queried with the string form used inside each report's 'qa' dict.
iuxray_qid2answers = {}
for qid in range(len(iuxray_qa_adapted_reports['questions'])):
    iuxray_qid2answers[qid] = get_answers(str(qid), iuxray_balanced_metadata, iuxray_qa_adapted_reports['reports'])

In [598]:
# For every MIMIC-CXR question, collect its (healthy, tags_based_class, answer)
# triples. The result is keyed by the integer question id, while get_answers
# is queried with the string form used inside each report's 'qa' dict.
mimiccxr_qid2answers = {}
for qid in range(len(mimiccxr_qa_adapted_reports['questions'])):
    mimiccxr_qid2answers[qid] = get_answers(str(qid), mimiccxr_balanced_metadata, mimiccxr_qa_adapted_reports['reports'])

In [612]:
def print_health_statistics(qid2answers, questions):
    """Print, per question, how its answers are distributed over the health flag.

    Args:
        qid2answers: mapping from question id to a list of answer tuples whose
            first element is the health flag.
        questions: sequence indexable by question id, giving the question text.

    Returns:
        None. For each question a separator line, the id with its text, and a
        ``Counter`` of the health flags are written to stdout.
    """
    for qid in qid2answers:
        print("------------------")
        print(qid, questions[qid])
        health_counts = Counter(answer[0] for answer in qid2answers[qid])
        print(health_counts)

In [600]:
# print_health_statistics(mimiccxr_qid2answers, mimiccxr_qa_adapted_reports['questions'])

In [613]:
# Per-question distribution of the health flag over the IU X-ray answers.
print_health_statistics(iuxray_qid2answers, iuxray_qa_adapted_reports['questions'])

------------------
0 ARDS?
Counter()
------------------
1 COPD?
Counter({0: 38, 1: 5})
------------------
2 abscess and cavitation?
Counter({1: 3})
------------------
3 adenopathy?
Counter({0: 67, 1: 31})
------------------
4 air collections?
Counter({1: 55, 0: 6})
------------------
5 air space disease?
Counter({1: 399, 0: 147})
------------------
6 air-fluid level?
Counter({0: 4, 1: 4})
------------------
7 airways?
Counter({1: 24, 0: 4})
------------------
8 apical zone?
Counter({0: 78, 1: 29})
------------------
9 ascites?
Counter()
------------------
10 aspiration?
Counter({0: 10, 1: 3})
------------------
11 atelectasis?
Counter({0: 351, 1: 9})
------------------
12 azygos lobe?
Counter({1: 2})
------------------
13 azygos vein?
Counter({0: 1})
------------------
14 bleeding?
Counter()
------------------
15 blurring?
Counter()
------------------
16 bones?
Counter({1: 1169, 0: 765})
------------------
17 bowel obstruction and loops?
Counter({1: 7, 0: 1})
------------------
18 bowe

In [606]:
# Pick a question by its text (or, via the commented-out line, by position),
# resolve its integer id and gather its IU X-ray answers.
_q = 'stomach?'
# _q = iuxray_qa_adapted_reports['questions'][74]
_qid = iuxray_qa_adapted_reports['questions'].index(_q)
answers = get_answers(str(_qid), iuxray_balanced_metadata, iuxray_qa_adapted_reports['reports'])

In [607]:
# Show the selected question and how many reports answer it.
_q, len(answers)

('stomach?', 18)

In [609]:
# Display the precomputed top n-grams stored under the key (question id, 1).
# NOTE(review): the second key element is presumably the health flag — confirm
# against the precompute script.
iuxray_balanced_metadata['top_ngrams'][(_qid, 1)]

[(1, ('contours', 'normal')),
 (1, ('distal',)),
 (1, ('fluid',)),
 (1, ('inferiorly',)),
 (1, ('large',)),
 (1, ('loops',)),
 (1, ('mild',)),
 (1, ('multiple', 'distended')),
 (1, ('not',)),
 (1, ('postoperative', 'esophagectomy')),
 (1, ('pull',)),
 (1, ('rectal',)),
 (1, ('removed', 'gastric')),
 (1, ('small', 'bowel')),
 (1, ('suction', 'tube')),
 (1, ('tip', 'course')),
 (3, ('cardiac', 'apex')),
 (3, ('sided',)),
 (3, ('stomach', 'left'))]

In [611]:
# List the answers whose health flag (first tuple element) is 0.
[x for x in answers if x[0] == 0]

[(0,
  0,
  'Catheter tubing overlies the left upper quadrant , appearance of the lateral projection suggest prior gastric banding procedure'),
 (0,
  6,
  'The stomach is distended with an air-fluid level. Large hiatal hernia with dilated intrathoracic stomach'),
 (0, 11, 'Nasogastric tube tip XXXX within the stomach body'),
 (0, 17, 'Enteric tube tip in the stomach'),
 (0,
  2,
  'Large hiatal hernia is identified containing stomach and colon. Stable appearance of large hiatal hernia containing stomach and large bowel as well as possible small bowel loops'),
 (0,
  4,
  'Enteric tube is again noted , coursing below the diaphragm the tip of which is seen projecting over the expected location of the body of the stomach'),
 (0, 3, 'To the stomach contours appear grossly clear')]

In [397]:
# Distribution of the tags-based class (second tuple element) over the answers
# whose health flag is 1.
# NOTE(review): this cell's execution counter predates the cell that defines the
# current `answers`, so the stored output appears to come from an earlier
# question — re-run to refresh it.
Counter([x[1] for x in answers if x[0] == 1])

Counter({32: 1,
         49: 2,
         47: 2,
         99: 13,
         98: 35,
         100: 15,
         66: 2,
         60: 1,
         94: 3,
         78: 5,
         63: 3,
         89: 3,
         91: 5,
         97: 12,
         40: 1,
         37: 1,
         2: 1,
         81: 4,
         95: 2,
         82: 5,
         50: 1,
         74: 2,
         24: 1,
         10: 1,
         92: 4,
         34: 1,
         55: 2,
         44: 1,
         64: 2,
         18: 1,
         93: 1,
         59: 2,
         88: 2,
         96: 11,
         72: 1,
         16: 1,
         19: 1,
         75: 2,
         17: 1,
         80: 4,
         38: 1,
         71: 2,
         62: 3,
         56: 1,
         29: 1,
         61: 2,
         20: 1,
         77: 2,
         11: 1,
         53: 2,
         12: 1,
         5: 1,
         70: 3,
         58: 2,
         51: 2,
         76: 2,
         14: 1,
         27: 1,
         0: 1,
         26: 1,
         9: 1,
         48: 2,
      

In [509]:
from torch.utils.data import Dataset

In [477]:
class AtomicDataset(Dataset):
    """Toy dataset of ``k`` labelled strings that repeats cyclically.

    It reports a fixed nominal length of 10**12 and maps any index onto its
    ``k`` underlying items by modulo, so it can be read far beyond ``k``.
    """

    def __init__(self, label, k):
        """Create the items ``'<label>_0' .. '<label>_<k-1>'``."""
        items = []
        for i in range(k):
            items.append(f'{label}_{i}')
        self.data = items
        self._length = int(1e12)

    def __len__(self):
        """Return the nominal length, not the number of distinct items."""
        return self._length

    def __getitem__(self, i):
        """Return the item at position ``i`` modulo the number of distinct items."""
        return self.data[i % len(self.data)]

In [498]:
class ComposedDataset(Dataset):
    """Virtually endless dataset that interleaves several datasets by weight.

    A fixed cyclic schedule of dataset ids is built once: dataset ``d`` owns a
    number of slots roughly proportional to ``weights[d]``, spread as evenly
    as possible over the cycle. Item ``i`` is served by the dataset scheduled
    at position ``i % len(schedule)``, and every member dataset is read
    sequentially (its item 0 the first time it is scheduled, item 1 the
    second time, and so on).

    Member datasets must support ``len()`` and integer indexing, and must be
    long enough for every position that gets requested (checked by an
    assertion in ``__getitem__``). The composed dataset reports a fixed
    nominal length of 10**12.
    """

    def __init__(self, datasets, weights):
        """Store the member datasets and build the interleaving schedule.

        Args:
            datasets: sequence of member datasets.
            weights: one non-negative number per dataset giving its relative
                share of the schedule; the total must be non-zero.
        """
        self.datasets = datasets
        self._init_indices(datasets, weights)
        self._length = int(1e12)

    def _init_indices(self, datasets, weights):
        """Build ``self.indices`` (the schedule) and ``self.counts`` (prefix counts).

        ``self.indices[p]`` is the id of the dataset served at cycle position
        ``p``. ``self.counts[d][p]`` is how many of the positions ``0..p``
        belong to dataset ``d``.
        """
        tot_w = sum(weights)
        # Every dataset gets a share of roughly len(datasets) * 200 slots.
        # Clamp to one slot: a share that truncates to 0 would otherwise
        # divide by zero below, and the assertion further down requires every
        # dataset to appear in the schedule at least once anyway.
        freqs = [max(1, int(len(datasets) * 200 * w/tot_w)) for w in weights]
        count = sum(freqs)
        indices = [None] * count
        # Place the most frequent datasets first so they are spread evenly
        # over the whole cycle; rarer ones fill the gaps that remain.
        dataset_ids = list(range(len(datasets)))
        dataset_ids.sort(key = lambda i : freqs[i], reverse=True)
        available_slots = list(range(count))
        for i in dataset_ids:
            assert len(available_slots) >= freqs[i]
            # step >= 1, so int(j * step) yields freqs[i] distinct free slots.
            step = len(available_slots) / freqs[i]
            for j in range(freqs[i]):
                jj = int(j * step)
                indices[available_slots[jj]] = i
            available_slots = [s for s in available_slots if indices[s] is None]
        # Drop slots that were never assigned.
        indices = [i for i in indices if i is not None]

        # Running count of each dataset's occurrences along the schedule.
        dataset_counts = [[0] * len(indices) for _ in range(len(datasets))]
        for i in range(len(datasets)):
            for j in range(len(indices)):
                dataset_counts[i][j] = (indices[j] == i) + (dataset_counts[i][j-1] if j > 0 else 0)
            assert dataset_counts[i][-1] > 0, (i, dataset_counts[i], indices)

        self.indices = indices
        self.counts = dataset_counts

    def __len__(self):
        """Return the nominal length, not the number of distinct items."""
        return self._length

    def __getitem__(self, i):
        """Return the item scheduled at global position ``i``."""
        indices = self.indices
        ii = i % len(indices)  # position inside the current cycle
        idx = indices[ii]      # dataset that owns this position
        assert idx < len(self.datasets)
        counts = self.counts[idx]
        # Items of dataset idx consumed so far: all of its slots in the
        # completed cycles plus its slots before position ii in this cycle.
        j = (i // len(indices)) * counts[-1] + (counts[ii - 1] if ii > 0 else 0)
        assert j < len(self.datasets[idx])
        return self.datasets[idx][j]

In [494]:
# Toy dataset with 5123 distinct items labelled 'A_<i>'.
datasetA = AtomicDataset('A', 5123)

In [495]:
# Toy dataset with 1022 distinct items labelled 'B_<i>'.
datasetB = AtomicDataset('B', 1022)

In [496]:
# Toy dataset with 3033 distinct items labelled 'C_<i>'.
datasetC = AtomicDataset('C', 3033)

In [505]:
# Toy dataset with 232 distinct items labelled 'D_<i>'.
datasetD = AtomicDataset('D', 232)

In [508]:
# Interleave A, B and C with relative weights 5 : 7 : 2000.
complex_dataset = ComposedDataset([datasetA, datasetB, datasetC], [5, 7, 2000])

freqs =  [1, 2, 596]


In [506]:
# Nest the composition: alternate, with equal weight, between the A/B/C mix
# and dataset D.
complex_dataset_2 = ComposedDataset([complex_dataset, datasetD], [1, 1])

freqs =  [200, 200]


In [507]:
# Inspect the first 50 items to eyeball the interleaving pattern.
[complex_dataset_2[i] for i in range(50)]

['C_0',
 'D_0',
 'C_1',
 'D_1',
 'B_0',
 'D_2',
 'C_2',
 'D_3',
 'C_3',
 'D_4',
 'B_1',
 'D_5',
 'C_4',
 'D_6',
 'C_5',
 'D_7',
 'A_0',
 'D_8',
 'C_6',
 'D_9',
 'B_2',
 'D_10',
 'C_7',
 'D_11',
 'C_8',
 'D_12',
 'A_1',
 'D_13',
 'C_9',
 'D_14',
 'C_10',
 'D_15',
 'B_3',
 'D_16',
 'C_11',
 'D_17',
 'B_4',
 'D_18',
 'C_12',
 'D_19',
 'C_13',
 'D_20',
 'A_2',
 'D_21',
 'C_14',
 'D_22',
 'C_15',
 'D_23',
 'B_5',
 'D_24']

In [510]:
import math

In [576]:
def normalize_weights(ws, scale=2e3):
    """Log-compress a list of weights and normalise the result to sum to 1.

    The weights are first turned into fractions of their total, then mapped
    through ``log(1 + fraction * scale)`` and finally re-normalised. The
    logarithm narrows the gap between large and small weights; a larger
    ``scale`` flattens the result more, while a very small ``scale`` leaves
    the fractions almost unchanged.

    Args:
        ws: sequence of non-negative numbers.
        scale: multiplier applied to each fraction before the logarithm.
            Defaults to 2e3, the value this function originally hard-coded.

    Returns:
        List of floats of the same length as ``ws`` summing to 1 (an empty
        list for empty input).

    Raises:
        ZeroDivisionError: if ``ws`` is non-empty and sums to zero, or if
            every compressed weight is zero (e.g. ``scale == 0``).
    """
    w_sum = sum(ws)
    fractions = [w / w_sum for w in ws]
    compressed = [math.log(1 + fraction * scale) for fraction in fractions]
    compressed_sum = sum(compressed)
    return [c / compressed_sum for c in compressed]

In [580]:
# Example: a 50:1 raw weight ratio is compressed to roughly 2:1.
normalize_weights([5000, 100])

[0.672374888615261, 0.32762511138473915]

In [554]:
# Scratch experiment: log-compress the raw weights directly with a 0.15 factor
# (no prior normalisation), then display the compressed values next to their
# normalised shares. Note that this rebinds `x`.
x = [math.log(1 + x * 0.15) for x in [4e4, 5, 100]]
x_sum = sum(x)
x, [y/x_sum for y in x]

([8.699681400989514, 0.5596157879354227, 2.772588722239781],
 [0.7230521852702111, 0.04651106169616683, 0.2304367530336221])