In [None]:
import os, shutil
import pandas as pd
from onekey_algo import get_param_in_cwd

# Build the de-duplicated ID/label table (label_rmdup.csv) from the raw label sheet.
data_root = get_param_in_cwd('data_root')
label_data = pd.read_csv(os.path.join(data_root, 'label.csv'))

# Fail early with a readable message: the string operations below would otherwise
# raise an opaque TypeError/AttributeError on the first empty (NaN) cell.
required_columns = ['EGFR突变状态', '姓名']
incomplete_rows = label_data[required_columns].isna().any(axis=1)
if incomplete_rows.any():
    raise ValueError(
        f"label.csv has {int(incomplete_rows.sum())} row(s) with missing values in {required_columns}"
    )

# Binary label: 1 when the EGFR status text contains '有', otherwise 0.
label_data['label'] = label_data['EGFR突变状态'].map(lambda x: 1 if '有' in x else 0)
# ID is the name with surrounding whitespace stripped and inner spaces removed.
label_data['ID'] = label_data['姓名'].map(lambda x: x.strip().replace(' ', ''))
# One row per ID, ordered by ID.
label_data = label_data.drop_duplicates('ID').sort_values('ID')
# Reuse data_root instead of looking the parameter up a second time.
label_data[['ID', 'label']].to_csv(os.path.join(data_root, 'label_rmdup.csv'), index=False, encoding='utf-8-sig')
label_data

In [None]:
import re

patches_root = os.path.join(data_root, 'Pathology', 'patches')

# Map every patch directory name to its normalised sample name:
#   - names containing a space keep only the part before the first space;
#   - all other names have every run of digits removed.
# Plain files inside patches_root are ignored.
sample_mapping = {
    entry: entry.split(' ')[0] if ' ' in entry else re.sub(r'\d+', '', entry)
    for entry in os.listdir(patches_root)
    if os.path.isdir(os.path.join(patches_root, entry))
}
    
# if len(set(sample_mapping.values()) - set(label_data['ID'])) == 0:
#     for k, v in sample_mapping.items():
#         shutil.move(os.path.join(patches_root, k), os.path.join(patches_root, v))

# pd.DataFrame(sample_mapping.items(), columns=['ori', 'mapping']).to_csv(os.path.join(data_root, 'path_mapping.csv'), index=False)

In [None]:
from glob import glob
import nibabel as nib
import numpy as np

# Collect one image/mask pair per sample from Radiology/nii/both into the flat
# Radiology/images and Radiology/masks folders, both files named <sample>.nii.gz.
radio_dir = os.path.join(data_root, 'Radiology', 'nii', 'both')
image_dir = os.path.join(data_root, 'Radiology', 'images')
masks_dir = os.path.join(data_root, 'Radiology', 'masks')
# shutil.copy does not create missing parent folders.
os.makedirs(image_dir, exist_ok=True)
os.makedirs(masks_dir, exist_ok=True)

for sample in sorted(os.listdir(radio_dir)):
    sample_dir = os.path.join(radio_dir, sample)
    # Stray files (not sample folders) cannot hold an image/mask pair.
    if not os.path.isdir(sample_dir):
        continue
    image_dst = os.path.join(image_dir, f"{sample}.nii.gz")
    mask_dst = os.path.join(masks_dir, f"{sample}.nii.gz")
    # Skip only when BOTH outputs exist, so an interrupted copy is redone.
    if os.path.exists(image_dst) and os.path.exists(mask_dst):
        continue
    # Images sit directly in the sample folder, masks one folder level deeper.
    images = sorted(glob(os.path.join(sample_dir, '*.nii.gz')))
    masks = sorted(glob(os.path.join(sample_dir, '*', '*.nii.gz')))
    has = False
    for i, m in zip(images, masks):
        # The shape is available from the loaded image object; the voxel data
        # does not need to be materialised just to compare dimensions.
        if nib.load(i).shape == nib.load(m).shape:
            has = True
            shutil.copy(i, image_dst)
            shutil.copy(m, mask_dst)
            break
    if not has:
        # No image/mask pair with matching shapes was found for this sample.
        print(sample, 'Error')

In [None]:
from onekey_algo.custom.components.Radiology import get_image_mask_from_dir, diagnose_3d_image_mask_settings

# Gather the image and mask lists from the 'images' and 'masks' sub-folders of
# the Radiology directory, then run the library's 3D image/mask diagnosis on them.
radiology_root = os.path.join(data_root, 'Radiology')
ims, mss = get_image_mask_from_dir(radiology_root, 'images', 'masks')
info = diagnose_3d_image_mask_settings(ims, mss, verbose=True)

In [None]:
# Labelled IDs that have neither a mask file nor a patch directory.
mask_ids = {fname.replace('.nii.gz', '') for fname in os.listdir(masks_dir)}
patch_ids = set(os.listdir(patches_root))
set(label_data['ID']) - mask_ids - patch_ids

# 划分数据集

In [None]:
import pandas as pd
import os
import numpy as np
from onekey_algo.custom.components.comp2 import split_dataset4sol
from onekey_algo import get_param_in_cwd

# Imported here because this cell's import list omits glob; without it the cell
# only works if an earlier cell has already imported it.
from glob import glob

root = get_param_in_cwd('data_root')
label_data = pd.read_csv(os.path.join(root, 'label_rmdup.csv'))

# One row per patch image. glob returns paths in arbitrary order, so sort them to
# keep the row order (and anything derived from it) reproducible across machines.
patch_files = sorted(glob(os.path.join(root, 'Pathology', 'patches', '*', '*.jpg')))
patches = pd.DataFrame(patch_files, columns=['fpath'])
# The ID is the name of the folder that directly contains the patch.
patches['ID'] = patches['fpath'].map(lambda p: os.path.basename(os.path.dirname(p)))
patches['filename'] = patches['fpath'].map(os.path.basename)
patches

In [None]:
import pandas as pd
import os
import numpy as np
from onekey_algo.custom.components.comp2 import split_dataset4sol
from onekey_algo import get_param_in_cwd      

# Split the labelled samples, then write per-split patch lists
# (tab-separated "fpath<TAB>label", no header) into <root>/split_info.
ds = split_dataset4sol(label_data, y_data=label_data['label'], n_trails=5, save_dir=root,
                       random_state=0, cv=False, test_size=0.3, map_ext='.nii.gz')

split_dir = os.path.join(root, 'split_info')
# Harmless if the folder already exists; avoids a failure on the first to_csv otherwise.
os.makedirs(split_dir, exist_ok=True)

patch_paths = patches[['ID', 'fpath']]
for idx, (train, val) in enumerate(ds):
    # Expand each sample row into one row per patch image of that sample.
    # NOTE(review): this assumes the frames yielded by split_dataset4sol carry the bare
    # IDs (no map_ext suffix); otherwise the inner merge is empty - check the counts below.
    train = pd.merge(train, patch_paths, on='ID', how='inner')
    val = pd.merge(val, patch_paths, on='ID', how='inner')
    for subset_name, subset in (('train', train), ('val', val)):
        subset[['fpath', 'label']].to_csv(os.path.join(split_dir, f'Path_{subset_name}-RND-{idx}.txt'),
                                          index=False, header=False, sep='\t')
    # Per split: number of patch rows, then number of distinct sample IDs.
    print(f"随机划分：{idx}，训练集：{train.shape[0]}, {len(np.unique(train['ID']))}, 测试集：{val.shape[0]}, {len(np.unique(val['ID']))}")

In [None]:
# import pandas as pd
# path_group = pd.read_csv('Pathology/group.csv')
# path_group['ID'] = path_group['ID'].map(lambda x: f"{x}.nii.gz")
# rad_group = pd.read_csv('group.csv')
# group = pd.concat([path_group, rad_group], axis=0).drop_duplicates('ID')
# print(group['group'].value_counts())
# group