In [1]:
import concurrent.futures
import json
import logging
import os
import traceback
from pathlib import Path
from typing import Union, List

import numpy as np
import pandas as pd
import radiomics
import yaml
from radiomics import featureextractor
import nibabel as nib

import logging
import os
import time
from configparser import ConfigParser
from datetime import datetime

from termcolor import colored

# Configure logging on the root logger so every module logging through it
# shares the same level and output format.
logger = logging.root
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
# Format: "[timestamp - file:line]<TAB>LEVEL<TAB>message".
handler.setFormatter(logging.Formatter("[%(asctime)s - %(filename)s:%(lineno)4s]\t%(levelname)s\t%(message)s",
                                       '%Y-%m-%d %H:%M:%S'))
# Replace (rather than extend) the handler list so this stream handler is the
# only one attached to the root logger.
logger.handlers = [handler]
# Pre-colored hint strings built with termcolor; they are not used in the
# code visible here.
PIPE_HINT = colored('PIPE INPUT SUPPORTED!', 'green', attrs=['blink'])
REQUIRED = colored('REQUIRED!', 'red', attrs=['blink'])

In [2]:
# Only let the radiomics package log at ERROR level and above.
radiomics.logger.setLevel(logging.ERROR)
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen when several libraries bundle their own copy - confirm it is
# still needed in this environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


def judge_image_mask_match(ipath, mpath):
    """Return True when the image and the mask have the same array shape.

    The shape is read from the loaded image objects instead of materializing
    both voxel arrays, so large volumes are not pulled into memory just to
    compare their dimensions.

    Args:
        ipath: Path of the image file, readable by nibabel.
        mpath: Path of the mask file, readable by nibabel.

    Returns:
        bool: True if both shapes are identical, False otherwise.
    """
    return tuple(nib.load(ipath).shape) == tuple(nib.load(mpath).shape)


import csv
def get_image_mask_from_dir(path, csv_path, limit: int = None):
    """Pair every image under ``path/images`` with its mask under ``path/masks``.

    Args:
        path: Directory that contains an ``images`` and a ``masks`` sub-folder.
        csv_path: CSV file with an ``ID`` and a ``label`` column.
        limit: Optional maximum number of pairs to return (``None`` = all).

    Returns:
        Four parallel lists ``(images, masks, labels, base_name)``: absolute
        image paths, absolute mask paths, the label found in the CSV (``None``
        when no ID matches) and the image file names.

    Raises:
        AssertionError: If ``images`` or ``masks`` is missing under ``path``.
    """
    items = os.listdir(path)
    with open(csv_path, 'r', newline='') as csvfile:
        label_dict = {row['ID']: row['label'] for row in csv.DictReader(csvfile)}
    assert 'images' in items and 'masks' in items, \
        f"{path} must contain both an 'images' and a 'masks' folder"
    images_path = Path(path) / 'images'
    masks_path = Path(path) / 'masks'
    base_name = []
    images = []
    masks = []
    labels = []
    # Sorted so the result (and any `limit` slice) is reproducible regardless
    # of the order in which the file system lists the directory.
    for l_ in sorted(os.listdir(images_path)):
        if l_.startswith('.'):
            continue
        f_name, _ = os.path.splitext(l_)
        mask_file = list(masks_path.glob(f_name + '*'))
        if len(mask_file) == 1:
            mask_path = mask_file[0]
        elif masks_path / l_ in mask_file:
            # Several masks share the prefix (e.g. "c1*" also matches
            # "c10.png"); fall back to the identically named mask, as
            # get_pair_from_2dir does.
            mask_path = masks_path / l_
        else:
            continue
        base_name.append(l_)
        images.append(os.path.abspath(os.path.join(images_path, l_)))
        masks.append(os.path.abspath(mask_path))
        # splitext only strips the last suffix ("1.nii.gz" -> "1.nii"), so also
        # try the full file name and the name up to its first dot.
        candidates = (f_name, l_, l_.split('.', 1)[0])
        labels.append(next((label_dict[c] for c in candidates if c in label_dict), None))
    return images[:limit], masks[:limit], labels[:limit], base_name[:limit]



def get_pair_from_2dir(xpath, ypath, strict: bool = True):
    """Build matching image/mask path lists from two directories.

    Args:
        xpath: Directory holding the images.
        ypath: Directory holding the masks.
        strict: When True, an image is paired with the single mask whose name
            starts with the image's stem, or, if several masks share that
            prefix, with the mask carrying exactly the image's file name;
            images without such a mask are skipped. When False, both
            directories are listed, sorted and paired by position.

    Returns:
        ``(images, masks)``: two lists of equal length. Paths are absolute in
        strict mode and ``os.path.join(directory, name)`` otherwise.

    Raises:
        AssertionError: If either argument is not a directory, or the two
            resulting lists differ in length.
    """
    assert os.path.isdir(xpath) and os.path.isdir(ypath)
    x_dir = Path(xpath)
    y_dir = Path(ypath)
    if not strict:
        images = sorted(os.path.join(x_dir, name)
                        for name in os.listdir(x_dir) if not name.startswith('.'))
        masks = sorted(os.path.join(y_dir, name)
                       for name in os.listdir(y_dir) if not name.startswith('.'))
    else:
        images, masks = [], []
        for entry in os.listdir(x_dir):
            # Hidden files (e.g. .DS_Store) are never treated as images.
            if entry.startswith('.'):
                continue
            stem = os.path.splitext(entry)[0]
            candidates = [str(found) for found in y_dir.glob(stem + '*')]
            same_named = os.path.join(y_dir, entry)
            if len(candidates) == 1:
                chosen = candidates[0]
            elif same_named in candidates:
                chosen = same_named
            else:
                continue
            images.append(os.path.abspath(os.path.join(x_dir, entry)))
            masks.append(os.path.abspath(chosen))
    assert len(images) == len(masks), "获取的图像和mask数量不匹配"
    return images, masks


def diagnose_3d_image_mask_settings(ipath, mpath, assume_masks: List[int] = None, verbose: bool = False):
    """Check that image/mask pairs are usable for PyRadiomics feature extraction.

    Each pair is checked for: both files existing, identical array shapes,
    2D or 3D dimensionality, and (optionally) the mask containing exactly the
    expected label values. A summary of the problems found is printed.

    Args:
        ipath: List of image paths.
        mpath: List of mask paths, paired with ``ipath`` by position.
        assume_masks: Label values every mask is expected to contain
            (compared as sorted lists); skipped when empty or None.
        verbose: When True, log every pair and print the union and the
            intersection of the label values seen.

    Returns:
        ``(correct_images, correct_masks, diagnose)``: the pairs that passed
        every check and the list of problem descriptions.
    """
    diagnose = []
    label_set = set()
    join_label_set = None
    correct_images = []
    correct_masks = []
    if len(ipath) != len(mpath):
        diagnose.append(f"图像和Mask的数量不相等，检查图像数据量和Mask数据量。")
    for i, m in zip(ipath, mpath):
        files_present = True
        if not os.path.isfile(i):
            files_present = False
            diagnose.append(f"图像文件：{i}不存在！")
        if not os.path.isfile(m):
            files_present = False
            diagnose.append(f"Mask文件：{m}不存在！")
        if not files_present:
            # Loading would only fail again and add a redundant diagnostic.
            continue
        bi = os.path.basename(i)
        bm = os.path.basename(m)
        try:
            # The module is imported as `nib`. Only the mask voxels are needed
            # (for the label values); the image contributes its shape only.
            image_shape = tuple(nib.load(i).shape)
            mask = np.asanyarray(nib.load(m).dataobj)
            mask_labels = np.unique(mask)
            if verbose:
                label_set |= set(mask_labels)
                if join_label_set is None:
                    join_label_set = set(mask_labels)
                join_label_set &= set(mask_labels)
                logger.info(f'正在检查：{bi}{image_shape}和{bm}{mask.shape}，标签集合：{mask_labels}')

            test_pass = True
            if not image_shape == mask.shape:
                test_pass = False
                diagnose.append(f"图像 {bi}({image_shape}) 和Mask {bm}({mask.shape})的尺寸不匹配")
            if assume_masks and sorted(mask_labels) != sorted(assume_masks):
                test_pass = False
                diagnose.append(f"Mask: {bm}的labels（{mask_labels[:3]}...）与预期（{assume_masks}）不同")
            if len(image_shape) not in (2, 3):
                test_pass = False
                diagnose.append(f"图像 {bi} 和Mask {bm}不是2D或者3D数据")
            if test_pass:
                correct_images.append(i)
                correct_masks.append(m)
        except Exception as e:
            # Best effort: an unreadable pair is reported, not fatal.
            traceback.print_exc()
            diagnose.append(f"图像 {bi} 和Mask {bm} 存在{e}")
    if not diagnose:
        print('检查通过！')
    else:
        print('请检查如下设置：')
        for idx, d in enumerate(diagnose):
            print(f"问题{idx + 1}： {d}")
    if verbose:
        print(f'标签集合为：{label_set}, 共有标签为：{join_label_set}')
    return correct_images, correct_masks, diagnose


class ConventionalRadiomics(object):
    """Extract PyRadiomics features for image/mask pairs and tabulate them.

    Results are kept in ``self._features`` as
    ``{image file name: {label: {'statics': {...}, 'features': {...}}}}``.
    Inside ``statics`` / ``features`` values are grouped by the first two
    underscore-separated parts of the PyRadiomics name, e.g.
    ``{'original_firstorder': {'Mean': 1.0}}``. Names starting with
    ``diagnostics`` go to ``statics``; everything else is converted to float
    and goes to ``features``.
    """

    def __init__(self, params_file: str = None, **params):
        """Load optional settings and build the default extractor.

        Args:
            params_file: Optional ``.json`` / ``.yaml`` / ``.yml`` file whose
                content is used as extractor settings. A missing file is
                logged and ignored.
            **params: Stored on ``self.params``; not consumed by this class.

        Raises:
            ValueError: If ``params_file`` exists but has another extension.
        """
        settings = {}
        self.params_file = params_file
        if params_file is not None:
            if os.path.exists(params_file):
                _, ext = os.path.splitext(params_file)
                with open(params_file) as pf:
                    if ext.lower() == '.json':
                        loaded = json.loads(pf.read())
                    elif ext.lower() in ('.yaml', '.yml'):
                        # NOTE(review): FullLoader is kept from the original;
                        # only point this at trusted configuration files.
                        loaded = yaml.load(pf.read(), Loader=yaml.FullLoader)
                    else:
                        raise ValueError(f"Parameters file {params_file}'s format({ext}) not found!")
                # An empty file parses to None; treat it as "no settings".
                settings = loaded or {}
                logger.info(f"使用{params_file}的配置文件。")
            else:
                logger.warning(f"{params_file}文件不存在，我们将放弃使用配置文件！")
        self.params = params
        self.settings = settings
        self._features = {}
        self.feature_names = set()
        self.statics_names = set()
        self.extractor = None
        self.df = None
        self.errors = []

        # Initialize feature extractor
        self.extractor = self.init_extractor(self.settings)

    @staticmethod
    def _default_settings():
        """Return a fresh copy of the built-in extractor settings."""
        return {
            'binWidth': 25,
            'sigma': [3, 5],
            'resampledPixelSpacing': [1, 1, 1],
            'voxelArrayShift': 1000,
            'normalize': True,
            'normalizeScale': 100,
        }

    @staticmethod
    def _as_list(value):
        """Wrap a single value in a list; lists and tuples pass through."""
        return value if isinstance(value, (list, tuple)) else [value]

    @staticmethod
    def _split_feature_name(name):
        """Split ``type_class_feature`` into its three parts.

        Only the first two underscores separate, so a feature part that itself
        contains an underscore stays intact.
        """
        f_type, c_name, f_name = name.split('_', 2)
        return f_type, c_name, f_name

    @classmethod
    def _parse_feature_vector(cls, feature_vector):
        """Group one extractor result into diagnostics and numeric features.

        Returns:
            ``(statics, features, statics_names, feature_names)`` where the two
            dicts are grouped by ``"<type>_<class>"`` and the two sets hold the
            full names that were seen.
        """
        statics, features = {}, {}
        statics_names, feature_names = set(), set()
        for full_name, value in feature_vector.items():
            f_type, c_name, f_name = cls._split_feature_name(full_name)
            group = f"{f_type}_{c_name}"
            if f_type == 'diagnostics':
                statics_names.add(full_name)
                statics.setdefault(group, {})[f_name] = value
            else:
                feature_names.add(full_name)
                features.setdefault(group, {})[f_name] = float(value)
        return statics, features, statics_names, feature_names

    def init_extractor(self, settings=None):
        """Create a feature extractor with LoG and Wavelet image types enabled.

        Args:
            settings: Optional dict of extractor settings; falls back to
                ``self.settings``. The given keys override the built-in
                defaults (previously they were discarded).
                NOTE(review): the dict is forwarded as flat keyword settings;
                a nested PyRadiomics parameter-file layout is not unpacked
                here - confirm the format of your configuration files.

        Returns:
            The configured ``RadiomicsFeatureExtractor``.
        """
        merged = self._default_settings()
        merged.update(settings or self.settings or {})

        # Instantiate the feature extractor.
        extractor = featureextractor.RadiomicsFeatureExtractor(**merged)

        # Enable the LoG and Wavelet filters.
        extractor.enableImageTypeByName('LoG')
        extractor.enableImageTypeByName('Wavelet')
        # Enable all feature classes, then name the first-order and shape
        # features explicitly.
        extractor.enableAllFeatures()
        extractor.enableFeaturesByName(firstorder=['Energy', 'TotalEnergy', 'Entropy', 'Minimum', '10Percentile',
                                                   '90Percentile', 'Maximum', 'Mean', 'Median',
                                                   'InterquartileRange', 'Range', 'MeanAbsoluteDeviation',
                                                   'RobustMeanAbsoluteDeviation', 'RootMeanSquared',
                                                   'StandardDeviation', 'Skewness', 'Kurtosis', 'Variance',
                                                   'Uniformity'])
        extractor.enableFeaturesByName(shape=['VoxelVolume', 'MeshVolume', 'SurfaceArea', 'SurfaceVolumeRatio',
                                              'Compactness1', 'Compactness2', 'Sphericity',
                                              'SphericalDisproportion', 'Maximum3DDiameter',
                                              'Maximum2DDiameterSlice', 'Maximum2DDiameterColumn',
                                              'Maximum2DDiameterRow', 'MajorAxisLength', 'MinorAxisLength',
                                              'LeastAxisLength', 'Elongation', 'Flatness'])

        return extractor

    def _extract_into(self, extractor, images, masks, labels, store, feature_names, statics_names, errors):
        """Run the extractor over every pair/label and fill the given containers.

        The containers are updated in place after each label, so results
        collected so far survive an interruption. A failing label is logged
        and recorded in ``errors`` as ``(image_name, label, exception)``.
        """
        for image, mask in zip(images, masks):
            image_name = os.path.basename(image)
            store[image_name] = {}
            for label in labels:
                try:
                    logger.info(f'\tExtracting feature from {image} using label {label}')
                    feature_vector = extractor.execute(image, mask, label=label)
                    statics, features, s_names, f_names = self._parse_feature_vector(feature_vector)
                except Exception as e:
                    logger.error(f"{image_name} extract {label} error, {e}")
                    errors.append((image_name, label, e))
                else:
                    statics_names.update(s_names)
                    feature_names.update(f_names)
                    store[image_name][label] = {"statics": statics, 'features': features}

    def extract(self, images: Union[str, List[str]], masks: Union[str, List[str]],
                labels: Union[int, List[int]] = 1, settings=None, workers: int = 1):
        """Extract features for every image/mask pair and every label.

        Args:
            images: One image path or a list of them.
            masks: One mask path or a list of them, paired by position.
            labels: Mask label value(s) to extract features for.
            settings: Optional settings for a one-off extractor instead of
                ``self.extractor``.
            workers: Number of worker processes. Values above 1 use a process
                pool; any other value runs sequentially in this process.

        Returns:
            ``self._features`` (see the class docstring for its layout).
            Sequential failures are also recorded in ``self.errors``; failures
            inside worker processes are only logged by the worker.

        Raises:
            AssertionError: If ``images`` and ``masks`` differ in length.
        """
        logger.info('Extracting features...')
        images = self._as_list(images)
        masks = self._as_list(masks)
        assert len(images) == len(masks), '图像和标注数据必须一一对应。'
        labels = self._as_list(labels)
        if workers > 1:
            with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
                parallel_labels = [labels] * len(images)
                parallel_settings = [settings] * len(images)
                results = executor.map(self.extract_unit, images, masks, parallel_labels, parallel_settings)
                for _f, fn, sn in results:
                    self.statics_names |= sn
                    self.feature_names |= fn
                    self._features.update(_f)
        else:
            extractor = self.init_extractor(settings) if settings is not None else self.extractor
            self._extract_into(extractor, images, masks, labels, self._features,
                               self.feature_names, self.statics_names, self.errors)
        logger.info(f'特征提取完成！')
        return self._features

    def extract_unit(self, images: Union[str, List[str]], masks: Union[str, List[str]],
                     labels: Union[int, List[int]] = 1, settings=None):
        """Extract features without touching the instance's result state.

        Used as the per-task function of the process pool in ``extract``.

        Returns:
            ``(_features, feature_names, statics_names)`` for the given pairs.

        Raises:
            AssertionError: If ``images`` and ``masks`` differ in length.
        """
        _features = {}
        feature_names = set()
        statics_names = set()
        extractor = self.init_extractor(settings) if settings is not None else self.extractor
        images = self._as_list(images)
        masks = self._as_list(masks)
        assert len(images) == len(masks), '图像和标注数据必须一一对应。'
        labels = self._as_list(labels)
        # Errors are logged by _extract_into; the list itself is not returned
        # so the 3-tuple result stays unchanged.
        self._extract_into(extractor, images, masks, labels, _features, feature_names, statics_names, [])
        return _features, feature_names, statics_names

    def _select(self, kind, labels=None):
        """Return ``{image: {label: data[kind]}}``, optionally filtered by label."""
        return {
            image_name: {lbl: payload[kind] for lbl, payload in per_label.items()
                         if labels is None or lbl in labels}
            for image_name, per_label in self._features.items()
        }

    @property
    def features(self, labels: Union[list, tuple, set] = None):
        """Numeric features per image and label; None (with a warning) if empty."""
        if self._features:
            return self._select('features', labels)
        logger.warning(f'No features found! Perhaps you should input images and masks!')
        return None

    @property
    def statics(self, labels: Union[list, tuple, set] = None):
        """Diagnostics per image and label; None (with a warning) if empty."""
        if self._features:
            return self._select('statics', labels)
        logger.warning(f'No features found! Perhaps you should input images and masks!')
        return None

    def get_label_data_frame(self, label: int = 1, column_names=None, ftype='features'):
        """Tabulate the values extracted for one label, one row per image.

        Args:
            label: Mask label whose values are tabulated.
            column_names: Full names to include; defaults to every name seen
                during extraction (sorted).
            ftype: ``'features'`` for numeric features, anything else for the
                diagnostics.

        Returns:
            A DataFrame indexed by image file name with an ``ID`` column
            followed by the value columns; also stored on ``self.df``. Columns
            missing from any sample are dropped with a warning. When nothing
            has been extracted an empty frame is returned.
        """
        if ftype == 'features':
            column_names = column_names or sorted(self.feature_names)
            per_image = self._select('features')
        else:
            column_names = column_names or sorted(self.statics_names)
            per_image = self._select('statics')
        if not per_image:
            logger.warning(f'No features found! Perhaps you should input images and masks!')
        not_has = set()
        for per_label in per_image.values():
            if per_label and label in per_label:
                groups = per_label[label]
                for name in column_names:
                    f_type, c_name, f_name = self._split_feature_name(name)
                    # Check the feature itself, not only its group, so the row
                    # building below cannot hit a missing key.
                    if f_name not in groups.get(f"{f_type}_{c_name}", {}):
                        not_has.add(name)
        column_names = sorted(set(column_names) - not_has)
        if not_has:
            logger.warning(f"存在某些特征{not_has}在提取的时候并不是出现在所有样本中，一般可以忽略这个问题。")
        indexes = []
        rows = []
        for image_name, per_label in per_image.items():
            if not per_label:
                continue
            if label not in per_label:
                logger.warning(f"{image_name}的label={label}没有计算出任何特征！"
                               f"你可能需要修改：radiomics.extract(images, masks, labels=[{label}])")
                continue
            groups = per_label[label]
            row = [image_name]
            for name in column_names:
                f_type, c_name, f_name = self._split_feature_name(name)
                row.append(groups[f"{f_type}_{c_name}"][f_name])
            indexes.append(image_name)
            rows.append(row)
        self.df = pd.DataFrame(rows, columns=['ID'] + column_names, index=indexes)
        return self.df


In [None]:
from flask import Flask, make_response, request, jsonify
def form_matrix(data: pd.DataFrame):
    """Convert a DataFrame into a plain dict of axis labels and cell values.

    Returns:
        ``{'x': column labels, 'y': index labels, 'data': rows as nested lists}``.
    """
    column_labels = list(data.columns)
    row_labels = list(data.index)
    cell_values = data.to_numpy().tolist()
    return {'x': column_labels, 'y': row_labels, 'data': cell_values}
# Dataset root; it must contain the `images` and `masks` sub-folders.
model_root = r'D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d'
# Raw string so the backslashes of the Windows path are not read as escapes.
label_csv_path = r'D:\data\data_chi_Mediastinum\label.csv'
images, masks, labels, base_name = get_image_mask_from_dir(model_root, label_csv_path)
conv_radiomics = ConventionalRadiomics()
# Output CSV with one row per image.
csv_file_path = 'features.csv'
if images and masks:
    print(f'Start extracting feature from {model_root}')
    # The default label (1) is used: the masks are binary segmentations.
    conv_radiomics.extract(images, masks)
    feature = conv_radiomics.get_label_data_frame()

    # The 'ID' column holds the image file name. Assign by ID rather than by
    # position, so images that produced no row cannot cause a length mismatch.
    feature['file_name'] = feature['ID']
    # Store the class label next to the features; the later feature-selection
    # step reads it back from the CSV.
    feature['label'] = feature['ID'].map(dict(zip(base_name, labels)))

    feature.to_csv(csv_file_path, index=False)
    print(f"Feature data saved to {csv_file_path}")
else:
    print(f'No image/mask pairs found under {model_root}; nothing was extracted.')

[2024-11-20 10:50:44 - 268565349.py: 202]	INFO	Extracting features...
[2024-11-20 10:50:44 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\1.nii.gz using label 1


Start extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d


[2024-11-20 10:50:51 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\10.nii.gz using label 1
[2024-11-20 10:50:55 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\100.nii.gz using label 1
[2024-11-20 10:50:58 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\101.nii.gz using label 1
[2024-11-20 10:51:03 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\102.nii.gz using label 1
[2024-11-20 10:51:09 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\103.nii.gz using label 1
[2024-11-20 10:51:13 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\105.nii.gz using label 1
[2024-11-20 10:51:16 - 268565349.py: 222]	INFO		Extracting 

[2024-11-20 10:55:13 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\150.nii.gz using label 1
[2024-11-20 10:55:18 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\151.nii.gz using label 1
[2024-11-20 10:55:24 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\152.nii.gz using label 1
[2024-11-20 10:55:30 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\153.nii.gz using label 1
[2024-11-20 10:55:35 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\154.nii.gz using label 1
[2024-11-20 10:55:40 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\155.nii.gz using label 1
[2024-11-20 10:55:44 - 268565349.py: 222]	INFO		Extracting

[2024-11-20 10:59:33 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\202.nii.gz using label 1
[2024-11-20 10:59:40 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\203.nii.gz using label 1
[2024-11-20 10:59:46 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\204.nii.gz using label 1
[2024-11-20 10:59:53 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\205.nii.gz using label 1
[2024-11-20 11:00:02 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\206.nii.gz using label 1
[2024-11-20 11:00:09 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\207.nii.gz using label 1
[2024-11-20 11:00:15 - 268565349.py: 222]	INFO		Extracting

[2024-11-20 11:06:26 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\35.nii.gz using label 1
[2024-11-20 11:06:32 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\39.nii.gz using label 1
[2024-11-20 11:06:39 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\40.nii.gz using label 1
[2024-11-20 11:06:43 - 268565349.py: 222]	INFO		Extracting feature from D:\data\data_chi_Mediastinum\data_chi_Mediastinum_3d\images\41.nii.gz using label 1


In [None]:
import csv
import json

def read_fold_json(json_file):
    """Parse the JSON file at ``json_file`` and return the decoded object."""
    with open(json_file, 'r') as handle:
        return json.load(handle)

def add_fold_column(csv_file, fold_data, output_file):
    """Copy a CSV file and append a ``fold`` column looked up from ``fold_data``.

    Args:
        csv_file: Input CSV; its header must contain an ``ID`` column.
        fold_data: Mapping ``fold name -> collection of IDs``. The first fold
            (in mapping order) whose collection contains a row's ID is used.
        output_file: Path of the CSV to write.

    Raises:
        ValueError: If the input has no header row or no ``ID`` column.
    """
    with open(csv_file, 'r', newline='') as f:
        data = list(csv.reader(f))
    if not data:
        raise ValueError(f"{csv_file} is empty; expected a header row containing 'ID'")

    header = data[0]
    filename_index = header.index('ID')
    header.append('fold')

    for row in data[1:]:
        if not row:
            # Blank line in the input: nothing to look up.
            continue
        filename = row[filename_index]
        # Always append a value so every row keeps the same number of columns,
        # even when the ID belongs to no fold.
        row.append(next((fold for fold, filenames in fold_data.items() if filename in filenames), ''))

    with open(output_file, 'w', newline='') as f:
        csv.writer(f).writerows(data)

if __name__ == "__main__":
    # Input and output files
    features_csv = 'features.csv'
    fold_json = 'modified_fold_mapping.json'
    output_csv = 'features_with_fold.csv'

    # Read the fold mapping from the JSON file
    fold_data = read_fold_json(fold_json)

    # Write a copy of the feature CSV with the fold column appended
    add_fold_column(features_csv, fold_data, output_csv)

    print(f"Successfully added fold column. Output file: {output_csv}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# 1. Load the feature table that already carries the fold column.
csv_file = 'features_with_fold.csv'
df = pd.read_csv(csv_file)

# Keep the bookkeeping columns aside.
ids = df['ID']
labels = df['label']
folds = df['fold']

# Feature matrix: every remaining numeric column. Non-numeric leftovers (for
# example a file-name column) cannot be fed to the classifier.
X = df.drop(columns=['ID', 'label', 'fold']).select_dtypes(include='number')

# 2. Feature selection, using random-forest importances as the criterion.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, labels)

# Keep at most `max_features` features.
max_features = 22
model = SelectFromModel(clf, max_features=max_features, prefit=True)
selected_features = model.transform(X)

# Positions of the selected features within X (not within df).
selected_feature_indices = model.get_support(indices=True)

# 3. Combine ID, label and fold with the selected features. The column names
# must come from X, because the indices refer to X's columns.
selected_df = pd.DataFrame(selected_features, columns=X.columns[selected_feature_indices], index=df.index)
selected_df.insert(0, 'ID', ids)
selected_df['label'] = labels
selected_df['fold'] = folds

# 4. Save as a new CSV file.
selected_file = f'selected_features_{max_features}_with_id_label_fold.csv'
selected_df.to_csv(selected_file, index=False)

print(f"保存选中特征的文件: {selected_file}")
