## Key2

对于病理任务，将病理的所有 tiles 转化成 histogram 或者 TF-IDF 形式的 patient 级特征。

![](http://www.medai.icu/storage/attachments/2022/06/26/n41q4HeDjvIOZnyfoKH28c5YNioGdB7OZwO35XOf.png)

参考论文: [Development and interpretation of a pathomics-based model for the prediction of microsatellite instability in Colorectal Cancer](http://www.medai.icu/download?url=http://www.medai.icu/apiv3/attachment.download?sign=1667478d908313ae1e01543e229d02de&attachmentsId=1061&threadId=230)

In [4]:
import pandas as pd
from onekey_algo.custom.utils import key2
import numpy as np
import os

# Load the prediction table from disk.
predictions_csv = r'results/ALL_DL_PREDICTIONS.csv'
log = pd.read_csv(predictions_csv)
# Rename the columns positionally; the file is expected to have exactly four.
log.columns = ['group', 'prob', 'pred', 'gt']
# Keep two decimals so the probabilities take a limited set of distinct values.
log['prob'] = log['prob'].round(2)
# Last expression of the cell: renders the table in the notebook.
log

Unnamed: 0,group,prob,pred,gt
0,primary-214.nii.gz,0.03,0,0
1,primary-214.nii.gz,0.10,0,0
2,primary-214.nii.gz,0.08,0,0
3,primary-214.nii.gz,0.10,0,0
4,primary-214.nii.gz,0.11,0,0
...,...,...,...,...
3469,secondary2-50.nii.gz,0.79,1,1
3470,secondary2-50.nii.gz,0.79,1,1
3471,secondary2-50.nii.gz,0.48,0,1
3472,secondary2-50.nii.gz,0.36,0,1


### 直方图

```python
def key2histogram(data: pd.DataFrame, group_column: str, histo_columns: Union[str, List[str]],
                  histo_lists: Union[list, List[list]] = None, default_value=0, norm: bool = False):
    """
    Build histogram features from all of the data. When several ``histo_columns``
    are given, the features of every column are concatenated horizontally.

    Args:
        data: The input data.
        group_column: Name of the column that groups rows into samples, usually the ID.
        histo_columns: Column(s) used to compute the histogram. With several columns,
            a histogram is computed per column and the features are then concatenated.
        histo_lists: None, or as many lists as there are ``histo_columns``; lets the
            caller specify the feature list explicitly.
        default_value: Default value used when a feature does not exist.
        norm: Whether to normalize.

    Returns:
        Not documented in the original. NOTE(review): presumably a DataFrame with
        one row per group -- confirm against the implementation.
    """
```

In [5]:
# Make sure the output directory exists before any feature file is written.
os.makedirs('features', exist_ok=True)

# One (source column, output file) pair per histogram feature set, in run order.
histogram_jobs = [
    ('prob', 'features/rad_prob_histogram.csv'),
    ('pred', 'features/rad_pred_histogram.csv'),
]
for histo_column, output_csv in histogram_jobs:
    # Aggregate the rows of each group into a normalized histogram of this column.
    results = key2.key2histogram(log, group_column='group', histo_columns=histo_column, norm=True)
    results.to_csv(output_csv, header=True, index=False)
    display(results)

[2024-09-15 15:55:39 - key2.py:  59]	INFO	一共有503个样本。


Unnamed: 0,ID,prob-0.0,prob-0.01,prob-0.02,prob-0.03,prob-0.04,prob-0.05,prob-0.06,prob-0.07,prob-0.08,...,prob-0.9,prob-0.91,prob-0.92,prob-0.93,prob-0.95,prob-0.96,prob-0.97,prob-0.98,prob-0.99,prob-1.0
0,primary-1.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,primary-10.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,primary-100.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.143,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,primary-101.nii.gz,0.0,0.0,0.429,0.143,0.143,0.0,0.143,0.143,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,primary-102.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,secondary2-70.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
499,secondary2-71.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.143,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
500,secondary2-72.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501,secondary2-8.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


[2024-09-15 15:55:39 - key2.py:  59]	INFO	一共有503个样本。


Unnamed: 0,ID,pred-0,pred-1
0,primary-1.nii.gz,0.286,0.714
1,primary-10.nii.gz,1.000,0.000
2,primary-100.nii.gz,1.000,0.000
3,primary-101.nii.gz,1.000,0.000
4,primary-102.nii.gz,1.000,0.000
...,...,...,...
498,secondary2-70.nii.gz,1.000,0.000
499,secondary2-71.nii.gz,1.000,0.000
500,secondary2-72.nii.gz,1.000,0.000
501,secondary2-8.nii.gz,0.857,0.143


### TF-IDF

```python
def key2tfidf(data: pd.DataFrame, group_column: str, corpus_columns: Union[str, List[str]]):
    """
    Build TF-IDF features from all of the data. When several ``corpus_columns``
    are given, the features of every column are concatenated horizontally.

    Args:
        data: The input data.
        group_column: Name of the column that groups rows into samples, usually the ID.
        corpus_columns: Name(s) of the column(s) whose values are used as the corpus.

    Returns:
        Not documented in the original. NOTE(review): presumably a DataFrame with
        one row per group -- confirm against the implementation.
    """
```

In [6]:
# One (source column, output file) pair per TF-IDF feature set, in run order.
tfidf_jobs = [
    ('prob', 'features/rad_prob_tfidf.csv'),
    ('pred', 'features/rad_pred_tfidf.csv'),
]
for corpus_column, output_csv in tfidf_jobs:
    # Treat the values of this column within each group as that group's corpus.
    results = key2.key2tfidf(log, group_column='group', corpus_columns=corpus_column)
    results.to_csv(output_csv, header=True, index=False)
    display(results)

Unnamed: 0,ID,prob00,prob001,prob002,prob003,prob004,prob005,prob006,prob007,prob008,...,prob09,prob091,prob092,prob093,prob095,prob096,prob097,prob098,prob099,prob10
primary-1.nii.gz,primary-1.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
primary-10.nii.gz,primary-10.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
primary-100.nii.gz,primary-100.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.314,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
primary-101.nii.gz,primary-101.nii.gz,0.0,0.0,0.898,0.261,0.232,0.0,0.193,0.185,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
primary-102.nii.gz,primary-102.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
secondary2-70.nii.gz,secondary2-70.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
secondary2-71.nii.gz,secondary2-71.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.324,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
secondary2-72.nii.gz,secondary2-72.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
secondary2-8.nii.gz,secondary2-8.nii.gz,0.0,0.0,0.000,0.000,0.000,0.0,0.000,0.000,0.0,...,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,ID,pred0,pred1
primary-1.nii.gz,primary-1.nii.gz,0.129,0.992
primary-10.nii.gz,primary-10.nii.gz,1.000,0.000
primary-100.nii.gz,primary-100.nii.gz,1.000,0.000
primary-101.nii.gz,primary-101.nii.gz,1.000,0.000
primary-102.nii.gz,primary-102.nii.gz,1.000,0.000
...,...,...,...
secondary2-70.nii.gz,secondary2-70.nii.gz,1.000,0.000
secondary2-71.nii.gz,secondary2-71.nii.gz,1.000,0.000
secondary2-72.nii.gz,secondary2-72.nii.gz,1.000,0.000
secondary2-8.nii.gz,secondary2-8.nii.gz,0.891,0.455
