## Key2

对于病理的任务，将病理的所有tiles转化成histogram或者tfidf的patient特征。

![](http://www.medai.icu/storage/attachments/2022/06/26/n41q4HeDjvIOZnyfoKH28c5YNioGdB7OZwO35XOf.png)

参考论文: [Development and interpretation of a pathomics-based model for the prediction of microsatellite instability in Colorectal Cancer](http://www.medai.icu/download?url=http://www.medai.icu/apiv3/attachment.download?sign=1667478d908313ae1e01543e229d02de&attachmentsId=1061&threadId=230)

In [2]:
import pandas as pd
from onekey_algo.custom.utils import key2
import numpy as np
import os
from onekey_algo import get_param_in_cwd

def map2n(x):
    try:
        return int(x)
    except:
        return 0
    
model = 'resnet50'
train_log_path = rf'{get_param_in_cwd("model_root")}/1/{model}/viz/BST_TRAIN_RESULTS.txt'
train_log = pd.read_csv(train_log_path, names=['fname', 'prob', 'pred', 'gt'], sep='\t')
# train_log['pred'] = train_log['pred'].map(lambda x: map2n(x))
# val_log_path = rf'{get_param_in_cwd("model_root")}/1/{model}/viz/BST_VAL_RESULTS.txt'
val_log_path = 'data/predictions.txt'
val_log = pd.read_csv(val_log_path, names=['fname', 'prob', 'pred', 'gt'], sep='\t')
val_log['pred'] = val_log['pred'].map(lambda x: map2n(x))

log = pd.concat([train_log, val_log], axis=0)
log['prob'] = list(map(lambda x: x[0] if x[1] == 1 else 1-x[0], np.array(log[['prob', 'pred']])))
log['prob'] = log['prob'].round(decimals=2)
log[['group']] = log[['fname']].applymap(lambda x:os.path.basename(x).split('_')[0][:12])
log

Unnamed: 0,fname,prob,pred,gt,group
0,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.02,0,0,TCGA-BR-8589
1,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.00,0,0,TCGA-BR-6455
2,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.04,0,0,TCGA-BR-6852
3,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.98,1,1,TCGA-BR-6565
4,F:\20230608-XueYuHang\Pathology\patches_norm\T...,1.00,1,1,TCGA-CD-8532
...,...,...,...,...,...
25079,F:\20230608-XueYuHang\Pathology\patches_norm\T...,1.00,1,0,TCGA-VQ-AA6J
25080,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.99,1,0,TCGA-VQ-AA6J
25081,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.44,0,0,TCGA-VQ-AA6J
25082,F:\20230608-XueYuHang\Pathology\patches_norm\T...,0.55,1,0,TCGA-VQ-AA6J


In [3]:
log['pred'].value_counts()

0    29427
1    10225
Name: pred, dtype: int64

In [4]:
set(os.listdir(r'F:\20230608-XueYuHang\Pathology\patches_norm')) - set(log['group'])

{'TCGA-BR-4253-01Z-00-DX1.1541e883-de6a-4cd3-a6f5-6866fd5f3439',
 'TCGA-BR-4267-01Z-00-DX1.74687868-fdfb-4b11-8c72-a42703b5d06d',
 'TCGA-BR-6452-01Z-00-DX1.aec8246e-9dce-436d-9faa-df8fa1f6e47c',
 'TCGA-BR-6453-01Z-00-DX1.2e52d64c-e34f-47a9-9e5e-8954b5ef18ca',
 'TCGA-BR-6455-01Z-00-DX1.402682fe-98b4-4381-85da-98510db17af9',
 'TCGA-BR-6456-01Z-00-DX1.2cb7b656-95e1-4f34-99e6-77a051fb79e5',
 'TCGA-BR-6563-01Z-00-DX1.2398c0e0-47a6-4557-8e2d-3b890756900a',
 'TCGA-BR-6564-01Z-00-DX1.ad86e83f-ebfd-4eee-8bf8-7490c5f79a17',
 'TCGA-BR-6565-01Z-00-DX1.0e53f881-e0d7-4906-946a-67f612906476',
 'TCGA-BR-6566-01Z-00-DX1.908c1c0f-12a4-4fa8-a398-ca3b033e7496',
 'TCGA-BR-6705-01Z-00-DX1.1e145e40-56d6-4ebd-9a9b-bc5a0cbe3247',
 'TCGA-BR-6801-01Z-00-DX1.7d2aaa44-16f0-4526-87f2-6e95cdb5df3b',
 'TCGA-BR-6803-01Z-00-DX1.0704a32a-4a03-4c67-96ad-4888ea899c70',
 'TCGA-BR-6852-01Z-00-DX1.9cdca0ad-9351-42c2-8082-3742de914369',
 'TCGA-BR-7196-01Z-00-DX1.651ae8f6-2d2e-4520-9419-29b957e85d03',
 'TCGA-BR-7197-01Z-00-DX1

### 直方图

```python
def key2histogram(data: pd.DataFrame, group_column: str, histo_columns: Union[str, List[str]],
                  histo_lists: Union[list, List[list]] = None, default_value=0, norm: bool = False):
    """
    所有的数据生成直方图特征， 多个histo_columns存在是，所有的特征进行横向拼接。
    Args:
        data: 数据
        group_column: 样本分组的列明，一般为ID
        histo_columns: 用来计算直方图的列，如果为多列，则每列计算完直方图，然后特征拼接
        histo_lists: None或者与histo_columns个数相同，为自己指定特征列表
        default_value: 不存在特征时的默认值
        norm: 要不要归一化。
    Returns:

    """
```

In [5]:
import os

os.makedirs('features', exist_ok=True)
results = key2.key2histogram(log, group_column='group',histo_columns='prob', norm=True)
results.to_csv('features/path_prob_histogram2.csv', header=True, index=False)
display(results)

results = key2.key2histogram(log, group_column='group',histo_columns='pred', norm=True)
results.to_csv('features/path_pred_histogram2.csv', header=True, index=False)
display(results)

[2024-02-07 12:24:21 - key2.py:  59]	INFO	一共有166个样本。


Unnamed: 0,ID,prob-0.0,prob-0.01,prob-0.02,prob-0.03,prob-0.04,prob-0.05,prob-0.06,prob-0.07,prob-0.08,...,prob-0.91,prob-0.92,prob-0.93,prob-0.94,prob-0.95,prob-0.96,prob-0.97,prob-0.98,prob-0.99,prob-1.0
0,TCGA-BR-6452,0.610,0.220,0.033,0.033,0.008,0.008,0.037,0.004,0.008,...,0.000e+00,0.000,0.000e+00,0.000e+00,0.000,0.000e+00,0.000,0.000,0.000,0.000
1,TCGA-BR-6453,0.486,0.197,0.069,0.032,0.035,0.020,0.023,0.009,0.020,...,0.000e+00,0.000,0.000e+00,0.000e+00,0.000,0.000e+00,0.000,0.000,0.000,0.000
2,TCGA-BR-6455,0.744,0.081,0.036,0.022,0.010,0.012,0.009,0.007,0.007,...,5.040e-04,0.000,5.040e-04,5.040e-04,0.000,5.040e-04,0.000,0.000,0.000,0.000
3,TCGA-BR-6456,0.495,0.133,0.065,0.030,0.018,0.016,0.012,0.016,0.000,...,0.000e+00,0.000,2.020e-03,2.020e-03,0.000,2.020e-03,0.000,0.000,0.006,0.000
4,TCGA-BR-6563,0.323,0.156,0.074,0.043,0.038,0.038,0.015,0.013,0.016,...,0.000e+00,0.000,0.000e+00,2.439e-03,0.000,1.220e-03,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,TCGA-VQ-A925,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,3.049e-02,0.012,2.439e-02,2.439e-02,0.049,6.098e-02,0.055,0.098,0.061,0.061
162,TCGA-VQ-A927,0.000,0.000,0.000,0.025,0.025,0.000,0.025,0.050,0.000,...,0.000e+00,0.000,0.000e+00,2.500e-02,0.100,5.000e-02,0.025,0.050,0.050,0.100
163,TCGA-VQ-A92D,0.225,0.125,0.050,0.100,0.050,0.033,0.042,0.017,0.008,...,8.333e-03,0.000,0.000e+00,0.000e+00,0.000,8.333e-03,0.000,0.000,0.008,0.000
164,TCGA-VQ-A94O,0.320,0.150,0.065,0.062,0.032,0.029,0.035,0.024,0.020,...,3.003e-03,0.000,0.000e+00,0.000e+00,0.000,0.000e+00,0.000,0.002,0.000,0.000


[2024-02-07 12:24:22 - key2.py:  59]	INFO	一共有166个样本。


Unnamed: 0,ID,pred-0,pred-1
0,TCGA-BR-6452,1.000,0.000
1,TCGA-BR-6453,0.994,0.006
2,TCGA-BR-6455,0.982,0.018
3,TCGA-BR-6456,0.943,0.057
4,TCGA-BR-6563,0.948,0.052
...,...,...,...
161,TCGA-VQ-A925,0.146,0.854
162,TCGA-VQ-A927,0.275,0.725
163,TCGA-VQ-A92D,0.892,0.108
164,TCGA-VQ-A94O,0.959,0.041


### TF-IDF

```python
def key2tfidf(data: pd.DataFrame, group_column: str, corpus_columns: Union[str, List[str]]):
    """
    所有的数据生成直方图特征， 多个corpus_columns存在是，所有的特征进行横向拼接。
    Args:
        data: 数据
        group_column: 样本分组的列明，一般为ID
        corpus_columns: 用来计算作为语料的列明。
    Returns:

    """
```

In [6]:
results = key2.key2tfidf(log, group_column='group',corpus_columns='prob')
results.to_csv('features/path_prob_tfidf2.csv', header=True, index=False)
display(results)

results = key2.key2tfidf(log, group_column='group',corpus_columns='pred')
results.to_csv('features/path_pred_tfidf2.csv', header=True, index=False)
display(results)

Unnamed: 0,ID,prob00,prob001,prob002,prob003,prob004,prob005,prob006,prob007,prob008,...,prob091,prob092,prob093,prob094,prob095,prob096,prob097,prob098,prob099,prob10
TCGA-BR-6452,TCGA-BR-6452,0.936,0.336,0.050,0.051,0.014,0.014,0.062,0.007,0.015,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
TCGA-BR-6453,TCGA-BR-6453,0.911,0.367,0.127,0.060,0.070,0.042,0.047,0.018,0.044,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
TCGA-BR-6455,TCGA-BR-6455,0.992,0.107,0.047,0.029,0.014,0.017,0.012,0.010,0.010,...,0.001,0.000,0.001,0.001,0.000,0.001,0.000,0.000,0.000,0.000
TCGA-BR-6456,TCGA-BR-6456,0.950,0.254,0.121,0.058,0.038,0.035,0.025,0.033,0.000,...,0.000,0.000,0.007,0.007,0.000,0.006,0.000,0.000,0.020,0.000
TCGA-BR-6563,TCGA-BR-6563,0.850,0.408,0.191,0.112,0.107,0.111,0.042,0.038,0.048,...,0.000,0.000,0.000,0.011,0.000,0.005,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-VQ-A925,TCGA-VQ-A925,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.165,0.068,0.133,0.130,0.257,0.312,0.281,0.496,0.318,0.340
TCGA-VQ-A927,TCGA-VQ-A927,0.000,0.000,0.000,0.068,0.074,0.000,0.074,0.147,0.000,...,0.000,0.000,0.000,0.118,0.467,0.227,0.114,0.225,0.232,0.494
TCGA-VQ-A92D,TCGA-VQ-A92D,0.731,0.403,0.158,0.325,0.175,0.121,0.146,0.058,0.031,...,0.048,0.000,0.000,0.000,0.000,0.045,0.000,0.000,0.046,0.000
TCGA-VQ-A94O,TCGA-VQ-A94O,0.851,0.397,0.168,0.164,0.090,0.085,0.099,0.069,0.060,...,0.014,0.000,0.000,0.000,0.000,0.000,0.000,0.007,0.000,0.000


Unnamed: 0,ID,pred0,pred1
TCGA-BR-6452,TCGA-BR-6452,1.000,0.000
TCGA-BR-6453,TCGA-BR-6453,1.000,0.007
TCGA-BR-6455,TCGA-BR-6455,1.000,0.023
TCGA-BR-6456,TCGA-BR-6456,0.997,0.074
TCGA-BR-6563,TCGA-BR-6563,0.998,0.069
...,...,...,...
TCGA-VQ-A925,TCGA-VQ-A925,0.137,0.991
TCGA-VQ-A927,TCGA-VQ-A927,0.292,0.956
TCGA-VQ-A92D,TCGA-VQ-A92D,0.989,0.149
TCGA-VQ-A94O,TCGA-VQ-A94O,0.999,0.052
