## Key2

对于病理任务，将每位患者（patient）的所有tiles的预测结果聚合为histogram或TF-IDF形式的患者级特征。

![](http://www.medai.icu/storage/attachments/2022/06/26/n41q4HeDjvIOZnyfoKH28c5YNioGdB7OZwO35XOf.png)

参考论文: [Development and interpretation of a pathomics-based model for the prediction of microsatellite instability in Colorectal Cancer](http://www.medai.icu/download?url=http://www.medai.icu/apiv3/attachment.download?sign=1667478d908313ae1e01543e229d02de&attachmentsId=1061&threadId=230)

In [None]:
import pandas as pd
from onekey_algo.custom.utils import key2
import numpy as np
import os
from onekey_algo import get_param_in_cwd

def map2n(x):
    """Convert ``x`` to an ``int``, returning ``0`` when the conversion fails.

    Args:
        x: Value to convert; anything ``int()`` accepts (numbers, numeric strings).

    Returns:
        ``int(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: non-numeric text or NaN;
        # OverflowError: +/-inf. Anything else (e.g. KeyboardInterrupt) propagates.
        return 0
    
# Backbone whose prediction logs are loaded below.
model = 'densenet121'
# Per-backbone number, only referenced by the commented-out `Epoch-*.txt` paths below.
sm = {'inception_v3': 3, 'resnet50': 5, 'resnet101': 3, 'densenet121': 4, 'vgg19': 5}
model_root = os.path.join(get_param_in_cwd('data_root'), 'models')

# Training-split log: tab-separated, no header row; columns are named here.
train_log_path = rf'{model_root}/{model}/viz/BST_TRAIN_RESULTS.txt'
# train_log_path = rf'{get_param_in_cwd("model_root")}/{model}/train/Epoch-{sm[model]}.txt'
train_log = pd.read_csv(train_log_path, names=['fname', 'prob', 'pred', 'gt'], sep='\t')
# Coerce predictions to int; values that cannot be converted become 0 (see map2n).
train_log['pred'] = train_log['pred'].map(map2n)

# Validation-split log, same layout as the training log.
val_log_path = rf'{model_root}/{model}/viz/BST_VAL_RESULTS.txt'
# val_log_path = rf'{get_param_in_cwd("model_root")}/{model}/valid/Epoch-{sm[model]}.txt'
val_log = pd.read_csv(val_log_path, names=['fname', 'prob', 'pred', 'gt'], sep='\t')
val_log['pred'] = val_log['pred'].map(map2n)

# Stack both splits row-wise (the original per-split row indices are kept).
log = pd.concat([train_log, val_log], axis=0)
# Keep `prob` where pred == 1, otherwise use 1 - prob.
# NOTE(review): presumably `prob` is the predicted-class probability and this turns it
# into the probability of label 1 for a binary task - confirm against the log writer.
prob_values = log['prob'].to_numpy()
log['prob'] = np.where(log['pred'].to_numpy() == 1, prob_values, 1 - prob_values)
log['prob'] = log['prob'].round(decimals=2)
# Group key = name of the directory that directly contains each file.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas.
log['group'] = log['fname'].map(lambda x: os.path.basename(os.path.dirname(x)))
log

In [None]:
# Show how many log rows carry each predicted label.
log['pred'].value_counts()

### 直方图

```python
def key2histogram(data: pd.DataFrame, group_column: str, histo_columns: Union[str, List[str]],
                  histo_lists: Union[list, List[list]] = None, default_value=0, norm: bool = False):
    """
    Build histogram features from all of the data; when several histo_columns are given,
    the features of every column are concatenated horizontally.
    Args:
        data: The input data.
        group_column: Name of the column used to group rows into samples, usually the ID.
        histo_columns: Column(s) used to compute the histogram; with multiple columns, a histogram
            is computed for each column and the resulting features are concatenated.
        histo_lists: None, or as many lists as there are histo_columns, giving a user-specified
            feature list for each.
        default_value: Default value used when a feature does not exist.
        norm: Whether to normalize.
    Returns:
        Not documented in the original. NOTE(review): this notebook calls ``.to_csv`` on the
        result, so it is presumably a pandas DataFrame - confirm against the implementation.
    """
```

In [None]:
import os

# All feature tables are written under ./features; create it if it is missing.
os.makedirs('features', exist_ok=True)

# (source column, output CSV) pairs: one normalized per-group histogram table each.
for column, csv_path in (
    ('prob', 'features/path_prob_histogram.csv'),
    ('pred', 'features/path_pred_histogram.csv'),
):
    results = key2.key2histogram(log, group_column='group', histo_columns=column, norm=True)
    results.to_csv(csv_path, header=True, index=False)
    display(results)

### TF-IDF

```python
def key2tfidf(data: pd.DataFrame, group_column: str, corpus_columns: Union[str, List[str]]):
    """
    Build features from all of the data; when several corpus_columns are given, the features of
    every column are concatenated horizontally.
    NOTE(review): the original text said "histogram features", which looks copied from
    key2histogram; going by the function name these are presumably TF-IDF features - confirm.
    Args:
        data: The input data.
        group_column: Name of the column used to group rows into samples, usually the ID.
        corpus_columns: Name(s) of the column(s) used as the corpus.
    Returns:
        Not documented in the original. NOTE(review): this notebook calls ``.to_csv`` on the
        result, so it is presumably a pandas DataFrame - confirm against the implementation.
    """
```

In [None]:
# (corpus column, output CSV) pairs: one per-group TF-IDF table each.
for column, csv_path in (
    ('prob', 'features/path_prob_tfidf.csv'),
    ('pred', 'features/path_pred_tfidf.csv'),
):
    results = key2.key2tfidf(log, group_column='group', corpus_columns=column)
    results.to_csv(csv_path, header=True, index=False)
    display(results)