## 统计分析

通过指定统计分析字段，得到每个特征的p_value，所有的p_value计算都是基于Ttest计算。支持指定不同的分组`group`，例如train、val、test等分组统计。

对于两大类不同的特征

1. 离散特征，统计数量以及占比。
2. 连续特征，统计均值、方差。

In [16]:
import pandas as pd
import numpy as np
from onekey_algo import OnekeyDS as okds
from onekey_algo import get_param_in_cwd
from onekey_algo.custom.utils import print_join_info

# Notebook-level settings read from the working-directory config, each with a fallback default.
task = get_param_in_cwd('task_column') or 'label'
p_value = get_param_in_cwd('p_value') or 0.05

# Clinical table: point `clinic_file` at your own data. Only one row per ID is kept.
clinic_table = pd.read_csv(get_param_in_cwd('clinic_file'))
test_data = clinic_table.drop_duplicates('ID')

# Optional user overrides for the column selections used by later cells.
stats_columns_settings = get_param_in_cwd('stats_columns')
continuous_columns_settings = get_param_in_cwd('continuous_columns')
mapping_columns_settings = get_param_in_cwd('mapping_columns')

# Drop the task (label) column from the clinical table.
feature_columns = [name for name in test_data.columns if name != task]
test_data = test_data[feature_columns]


def _as_image_name(sample_id):
    """Append '.nii.gz' unless the ID already ends with '.nii.gz' or '.nii'."""
    if f"{sample_id}".endswith(('.nii.gz', '.nii')):
        return sample_id
    return f"{sample_id}.nii.gz"


test_data['ID'] = test_data['ID'].map(_as_image_name)

# Group assignment per ID, again de-duplicated on ID.
group_table = pd.read_csv(get_param_in_cwd('label_file'))
group_info = group_table.drop_duplicates('ID')

print_join_info(test_data, group_info)
# Keep only the samples present in both tables.
test_data = test_data.merge(group_info, on='ID', how='inner')
test_data



Unnamed: 0,ID,age at surgery,sex,height,weight,BMI,SMA,SMI,sarcopenia,clinical stage,...,Preoperative serum CEA,Surgical approach,Neoadjuvant therapy,Adjuvant chemotherapy,Adjuvant radiotherapy,pN stage,pT stage,lymphovascular invasion,perineural invasion,group
0,primary-1.nii.gz,64,male,1.65,80.0,29.385,161.3,59.247,0,Ⅱ,...,<5,laraoscope,0,0,0,0,3,0,0,train
1,primary-3.nii.gz,56,male,1.78,80.0,25.249,179.4,56.622,0,Ⅲ,...,<5,laraoscope,1,1,0,2,3,0,0,train
2,primary-4.nii.gz,56,male,1.7,61.0,21.107,134.2,46.436,0,Ⅱ,...,≥5,laraoscope,1,1,0,0,2,0,1,train
3,primary-6.nii.gz,67,male,1.8,76.0,23.457,169.4,52.284,0,Ⅱ,...,<5,laraoscope,0,1,1,1,3,0,0,val
4,primary-7.nii.gz,68,male,1.78,85.0,26.827,160.7,50.72,0,Ⅱ,...,<5,laraoscope,0,1,1,0,3,1,0,val
5,primary-10.nii.gz,63,male,1.78,74.0,23.356,144.8,45.701,0,Ⅱ,...,<5,laraoscope,1,1,0,0,2,0,0,train
6,primary-11.nii.gz,49,female,1.65,57.0,20.937,85.9,31.552,1,Ⅲ,...,<5,laraoscope,1,0,0,0,0,0,0,val
7,primary-12.nii.gz,79,female,1.55,40.0,16.649,61.5,25.598,1,Ⅱ,...,≥5,laraoscope,0,0,0,0,3,0,0,train
8,primary-13.nii.gz,63,male,1.75,82.0,26.776,151.5,49.469,0,Ⅱ,...,≥5,laraoscope,0,1,0,0,3,0,0,val
9,primary-14.nii.gz,68,male,1.7,71.0,24.567,155.9,53.945,0,Ⅱ,...,≥5,laraoscope,0,0,0,0,3,0,0,train


In [17]:
# Show how many samples fall into each value of the `group` column.
test_data['group'].value_counts()

train    268
val      115
test      75
Name: group, dtype: int64

# 特征名称处理

去掉所有特征名称中的特殊字符。

In [18]:
import re

def map_cnames(x):
    """Turn a raw column header into a plain feature name.

    Everything from the first '（', '|' or '(' onward is discarded (the '|'
    sits inside the character class, so it is matched literally). In the
    remaining text '-', ' ', '/' and '+' become '_', '>' is removed, and
    leading/trailing underscores are stripped.
    """
    head = re.split('[（|(]', x)[0]
    # One translation pass replaces the chain of single-character substitutions.
    char_table = str.maketrans({'-': '_', ' ': '_', '/': '_', '+': '_', '>': None})
    return head.translate(char_table).strip('_')

# Apply the name clean-up to every column header, then display the result.
test_data.columns = [map_cnames(header) for header in test_data.columns]
test_data.columns

Index(['ID', 'age_at_surgery', 'sex', 'height', 'weight', 'BMI', 'SMA', 'SMI',
       'sarcopenia', 'clinical_stage', 'clinical_T_stage', 'Lymph_node_status',
       'location', 'Preoperative_serum_CEA', 'Surgical_approach',
       'Neoadjuvant_therapy', 'Adjuvant_chemotherapy', 'Adjuvant_radiotherapy',
       'pN_stage', 'pT_stage', 'lymphovascular_invasion',
       'perineural_invasion', 'group'],
      dtype='object')

# 分析数据

获取待分析的特征列名，如未指定，则自动侦测。

In [19]:
# Column names to leave out of the analysis (none by default).
dc = []

# Use the configured column list when given; otherwise take every column
# between the first (ID) and the last (group) one.
candidate_columns = stats_columns_settings or list(test_data.columns[1:-1])
stats_columns = [name for name in candidate_columns if name not in dc]

# Reduce the table to ID + analysed features + group, in that order.
selected_columns = ['ID', *stats_columns, 'group']
test_data = test_data.copy()[selected_columns]
test_data

Unnamed: 0,ID,age_at_surgery,sex,height,weight,BMI,SMA,SMI,sarcopenia,clinical_stage,...,Preoperative_serum_CEA,Surgical_approach,Neoadjuvant_therapy,Adjuvant_chemotherapy,Adjuvant_radiotherapy,pN_stage,pT_stage,lymphovascular_invasion,perineural_invasion,group
0,primary-1.nii.gz,64,male,1.65,80.0,29.385,161.3,59.247,0,Ⅱ,...,<5,laraoscope,0,0,0,0,3,0,0,train
1,primary-3.nii.gz,56,male,1.78,80.0,25.249,179.4,56.622,0,Ⅲ,...,<5,laraoscope,1,1,0,2,3,0,0,train
2,primary-4.nii.gz,56,male,1.7,61.0,21.107,134.2,46.436,0,Ⅱ,...,≥5,laraoscope,1,1,0,0,2,0,1,train
3,primary-6.nii.gz,67,male,1.8,76.0,23.457,169.4,52.284,0,Ⅱ,...,<5,laraoscope,0,1,1,1,3,0,0,val
4,primary-7.nii.gz,68,male,1.78,85.0,26.827,160.7,50.72,0,Ⅱ,...,<5,laraoscope,0,1,1,0,3,1,0,val
5,primary-10.nii.gz,63,male,1.78,74.0,23.356,144.8,45.701,0,Ⅱ,...,<5,laraoscope,1,1,0,0,2,0,0,train
6,primary-11.nii.gz,49,female,1.65,57.0,20.937,85.9,31.552,1,Ⅲ,...,<5,laraoscope,1,0,0,0,0,0,0,val
7,primary-12.nii.gz,79,female,1.55,40.0,16.649,61.5,25.598,1,Ⅱ,...,≥5,laraoscope,0,0,0,0,3,0,0,train
8,primary-13.nii.gz,63,male,1.75,82.0,26.776,151.5,49.469,0,Ⅱ,...,≥5,laraoscope,0,1,0,0,3,0,0,val
9,primary-14.nii.gz,68,male,1.7,71.0,24.567,155.9,53.945,0,Ⅱ,...,≥5,laraoscope,0,0,0,0,3,0,0,train


# 特征队列映射

所有需要进行特征映射的列；如未指定，则自动判断（默认选取所有非数值类型的列）。

In [20]:
# Columns to encode numerically: the configured list when given, otherwise
# every object-typed feature column. The table layout at this point is
# ID + stats columns + group, so the feature columns are exactly [1:-1];
# the previous [1:-2] slice silently skipped the last feature.
mapping_columns = mapping_columns_settings or [
    c for c in test_data.columns[1:-1] if test_data[c].dtype == object
]
mapping_columns

['sex',
 'clinical_stage',
 'location',
 'Preoperative_serum_CEA',
 'Surgical_approach']

# 数据映射

针对所有非数值形式的数据，可以进行类别映射。

In [21]:
from onekey_algo.custom.utils import map2numerical

# Encode the listed non-numeric columns as integer codes; `mapping` holds the
# value -> code table for each mapped column.
# NOTE(review): later cells keep working on `test_data` rather than `data`, so
# `map2numerical` presumably also modifies `test_data` in place — confirm.
data, mapping = map2numerical(test_data, mapping_columns=mapping_columns)
mapping

{'sex': {'female': 0, 'male': 1},
 'clinical_stage': {'Ⅱ': 0, 'Ⅲ': 1},
 'location': {'5-10cm': 0, '<5cm': 1, '>10cm': 2},
 'Preoperative_serum_CEA': {'<5': 0, '≥5': 1},
 'Surgical_approach': {'laraoscope': 0, 'open': 1}}

In [22]:
# Inspect the dtype of every column after the mapping step.
data.dtypes

ID                          object
age_at_surgery               int64
sex                          int64
height                     float64
weight                     float64
BMI                        float64
SMA                        float64
SMI                        float64
sarcopenia                   int64
clinical_stage               int64
clinical_T_stage             int64
Lymph_node_status            int64
location                     int64
Preoperative_serum_CEA       int64
Surgical_approach            int64
Neoadjuvant_therapy          int64
Adjuvant_chemotherapy        int64
Adjuvant_radiotherapy        int64
pN_stage                     int64
pT_stage                     int64
lymphovascular_invasion      int64
perineural_invasion          int64
group                       object
dtype: object

# 连续特征列

自动识别所有可能的连续特征列。如果列不是整数类型，或者列中不同取值的个数超过8个，则被认定为连续特征。

In [23]:
from onekey_algo.custom.components.comp1 import fillna

# Fill missing values with the project helper, using its '50%' fill mode
# (presumably the per-column median — confirm against `fillna`).
test_data = fillna(test_data, fill_mod='50%')
continuous_columns = []

# Columns whose values are all whole-number floats are converted to int so the
# dtype test below can recognise them as discrete.
for col in test_data.columns:
    if test_data[col].apply(lambda x: x.is_integer() if isinstance(x, float) else False).all():
        test_data[col] = test_data[col].astype(int)

# A feature counts as continuous when it has more than 8 distinct values or
# its dtype does not lie between int8 and int64.
for c in stats_columns:
    if len(np.unique(test_data[c])) > 8 or not np.int8 <= test_data[c].dtype <= np.int64:
        continuous_columns.append(c)

# An explicit configuration overrides the automatic detection.
continuous_columns = continuous_columns_settings or continuous_columns
# The exclusion list must be a tuple: with a bare parenthesised string,
# `c not in (...)` is a substring test and would drop any column whose name
# happens to be a substring of 'differentation'.
continuous_columns = [c for c in continuous_columns if c not in ('differentation',)]
continuous_columns

['age_at_surgery', 'height', 'weight', 'BMI', 'SMA', 'SMI']

# 缺失值填充

In [24]:
import os
# From here on `data` refers to the processed clinical table.
data = test_data

# Write the table to data/clinical.csv, creating the folder if it is missing.
os.makedirs('data', exist_ok=True)
data.to_csv('data/clinical.csv', index=False)
data

Unnamed: 0,ID,age_at_surgery,sex,height,weight,BMI,SMA,SMI,sarcopenia,clinical_stage,...,Preoperative_serum_CEA,Surgical_approach,Neoadjuvant_therapy,Adjuvant_chemotherapy,Adjuvant_radiotherapy,pN_stage,pT_stage,lymphovascular_invasion,perineural_invasion,group
0,primary-1.nii.gz,64,1,1.65,80.0,29.385,161.3,59.247,0,0,...,0,0,0,0,0,0,3,0,0,train
1,primary-3.nii.gz,56,1,1.78,80.0,25.249,179.4,56.622,0,1,...,0,0,1,1,0,2,3,0,0,train
2,primary-4.nii.gz,56,1,1.7,61.0,21.107,134.2,46.436,0,0,...,1,0,1,1,0,0,2,0,1,train
3,primary-6.nii.gz,67,1,1.8,76.0,23.457,169.4,52.284,0,0,...,0,0,0,1,1,1,3,0,0,val
4,primary-7.nii.gz,68,1,1.78,85.0,26.827,160.7,50.72,0,0,...,0,0,0,1,1,0,3,1,0,val
5,primary-10.nii.gz,63,1,1.78,74.0,23.356,144.8,45.701,0,0,...,0,0,1,1,0,0,2,0,0,train
6,primary-11.nii.gz,49,0,1.65,57.0,20.937,85.9,31.552,1,1,...,0,0,1,0,0,0,0,0,0,val
7,primary-12.nii.gz,79,0,1.55,40.0,16.649,61.5,25.598,1,0,...,1,0,0,0,0,0,3,0,0,train
8,primary-13.nii.gz,63,1,1.75,82.0,26.776,151.5,49.469,0,0,...,1,0,0,1,0,0,3,0,0,val
9,primary-14.nii.gz,68,1,1.7,71.0,24.567,155.9,53.945,0,0,...,1,0,0,0,0,0,3,0,0,train


### 统计分析

支持两种格式数据，分别对应`pretty`参数的`True`和`False`, 当为`True`时，输出的是表格模式，反之则为dict数据。

```python
def clinic_stats(data: DataFrame, stats_columns: Union[str, List[str]], label_column='label',
                 group_column: str = None, continuous_columns: Union[str, List[str]] = None,
                 pretty: bool = True) -> Union[dict, DataFrame]:
    """

    Args:
        data: Input data.
        stats_columns: Names of the columns to compute statistics for.
        label_column: Binary-classification label column, `label` by default.
        group_column: Column to group the statistics by, e.g. to separate the training, test and validation sets.
        continuous_columns: Which columns are continuous variables; mean and variance are reported for them.
        pretty: bool, whether to pretty-format the result.

    Returns:
        stats DataFrame or json

    """
```

In [25]:
from onekey_algo.custom.components.stats import clinic_stats

# Show every row of the statistics table instead of a truncated preview.
pd.set_option('display.max_rows', None)

# Restrict the comparison to the train and val groups; `group` acts as the
# label column, so the reported p-value compares those two groups.
train_val_rows = data['group'].isin(['train', 'val'])
stats_train_val = clinic_stats(
    data[train_val_rows],
    stats_columns=stats_columns,
    label_column='group',
    group_column=None,
    continuous_columns=continuous_columns,
    pretty=True,
    verbose=False,
)
# Renumber the rows 0..n-1.
stats_train_val = stats_train_val.reset_index(drop=True)
stats_train_val

Unnamed: 0,feature_name,-label=ALL,-label=train,-label=val,pvalue
0,age_at_surgery,61.07±11.17,60.72±11.43,61.89±10.56,0.472
1,height,1.68±0.08,1.67±0.08,1.69±0.07,0.247
2,weight,66.81±10.85,66.37±10.61,67.84±11.36,0.285
3,BMI,23.70±3.27,23.65±3.24,23.83±3.35,0.787
4,SMA,129.10±30.43,127.65±29.92,132.47±31.46,0.187
5,SMI,45.53±8.89,45.21±8.78,46.28±9.13,0.279
6,sex,,,,0.374
7,0,124(32.38),91(33.96),33(28.70),
8,1,259(67.62),177(66.04),82(71.30),
9,sarcopenia,,,,0.435


In [26]:
from onekey_algo.custom.components.stats import clinic_stats

# Show every row of the statistics table instead of a truncated preview.
pd.set_option('display.max_rows', None)

# Statistics over all groups; only the test-group column is taken from this
# table. The index is reset the same way as for `stats_train_val` so that the
# column assignment below lines up row by row.
stats = clinic_stats(data,
                     stats_columns=stats_columns,
                     label_column='group',
                     group_column=None,
                     continuous_columns=continuous_columns,
                     pretty=True, verbose=False).reset_index(drop=True)

# The assignment is index-aligned, so both tables must list the same rows in
# the same order. If they do not (e.g. a category that only occurs in the test
# group adds a row), test values would silently land on the wrong rows.
if not stats['feature_name'].equals(stats_train_val['feature_name']):
    raise ValueError(
        "Row layout of the full statistics table differs from the train/val table; "
        "cannot attach the test column row by row."
    )

stats_train_val['test'] = stats['-label=test']
# utf_8_sig writes a BOM so spreadsheet tools detect the encoding.
stats_train_val.to_csv('data/stats_sep.csv', index=False, encoding='utf_8_sig')
stats_train_val

Unnamed: 0,feature_name,-label=ALL,-label=train,-label=val,pvalue,test
0,age_at_surgery,61.07±11.17,60.72±11.43,61.89±10.56,0.472,63.91±9.28
1,height,1.68±0.08,1.67±0.08,1.69±0.07,0.247,1.67±0.08
2,weight,66.81±10.85,66.37±10.61,67.84±11.36,0.285,65.77±9.59
3,BMI,23.70±3.27,23.65±3.24,23.83±3.35,0.787,23.50±3.10
4,SMA,129.10±30.43,127.65±29.92,132.47±31.46,0.187,129.71±25.60
5,SMI,45.53±8.89,45.21±8.78,46.28±9.13,0.279,46.16±8.18
6,sex,,,,0.374,
7,0,124(32.38),91(33.96),33(28.70),,19(25.33)
8,1,259(67.62),177(66.04),82(71.30),,56(74.67)
9,sarcopenia,,,,0.435,
