## 统计分析

通过指定需要统计分析的字段，得到每个特征的p_value；所有p_value均基于T检验（t-test）计算。支持通过`group`指定分组（例如train、val、test），并按分组分别统计。

针对两大类不同的特征，统计内容如下：

1. 离散特征，统计数量以及占比。
2. 连续特征，统计均值、方差。

In [1]:
import pandas as pd
import numpy as np
from onekey_algo import OnekeyDS as okds
from onekey_algo import get_param_in_cwd
from onekey_algo.custom.utils import print_join_info

# Name of the label column; falls back to 'label' when the configured
# 'task_column' lookup returns a falsy value.
task = get_param_in_cwd('task_column') or 'label'
# Presumably the significance threshold; 0.05 is passed as the fallback value.
p_value = get_param_in_cwd('p_value', 0.05)
# Point 'clinic_file' at your own clinical data file. IDs are read as strings.
test_data = pd.read_csv(get_param_in_cwd('clinic_file'), dtype={'ID': str})
# NOTE(review): both names are reassigned in the In [7] cell further down, so
# the configured values read here are superseded before they reach clinic_stats.
stats_columns = get_param_in_cwd('stats_columns')
continuous_columns = get_param_in_cwd('continuous_columns')
# Remove the label column from the clinical table before joining.
test_data = test_data[[c for c in test_data.columns if c not in  [task]]]
# Load the label/group table with string IDs, keeping one row per ID.
group_info = pd.read_csv(get_param_in_cwd('label_file'), dtype={'ID': str}).drop_duplicates('ID')
print_join_info(test_data, group_info)
# Inner join: only IDs present in both tables are kept.
test_data = pd.merge(test_data, group_info, on='ID', how='inner')
test_data



Unnamed: 0,ID,age,gender,T,N,M,Lauren classification,Location,OS,OSTime,group
0,TCGA-BR-8368,84,0,1,0,0,0,0,0,131,test
1,TCGA-D7-8575,75,1,2,2,0,0,0,1,554,train
2,TCGA-VQ-A925,66,1,2,2,0,0,1,1,138,train
3,TCGA-RD-A7BT,66,1,2,3,0,0,0,1,262,train
4,TCGA-CD-A4MG,76,1,2,0,0,1,0,1,200,train
...,...,...,...,...,...,...,...,...,...,...,...
160,TCGA-RD-A8N6,78,0,1,2,0,0,1,1,272,train
161,TCGA-HU-A4GN,61,1,1,1,0,0,1,0,912,train
162,TCGA-BR-8484,61,1,3,1,0,0,0,1,766,test
163,TCGA-RD-A8NB,80,0,2,1,0,1,0,1,513,train


In [2]:
# Number of samples in each value of the `group` column.
test_data['group'].value_counts()

train    115
test      50
Name: group, dtype: int64

In [3]:
def map_cnames(x):
    """Return column name ``x`` with awkward characters normalised.

    Hyphens, spaces and slashes each become an underscore; ``>`` is removed.
    """
    # A single translation table stands in for the chain of one-character
    # substitutions; none of the replacements feeds into another.
    substitutions = str.maketrans('- /', '___', '>')
    return x.translate(substitutions)

# Apply map_cnames to every column label, then echo the resulting index.
test_data.columns = [map_cnames(label) for label in test_data.columns]
test_data.columns

Index(['ID', 'age', 'gender', 'T', 'N', 'M', 'Lauren_classification',
       'Location', 'OS', 'OSTime', 'group'],
      dtype='object')

In [4]:
# Among the columns between the first one and the last two, collect those whose
# dtype is `object`.
mapping_columns = [
    name
    for name, dtype in test_data.iloc[:, 1:-2].dtypes.items()
    if dtype == object
]
mapping_columns

[]

# 数据映射

针对所有非数值形式的数据，可以进行类别映射。

In [5]:
from onekey_algo.custom.utils import map2numerical

# Run the project helper over the columns listed in `mapping_columns`.
# NOTE(review): `mapping` is presumably the category -> number lookup that was
# applied; confirm against onekey_algo.custom.utils.map2numerical.
data, mapping = map2numerical(test_data, mapping_columns=mapping_columns)
mapping

{}

In [7]:
# Analyse every column except the identifier, label, grouping and survival fields.
stats_columns = [c for c in test_data.columns if c not in ['ID', 'label', 'group', 'OS', 'OSTime']]
# A column is treated as continuous when it has more than 6 distinct values or
# is not integer-typed; the remaining columns are handled as discrete features.
#
# The integer test uses pd.api.types.is_integer_dtype rather than the ordering
# comparison `np.int8 <= dtype <= np.int64`: that comparison follows NumPy's
# safe-cast rules, so it classes unsigned integer columns as non-integer and
# raises TypeError for pandas extension dtypes. Series.nunique(dropna=False)
# replaces np.unique so that columns mixing strings with missing values can be
# counted without sorting them.
continuous_columns = [
    c
    for c in stats_columns
    if test_data[c].nunique(dropna=False) > 6
    or not pd.api.types.is_integer_dtype(test_data[c])
]

continuous_columns

['age']

# 缺失值填充

In [8]:
import os
from onekey_algo.custom.components.comp1 import fillna
# Make sure the output directory exists; no error if it is already there.
os.makedirs('data', exist_ok=True)
# Fill missing values with the project helper.
# NOTE(review): fill_mod='50%' presumably selects the per-column median — confirm
# against onekey_algo.custom.components.comp1.fillna.
data = fillna(data, fill_mod='50%')
# Save the filled table without writing the pandas index as a column.
data.to_csv('data/clinical.csv', index=False)
data

Unnamed: 0,ID,age,gender,T,N,M,Lauren_classification,Location,OS,OSTime,group
0,TCGA-BR-8368,84,0,1,0,0,0,0,0,131,test
1,TCGA-D7-8575,75,1,2,2,0,0,0,1,554,train
2,TCGA-VQ-A925,66,1,2,2,0,0,1,1,138,train
3,TCGA-RD-A7BT,66,1,2,3,0,0,0,1,262,train
4,TCGA-CD-A4MG,76,1,2,0,0,1,0,1,200,train
...,...,...,...,...,...,...,...,...,...,...,...
160,TCGA-RD-A8N6,78,0,1,2,0,0,1,1,272,train
161,TCGA-HU-A4GN,61,1,1,1,0,0,1,0,912,train
162,TCGA-BR-8484,61,1,3,1,0,0,0,1,766,test
163,TCGA-RD-A8NB,80,0,2,1,0,1,0,1,513,train


### 输出格式
支持两种输出格式，由`pretty`参数控制：为`True`时输出表格（DataFrame），为`False`时输出dict数据。

```python
def clinic_stats(data: DataFrame, stats_columns: Union[str, List[str]], label_column='label',
                 group_column: str = None, continuous_columns: Union[str, List[str]] = None,
                 pretty: bool = True) -> Union[dict, DataFrame]:
    """Compute per-feature summary statistics for the selected columns.

    NOTE(review): the call in this notebook also passes ``verbose=False``,
    which this signature excerpt does not list — confirm against the installed
    onekey_algo version.

    Args:
        data: The input data.
        stats_columns: Name(s) of the columns to summarise.
        label_column: Binary-classification label column; defaults to `label`.
        group_column: Column to group the statistics by, e.g. to separate the
            training, test and validation groups.
        continuous_columns: Which columns are continuous variables; mean and
            variance are reported for continuous variables.
        pretty: bool, whether to prettify the result.

    Returns:
        stats DataFrame or json

    """
```

In [9]:
from onekey_algo.custom.components.stats import clinic_stats

# Lift the row limit so the full statistics table is displayed.
pd.set_option('display.max_rows', None)
# Summarise the selected columns, using `group` as the label column.
stats = clinic_stats(
    data,
    stats_columns=stats_columns,
    label_column='group',
    group_column=None,
    continuous_columns=continuous_columns,
    pretty=True,
    verbose=False,
)
# Write the table to disk (UTF-8 with BOM) and render it in the notebook.
stats.to_csv('stats.csv', index=False, encoding='utf_8_sig')
display(stats)

Unnamed: 0,feature_name,-label=ALL,-label=test,-label=train,pvalue
0,age,66.75±9.87,68.36±10.16,66.04±9.70,0.078
0,gender,,,,1.0
1,0,64(38.79),19(38.00),45(39.13),
2,1,101(61.21),31(62.00),70(60.87),
3,T,,,,0.138
4,0,5(3.03),3(6.00),2(1.74),
5,1,33(20.00),14(28.00),19(16.52),
6,2,85(51.52),22(44.00),63(54.78),
7,3,42(25.45),11(22.00),31(26.96),
8,N,,,,0.653
