## 统计分析

通过指定统计分析字段，得到每个特征的p_value，所有p_value均基于T检验（t-test）计算。支持通过`group`指定不同的分组（例如train、val、test）分别进行统计。

对于两大类不同的特征：

1. 离散特征，统计数量以及占比。
2. 连续特征，统计均值、方差。

In [None]:
import pandas as pd
import numpy as np
from onekey_algo import OnekeyDS as okds
from onekey_algo import get_param_in_cwd
import re
from functools import reduce
from onekey_algo.custom.utils import print_join_info

def map2volume(v):
    """Convert a dimension string such as ``'2x3x4'`` into the product of its parts.

    The text is split on every ``x``, ``X``, ``.``, ``,`` or ``|`` character
    (inside a regex character class both ``|`` and ``.`` are literal), each
    piece is parsed as a float, and the pieces are multiplied together.

    NOTE(review): because ``.`` is a delimiter, ``'1.5x2'`` is read as
    ``1 * 5 * 2``; confirm that decimal sizes never occur in the data.

    Args:
        v: Size description; expected to be a string.

    Returns:
        The product as a float, or ``np.nan`` when ``v`` is not a string
        (e.g. a missing value) or one of the pieces is not numeric.
    """
    try:
        parts = [float(i) for i in re.split('[x|X|.|,]', v)]
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN.
        # ValueError: a piece that cannot be parsed as a number (including '').
        return np.nan
    # re.split always yields at least one piece, so reduce never sees an empty list.
    return reduce(lambda x, y: x * y, parts)

# Name of the label column; it is dropped from the clinical table right below.
task = 'label'
# Point the `clinic_file` parameter at your own clinical data file.
test_data = pd.read_csv(get_param_in_cwd('clinic_file'))
test_data = test_data[[c for c in test_data.columns if c != task]]
# IDs are compared as strings so both tables join on the same dtype.
test_data['ID'] = test_data['ID'].astype(str)

group_info = pd.read_csv('group.csv')
group_info['ID'] = group_info['ID'].astype(str)
print_join_info(test_data, group_info)
# Inner join: only IDs present in both the clinical table and group.csv are kept.
test_data = pd.merge(test_data, group_info, on='ID', how='inner')
# Normalise column names: cut everything from the first '(', '（', '[' or '|' onward,
# join the remaining space-separated words with '_' and strip leading/trailing '_'.
# The pattern is a raw string so that `\[` is not an invalid string escape.
test_data.columns = ['_'.join(re.split(r'[(|（|\[]', c)[0].split(' ')).strip('_') for c in test_data.columns]
# Optional steps, enable when the corresponding columns exist in your data:
# test_data['size'] = test_data['size'].map(lambda v:  map2volume(v))
# test_data = test_data.drop(['genetic_mutation'], axis=1)
test_data

In [None]:
# Replace every hyphen and space in the column names with an underscore.
_separators_to_underscore = str.maketrans({'-': '_', ' ': '_'})
test_data.columns = [name.translate(_separators_to_underscore) for name in test_data.columns]
test_data.columns

In [None]:
# Columns to analyse: everything except the first column and the last two.
# NOTE(review): presumably the first column is `ID` and the last two are the
# columns merged in from group.csv — confirm against your data.
stats_columns = list(test_data.columns)[1:-2]
# A column is treated as continuous when it holds more than 5 distinct values.
# Series.nunique is used instead of np.unique because np.unique has to sort the
# values: it raises TypeError on object columns that mix strings with NaN, and
# on NumPy < 1.21 it counts every NaN as a separate value. dropna=False counts
# all missing values together as a single level.
continuous_columns = [c for c in stats_columns if test_data[c].nunique(dropna=False) > 5]

continuous_columns

# 数据映射

针对所有非数值形式的数据，可以进行类别映射。

In [None]:
from onekey_algo.custom.utils import map2numerical
import json

# Collect the object-dtype (non-numeric) columns among the analysed columns,
# i.e. everything except the first column and the last two.
mapping_columns = []
for col in test_data.columns[1:-2]:
    if test_data[col].dtype == object:
        mapping_columns.append(col)

# Encode those columns numerically and print the mapping that was applied.
data, mapping = map2numerical(test_data, mapping_columns=mapping_columns)
mapping_text = json.dumps(mapping, indent=True, ensure_ascii=False)
print(mapping_text)

# 缺失值填充

In [None]:
from onekey_algo.custom.components.comp1 import fillna
import os

# Fill missing values.
# NOTE(review): fill_mod='50%' presumably selects the median — confirm in `fillna`.
data = fillna(data, fill_mod='50%')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs('features', exist_ok=True)
data.to_csv('features/clinical.csv', index=False, encoding='utf-8-sig')
data

### 输出格式
支持两种输出格式，分别对应`pretty`参数的`True`和`False`：当为`True`时，输出表格（DataFrame）形式；反之则输出dict数据。

```python
def clinic_stats(data: DataFrame, stats_columns: Union[str, List[str]], label_column='label',
                 group_column: str = None, continuous_columns: Union[str, List[str]] = None,
                 pretty: bool = True) -> Union[dict, DataFrame]:
    """Compute per-feature statistics for the selected clinical columns.

    Args:
        data: The data.
        stats_columns: Names of the columns to compute statistics for.
        label_column: Label column for binary classification, defaults to `label`.
        group_column: Column used to split the statistics into groups,
            e.g. to separate the training, test and validation groups.
        continuous_columns: Which columns are continuous variables; mean and
            variance are reported for continuous variables.
        pretty: bool, whether to prettify the result.

    Returns:
        stats DataFrame or json

    """
```

In [None]:
from onekey_algo.custom.components.stats import clinic_stats
import pandas as pd
# Show every row when the table is displayed. The fully qualified option name is
# required: on pandas >= 1.4 the short pattern 'max_rows' also matches
# `styler.render.max_rows`, and set_option raises OptionError for ambiguous patterns.
pd.set_option('display.max_rows', None)
# Per-feature statistics, split by the `group` column, returned as a table (pretty=True).
stats = clinic_stats(data,
                     stats_columns=stats_columns,
                     label_column=task,
                     group_column='group',
                     continuous_columns=continuous_columns,
                     pretty=True, verbose=False)
# utf_8_sig writes a BOM so spreadsheet tools detect the encoding of non-ASCII text.
stats.to_csv('stats.csv', index=False, encoding='utf_8_sig')
stats

In [None]:
def _passes_pvalue_filter(first_pvalue):
    """Return True for a non-empty string or a float below 0.05, else False."""
    if isinstance(first_pvalue, str):
        return first_pvalue != ''
    if isinstance(first_pvalue, float):
        return bool(first_pvalue < 0.05)
    return False


# Only the first entry of each `pvalue` row is inspected.
sel_idx = [_passes_pvalue_filter(pv[0]) for pv in np.array(stats['pvalue'])]
# Export the ID, the selected features, and the group/label columns.
export_columns = ['ID'] + list(stats[sel_idx]['feature_name']) + ['group', 'label']
data[export_columns].to_csv('clinic_sel.csv', index=False)

In [None]:
from onekey_algo.custom.components.comp1 import uni_multi_variable_analysis                        

# Threshold handed to the analysis as `p_value4multi`.
p_value = 0.05
# Run the analysis on the rows of the 'train' group only; results are saved under 'img'.
train_rows = data[data['group'] == 'train']
uni_multi_variable_analysis(
    train_rows,
    stats_columns,
    save_dir='img',
    p_value4multi=p_value,
    hazard_ratios=True,
)

In [None]:
# Load the multivariable regression results and keep the rows with p_value < 0.05.
uni_v = pd.read_csv('img/multivariable_reg.csv')
significant_rows = uni_v['p_value'] < .05
uni_v = uni_v[significant_rows]
# Export the ID, the retained features, and the group/label columns.
selected_columns = ['ID', *uni_v['feature_name'], 'group', 'label']
sel_data = data[selected_columns]
sel_data.to_csv('clinic_sel.csv', index=False)
sel_data

In [None]:
def _is_report_column(name):
    """Keep `feature_name`, `p_value`, and columns containing 'OR' but not 'Log'."""
    return ('OR' in name and 'Log' not in name) or name in ['feature_name', 'p_value']


def _blank_if_missing(value):
    """Show missing values as an empty string in the displayed table."""
    return '' if pd.isna(value) else value


uni = pd.read_csv('img/univariable_reg.csv')
uni = uni[[c for c in uni if _is_report_column(c)]]
multi = pd.read_csv('img/multivariable_reg.csv')
# Filter on multi's own columns (not uni's), so the selection stays correct
# even when the two result files do not share exactly the same columns.
multi = multi[[c for c in multi if _is_report_column(c)]]
# Left join: every univariable row is kept; features without a multivariable
# result get empty cells.
merged = pd.merge(uni, multi, on='feature_name', how='left', suffixes=['_UNI', '_MULTI'])
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when available and fall back on older pandas versions.
merged.map(_blank_if_missing) if hasattr(pd.DataFrame, 'map') else merged.applymap(_blank_if_missing)