In [1]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

In [2]:
# Locations of the two Excel inputs: the text/demographic table and the
# time-series feature table.
path_text = './data/data_regression.xlsx'
path_series = './data/data_time_series_feature_select_pro.xlsx'
df_text, df_series = pd.read_excel(path_text), pd.read_excel(path_series)
# Show the (rows, columns) of each frame, one per line.
print(df_text.shape, df_series.shape, sep='\n')

(3629, 19)
(3583, 28)


In [3]:
# Load the saved word2vec models, one per feature-selection method (CHI, IG, MI).
wv_model_CHI, wv_model_IG, wv_model_MI = (
    Word2Vec.load(model_path)
    for model_path in (
        './word2vec_model/wv_model_CHI.model',
        './word2vec_model/wv_model_IG.model',
        './word2vec_model/wv_model_MI.model',
    )
)

In [4]:
# Load the saved TF-IDF weights. The .npy file wraps a single Python object,
# which .item() unwraps; generate_np indexes it as TF_IDF[operation_id][word].
# NOTE(review): allow_pickle=True unpickles the file, which can execute
# arbitrary code - only load this file from a trusted source.
_tf_idf_archive = np.load('./tf_idf/tf_idf.npy', allow_pickle=True)
TF_IDF = _tf_idf_archive.item()

In [5]:
# 病区编码
coder = {
    '口腔科病区':1,
    '妇产科病区':2,
    '心血管病区':3,
    '整形外科病区':4,
    '普通外科病区':5,
    '普通胸外科病区':6,
    '泌尿外科中心病区':7,
    '眼科病区':8,
    '神经内科病区':9,
    '神经外科病区':10,
    '耳鼻咽喉科病区':11,
    '肝胆外科病区':12,
    '骨科病区':13
}

def code_department(s, coder):
    """Return the code stored for ward name ``s`` in the mapping ``coder``.

    Raises:
        KeyError: if ``s`` is not a key of ``coder``.
    """
    ward_code = coder[s]
    return ward_code

# Prepare the text DataFrame for one feature-selection method.
def handle_text(df_text,feature_method):
    """Select and transform the text-side columns for ``feature_method``.

    Steps, in order:
      * keep 手术ID, 性别, 年龄, 体重, 身高, 病区 and the two text columns
        ``实施手术_<method>`` / ``术前诊断_<method>``;
      * drop rows where both text columns are missing;
      * join the two text columns with ``'|'`` into ``文本_<method>``;
      * add ``BMI`` = 体重 / (身高 / 100) ** 2 (the division by 100 presumably
        converts centimetres to metres - confirm against the data);
      * one-hot encode 性别 into ``sex_*`` columns;
      * replace 病区 by its integer code from the module-level ``coder``;
      * drop the raw text, weight, height and sex columns.

    Args:
        df_text: source DataFrame; it is not modified.
        feature_method: suffix of the text columns to use (the notebook
            passes 'CHI', 'IG' and 'MI').

    Returns:
        A new DataFrame with columns 手术ID, 年龄, 病区, ``文本_<method>``,
        BMI and the ``sex_*`` dummies.

    Raises:
        KeyError: if a 病区 value is not a key of ``coder``.
    """
    operation_col = '实施手术_' + feature_method
    diagnosis_col = '术前诊断_' + feature_method
    text_cols = [operation_col, diagnosis_col]

    selected = df_text.filter(['手术ID', '性别', '年龄', '体重', '身高', '病区'] + text_cols)
    print('输入的文本df:{}'.format(selected.shape))

    # Drop rows that have no text at all; copy so the caller's frame is untouched.
    df = selected.dropna(how='all', subset=text_cols).copy()
    # Fill ONLY the text columns. Filling every column with '' would turn a
    # missing age/weight/height into an empty string and break the numeric
    # work below; missing numerics are left as NaN instead.
    df[text_cols] = df[text_cols].fillna('')

    # Derived columns.
    df['文本_' + feature_method] = df[operation_col] + '|' + df[diagnosis_col]
    df['BMI'] = df['体重'] / (df['身高'] / 100) ** 2
    # Pin the dummy dtype: newer pandas defaults to bool, which would make the
    # downstream feature matrix an object array when mixed with floats.
    dummies_sex = pd.get_dummies(df['性别'], prefix='sex', dtype='uint8')
    df = pd.concat([df, dummies_sex], axis=1)
    df['病区'] = df['病区'].apply(code_department, args=(coder,))

    # Drop the raw columns that are no longer needed.
    df = df.drop(text_cols + ['体重', '身高', '性别'], axis=1)
    print('删除后的文本df:{}'.format(df.shape))
    return df

# Pick the time-series feature columns, interpolated or not.
def handle_series(df_series,isInterpolation):
    """Return a copy of ``df_series`` limited to 手术ID and four feature columns.

    Args:
        df_series: DataFrame holding the time-series feature columns.
        isInterpolation: when truthy, take the ``*_inter_feature`` columns;
            otherwise take the plain ``*_feature`` columns.

    Returns:
        A new DataFrame with 手术ID followed by the NBPDIA, NBPSYS, PR and
        SPO2 feature columns of the chosen variant.
    """
    if isInterpolation:
        # Interpolated variant of the features.
        feature_cols = ['NBPDIA_inter_feature', 'NBPSYS_inter_feature',
                        'PR_inter_feature', 'SPO2_inter_feature']
    else:
        feature_cols = ['NBPDIA_feature', 'NBPSYS_feature',
                        'PR_feature', 'SPO2_feature']
    wanted = ['手术ID'] + feature_cols
    selected = df_series.filter(wanted).copy()
    print('处理后的时间序列df:{}'.format(selected.shape))
    return selected


In [7]:

    



# Build the numpy feature matrix.
def generate_np(handled_text,handled_series):
    """Merge the prepared text and series frames into one feature matrix.

    The two frames are inner-joined on 手术ID. The returned array has one row
    per merged record and these column groups, left to right:

      1. structured features 年龄, BMI, sex_男, sex_女 (those present);
      2. text embedding: the sum over the words of ``文本_<method>`` (split on
         ``'|'``, empty pieces skipped) of
         ``TF_IDF[operation_id][word] * word_vector``;
      3. time-series features: the four series columns joined with spaces
         and parsed as floats;
      4. the 病区 code as the last column.

    The word2vec model is chosen from the module-level ``wv_model_CHI`` /
    ``wv_model_IG`` / ``wv_model_MI`` according to the ``文本_<method>`` column.

    Args:
        handled_text: output of ``handle_text``.
        handled_series: output of ``handle_series``.

    Returns:
        2-D ``numpy.ndarray`` with the column groups described above.

    Raises:
        ValueError: if no ``文本_<method>`` column exists or the method is
            not one of CHI / IG / MI.
        KeyError: if a word or operation id is missing from ``TF_IDF`` or
            from the word2vec vocabulary.
    """
    print('handled_text:{}'.format(handled_text.shape))
    print('handled_series:{}'.format(handled_series.shape))
    df = pd.merge(handled_text, handled_series, on='手术ID', how='inner')
    print('合并后:{}'.format(df.shape))

    # --- Structured features ---
    # Cast to float so bool/uint8 dummy columns cannot yield an object array.
    np_strud = df.filter(['年龄', 'BMI', 'sex_男', 'sex_女']).values.astype(float)
    print('np_strud:{}'.format(np_strud.shape))

    # --- Text features ---
    # Find the text column by name instead of relying on its position.
    text_col = next((c for c in df.columns if str(c).startswith('文本_')), None)
    if text_col is None:
        raise ValueError("no '文本_<method>' column found after merging")
    feature_method = text_col.split('_', 1)[1]
    if feature_method == 'CHI':
        wv_model = wv_model_CHI
    elif feature_method == 'IG':
        wv_model = wv_model_IG
    elif feature_method == 'MI':
        wv_model = wv_model_MI
    else:
        raise ValueError('unsupported feature method: {}'.format(feature_method))
    # gensim >= 4 no longer supports model[word]; vectors live on model.wv.
    # Fall back to the object itself if it has no .wv attribute.
    vectors = getattr(wv_model, 'wv', wv_model)
    dim = vectors.vector_size

    text_rows = []
    for text, operation_id in zip(df[text_col], df['手术ID']):
        doc_vec = np.zeros(dim)
        for w in text.split('|'):
            if w != '':
                doc_vec = doc_vec + TF_IDF[operation_id][w] * vectors[w]
        text_rows.append(doc_vec)
    # Stack once at the end: always 2-D (even for a single row) and linear time.
    np_text = np.vstack(text_rows) if text_rows else np.empty((0, dim))
    print('np_text:{}'.format(np_text.shape))

    # --- Time-series features ---
    if 'NBPDIA_inter_feature' in df.columns:
        series_cols = ['NBPDIA_inter_feature', 'NBPSYS_inter_feature',
                       'PR_inter_feature', 'SPO2_inter_feature']
    else:
        series_cols = ['NBPDIA_feature', 'NBPSYS_feature',
                       'PR_feature', 'SPO2_feature']
    # Each cell is a space-separated string of numbers; join the four cells
    # of a row into one string and parse every token as a float.
    joined = df[series_cols[0]]
    for col in series_cols[1:]:
        joined = joined + ' ' + df[col]
    series_val = [[float(c) for c in s.split(' ')] for s in joined]
    np_series = np.array(series_val) if series_val else np.empty((0, 0))
    print('np_series:{}'.format(np_series.shape))

    # --- Ward code (last column) ---
    np_y = df['病区'].values.reshape(-1, 1)
    print('np_y:{}'.format(np_y.shape))

    # Concatenate all groups column-wise.
    return np.hstack((np_strud, np_text, np_series, np_y))

In [8]:
# Build and save one feature matrix per (feature-selection method,
# interpolation flag) combination: 3 methods x 2 flags = 6 files.
feature_methods = ['CHI', 'IG', 'MI']
for feature_method in feature_methods:
    # The text frame depends only on the method, so prepare it once per method.
    handled_text = handle_text(df_text, feature_method)
    for is_inter in (True, False):
        handled_series = handle_series(df_series, is_inter)
        np_data = generate_np(handled_text, handled_series)
        # The flag is rendered as 'True' / 'False' in the file name.
        out_path = './np_cluster_pro/np_{}_{}.npy'.format(feature_method, str(is_inter))
        np.save(out_path, np_data)

输入的文本df:(3629, 8)
删除后的文本df:(3629, 7)
处理后的时间序列df:(3583, 5)
handled_text:(3629, 7)
handled_series:(3583, 5)
合并后:(3583, 11)
np_strud:(3583, 4)




np_text:(3583, 50)
np_series:(3583, 40)
np_y:(3583, 1)
处理后的时间序列df:(3583, 5)
handled_text:(3629, 7)
handled_series:(3583, 5)
合并后:(3583, 11)
np_strud:(3583, 4)
np_text:(3583, 50)
np_series:(3583, 40)
np_y:(3583, 1)
输入的文本df:(3629, 8)
删除后的文本df:(3629, 7)
处理后的时间序列df:(3583, 5)
handled_text:(3629, 7)
handled_series:(3583, 5)
合并后:(3583, 11)
np_strud:(3583, 4)
np_text:(3583, 50)
np_series:(3583, 40)
np_y:(3583, 1)
处理后的时间序列df:(3583, 5)
handled_text:(3629, 7)
handled_series:(3583, 5)
合并后:(3583, 11)
np_strud:(3583, 4)
np_text:(3583, 50)
np_series:(3583, 40)
np_y:(3583, 1)
输入的文本df:(3629, 8)
删除后的文本df:(3245, 7)
处理后的时间序列df:(3583, 5)
handled_text:(3245, 7)
handled_series:(3583, 5)
合并后:(3205, 11)
np_strud:(3205, 4)
np_text:(3205, 50)
np_series:(3205, 40)
np_y:(3205, 1)
处理后的时间序列df:(3583, 5)
handled_text:(3245, 7)
handled_series:(3583, 5)
合并后:(3205, 11)
np_strud:(3205, 4)
np_text:(3205, 50)
np_series:(3205, 40)
np_y:(3205, 1)


In [9]:
# Reload one saved matrix (CHI method, is_inter=False) to inspect its column groups.
np_t = np.load('./np_cluster_pro/np_CHI_False.npy')

In [10]:
# Columns 0-3: the structured block that generate_np stacks first (np_strud, 4 columns).
np_t[:,0:4] 

array([[36.        , 25.1486054 ,  0.        ,  1.        ],
       [51.        , 22.65625   ,  0.        ,  1.        ],
       [66.        , 22.50930904,  0.        ,  1.        ],
       ...,
       [34.        , 24.00548697,  0.        ,  1.        ],
       [29.        , 25.5588462 ,  0.        ,  1.        ],
       [29.        , 30.078125  ,  0.        ,  1.        ]])

In [11]:
# Columns 4-53: the text block (np_text, printed above as 50 columns).
np_t[:,4:54]

array([[ -3.91540924,   6.39076814, -10.60575433, ...,  -4.85250763,
         28.59868014,  -5.80400626],
       [ 20.98289454,  -6.42012958, -17.16561925, ...,   8.91219554,
          5.16832945,  -2.20582636],
       [ 32.52614024, -14.33920735, -15.76525956, ...,  17.23539725,
         20.7365731 ,   2.14595723],
       ...,
       [  3.81526968,  -8.26959085, -11.96560616, ...,  16.00285518,
         -5.36332175, -25.96547931],
       [ -7.22212793,   3.02416834, -10.03417998, ...,  -8.85705045,
         24.24621312, -15.96307817],
       [ -1.86003169,  11.37191236, -13.89927968, ..., -10.20666629,
         29.40385485,  -9.20128256]])

In [12]:
# Columns 54-93: the time-series block (np_series, printed above as 40 columns).
np_t[:,54:94]

array([[ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 3.,  4.,  3., ...,  3.,  4.,  4.],
       [ 0.,  0.,  2., ...,  0.,  0.,  2.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [20., 14., 17., ...,  1.,  0.,  4.],
       [24.,  8., 11., ..., 14., 11., 26.]])