In [22]:
import pandas as pd
import math
import numpy as np
from gensim.models import Word2Vec

In [2]:
# Read the regression dataset from an Excel workbook (the path is relative to
# the notebook's working directory) and display its (rows, columns) shape.
path = './data/data_regression.xlsx'
df = pd.read_excel(path)
df.shape

(3629, 19)

In [6]:
# Keep only the structured (tabular) columns. The explicit copy gives an
# independent frame, so the column assignments that follow never touch `df`.
df_strud = df.filter(
    items=['性别', '年龄', '体重', '身高', '病区', '监测ID'],
).copy()

In [7]:
# BMI = weight / height**2, with height divided by 100 first
# (assumes '体重' is in kg and '身高' in cm — TODO confirm units).
df_strud['BMI'] = df_strud['体重'].div(
    df_strud['身高'].div(100).pow(2)
)

In [8]:
# Preview the first rows to sanity-check the selected columns and the new BMI column.
df_strud.head()

Unnamed: 0,性别,年龄,体重,身高,病区,监测ID,BMI
0,女,36,66.0,162.0,妇产科病区,252525,25.148605
1,女,51,58.0,160.0,心血管病区,252556,22.65625
2,女,66,47.0,144.5,心血管病区,252599,22.509309
3,男,47,102.0,171.0,骨科病区,252538,34.882528
4,男,46,55.0,173.0,骨科病区,252616,18.376825


In [9]:
# One-hot encode the '性别' column. The dtype is pinned because pandas >= 2.0
# returns bool dummies by default; mixed with the int/float columns they would
# make DataFrame.values (used by get_np) an object array. 'uint8' reproduces
# the 0/1 integers produced by older pandas versions.
dummies_sex = pd.get_dummies(df_strud['性别'], prefix='sex', dtype='uint8')
dummies_sex.head()

Unnamed: 0,sex_女,sex_男
0,1,0
1,1,0
2,1,0
3,0,1
4,0,1


In [10]:
# One-hot encode the '病区' column. The dtype is pinned because pandas >= 2.0
# returns bool dummies by default; mixed with the int/float columns they would
# make DataFrame.values (used by get_np) an object array. 'uint8' reproduces
# the 0/1 integers produced by older pandas versions.
dummies_department = pd.get_dummies(df_strud['病区'], prefix='department', dtype='uint8')
dummies_department.head()

Unnamed: 0,department_口腔科病区,department_妇产科病区,department_心血管病区,department_整形外科病区,department_普通外科病区,department_普通胸外科病区,department_泌尿外科中心病区,department_眼科病区,department_神经内科病区,department_神经外科病区,department_耳鼻咽喉科病区,department_肝胆外科病区,department_骨科病区
0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,1


In [13]:
# Merge all the frames: print each row count first as a visual check that the
# three frames line up, join them column-wise, then drop the raw columns that
# the one-hot dummies and the BMI column replace.
print(len(df_strud))
print(len(dummies_sex))
print(len(dummies_department))
df_strud_final = pd.concat(
    [df_strud, dummies_sex, dummies_department],
    axis=1,
).drop(columns=['性别', '病区', '身高', '体重'])

3629
3629
3629


In [14]:
# Preview the merged structured-feature frame (raw categorical/size columns removed).
df_strud_final.head()

Unnamed: 0,年龄,监测ID,BMI,sex_女,sex_男,department_口腔科病区,department_妇产科病区,department_心血管病区,department_整形外科病区,department_普通外科病区,department_普通胸外科病区,department_泌尿外科中心病区,department_眼科病区,department_神经内科病区,department_神经外科病区,department_耳鼻咽喉科病区,department_肝胆外科病区,department_骨科病区
0,36,252525,25.148605,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,51,252556,22.65625,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,66,252599,22.509309,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,47,252538,34.882528,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,46,252616,18.376825,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1


In [23]:
# Load three previously trained gensim Word2Vec models from disk.
# NOTE(review): the CHI / IG / MI suffixes presumably name the feature-selection
# method used to build each vocabulary (chi-square, information gain, mutual
# information) — confirm against the training notebook.
wv_model_CHI = Word2Vec.load('./word2vec_model/wv_model_CHI.model')
wv_model_IG = Word2Vec.load('./word2vec_model/wv_model_IG.model')
wv_model_MI = Word2Vec.load('./word2vec_model/wv_model_MI.model')

In [30]:
# Load the precomputed TF-IDF weights. allow_pickle=True plus .item() unwraps a
# single pickled Python object stored in the .npy file; computed_text_vector
# indexes it as tf_idf[手术ID][word].
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
tf_idf = np.load('./tf_idf/tf_idf.npy',allow_pickle=True).item()

In [112]:
def concat_strud_unstrud_data(df,feature_method):
    """Attach the text-feature columns to the structured features.

    Selects '手术ID', '手术时间' and the two word-list columns for
    ``feature_method`` from ``df``, joins them column-wise (aligned on the
    index) with the notebook-level ``df_strud_final`` frame, and removes the
    rows in which both word-list columns are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding '手术ID', '手术时间',
        '实施手术_<feature_method>' and '术前诊断_<feature_method>'.
    feature_method : str
        Suffix selecting the pair of text columns (the notebook uses 'MI').

    Returns
    -------
    pandas.DataFrame
        Copy of the joined frame restricted to rows with at least one text
        column present. Missing values in the two text columns are replaced
        by '' so they can be concatenated as strings; missing values in every
        other column are left untouched.
    """
    text_columns = ['实施手术_'+feature_method,'术前诊断_'+feature_method]
    df_unstrud = df.filter(['手术ID','手术时间'] + text_columns)
    df_tmp = pd.concat([df_strud_final,df_unstrud],axis=1)
    print('样本个数：{}'.format(df_tmp.shape[0]))
    # The printed label below says "after de-duplication", but the step is a
    # missing-value filter: a row is dropped only when BOTH text columns are NaN.
    df_dup = df_tmp.dropna(how='all',subset=text_columns).copy()
    print('去重后的样本个数：{}'.format(df_dup.shape[0]))
    # Fill only the text columns. Filling the whole frame with '' would turn any
    # numeric column containing NaN into an object column of mixed str/float.
    df_dup[text_columns] = df_dup[text_columns].fillna('')
    return df_dup

# Compute one TF-IDF-weighted text vector per row; returns a numpy array.
def computed_text_vector(df_data,wv_model,feature_method,tf_idf):
    """Build a TF-IDF-weighted sum of word vectors for every row.

    For each row, the two '|'-separated word lists
    ('实施手术_<feature_method>' and '术前诊断_<feature_method>') are joined,
    and every non-empty word contributes
    ``tf_idf[手术ID][word] * wv_model.wv[word]`` to that row's vector.
    Repeated words are added once per occurrence.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Must contain '手术ID' and the two text columns as strings
        (use '' rather than NaN for missing text).
    wv_model : object
        Model exposing ``wv[word]`` lookups and ``wv.vector_size``
        (a gensim Word2Vec model in this notebook).
    feature_method : str
        Suffix selecting the pair of text columns.
    tf_idf : mapping
        Nested mapping ``tf_idf[operation_id][word] -> weight``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df_data), wv_model.wv.vector_size)``; always
        2-D, including for a single row or an empty frame.

    Raises
    ------
    KeyError
        If a word is missing from ``tf_idf[operation_id]`` (or, with gensim,
        from the model vocabulary).
    """
    keyed_vectors = wv_model.wv
    # Take the dimension from the model instead of assuming 50.
    dim = keyed_vectors.vector_size
    texts = df_data['实施手术_'+feature_method] +'|'+ df_data['术前诊断_'+feature_method]
    # Preallocate the result: avoids re-copying the array on every row and
    # guarantees an (n, dim) shape even when n is 0 or 1.
    ans = np.zeros((len(df_data), dim))
    for row,(item,operation_id) in enumerate(zip(texts,df_data['手术ID'])):
        for w in item.split('|'):
            if w != '':
                ans[row] += tf_idf[operation_id][w] * keyed_vectors[w]
    return ans

def get_np(df_data,text_vector,feature_method):
    """Assemble the final 2-D array: structured features, text vector, target.

    The id columns, the target column and the two raw text columns are removed
    from ``df_data``; what remains is placed first, followed by the columns of
    ``text_vector`` and finally '手术时间' as the last column. The shape of
    each part is printed along the way.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame containing '监测ID', '手术ID', '手术时间' and the two text
        columns for ``feature_method`` in addition to the structured features.
    text_vector : numpy.ndarray
        2-D array with one row per row of ``df_data``.
    feature_method : str
        Suffix selecting the pair of text columns to discard.

    Returns
    -------
    numpy.ndarray
        Column-wise concatenation ``[structured | text_vector | target]``.
    """
    excluded = [
        '监测ID',
        '手术ID',
        '手术时间',
        '实施手术_'+feature_method,
        '术前诊断_'+feature_method,
    ]
    structured = df_data.drop(columns=excluded).values
    print('np_strud：{}'.format(structured.shape))
    print('text_vector：{}'.format(text_vector.shape))
    # Turn the target into a single column so it can be appended on axis 1.
    target = df_data['手术时间'].values.reshape(-1, 1)
    print('np_Y：{}'.format(target.shape))
    return np.concatenate([structured, text_vector, target], axis=1)

In [105]:
# Build the modelling frame for the 'MI' text columns (prints the row count
# before and after rows lacking both text columns are removed).
df_data = concat_strud_unstrud_data(df,'MI')

样本个数：3629
去重后的样本个数：3245


In [106]:
# Preview three rows to check that the text columns were joined onto the structured features.
df_data.head(3)

Unnamed: 0,年龄,监测ID,BMI,sex_女,sex_男,department_口腔科病区,department_妇产科病区,department_心血管病区,department_整形外科病区,department_普通外科病区,...,department_眼科病区,department_神经内科病区,department_神经外科病区,department_耳鼻咽喉科病区,department_肝胆外科病区,department_骨科病区,手术ID,手术时间,实施手术_MI,术前诊断_MI
0,36,252525,25.148605,1,0,0,1,0,0,0,...,0,0,0,0,0,0,408485,1.808333,二次|剖宫产术,囊肿
1,51,252556,22.65625,1,0,0,0,1,0,0,...,0,0,0,0,0,0,408486,4.758333,房颤|消融|改良|迷宫术|人工|二尖瓣|置换术,风心病
2,66,252599,22.509309,1,0,0,0,1,0,0,...,0,0,0,0,0,0,408487,6.65,二尖瓣|置换术,心脏|瓣膜|瓣膜|功能障碍


In [107]:
# Compute one TF-IDF-weighted word-vector sum per row using the MI Word2Vec model.
text_vector_MI = computed_text_vector(df_data,wv_model_MI,'MI',tf_idf)

In [108]:
# Inspect the text vector of the first row.
text_vector_MI[0]

array([ 2.23795784, -3.31284903, -2.28265134,  0.78481862, -2.65634191,
       -0.94138849,  3.10738292, -3.2883004 , -0.79307869, -3.56433117,
        0.03160913,  0.67739832,  4.16606188,  3.8992421 ,  0.95108226,
       -4.24030519, -1.82862255,  2.07878932, -2.84403688,  1.35127848,
        2.20980537, -1.01530156, -2.50619477, -1.24228951,  2.83974278,
       -2.49912632,  7.80600643, -3.25398493, -3.42055041,  0.46046546,
       -2.30263114,  3.05323136, -2.647764  , -2.01686535, -2.33434984,
       -1.70201486,  1.16336592, -0.41866767, -0.76265144, -2.83597738,
       -1.83541676, -0.92956935,  0.77379749, -2.0570204 ,  3.76637882,
       -1.49134322, -1.82822407,  0.03545758,  3.02204458, -1.61178719])

In [113]:
# Assemble the final array: structured features, then the text vector, then
# the '手术时间' target as the last column.
np_MI = get_np(df_data,text_vector_MI,'MI')

np_strud：(3245, 17)
text_vector：(3245, 50)
np_Y：(3245, 1)


In [114]:
# Shape check: column count = structured features + text-vector size + 1 target column.
np_MI.shape

(3245, 68)

In [115]:
# Inspect the first assembled row.
np_MI[0]

array([ 3.60000000e+01,  2.51486054e+01,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.23795784e+00, -3.31284903e+00, -2.28265134e+00,
        7.84818619e-01, -2.65634191e+00, -9.41388488e-01,  3.10738292e+00,
       -3.28830040e+00, -7.93078691e-01, -3.56433117e+00,  3.16091329e-02,
        6.77398324e-01,  4.16606188e+00,  3.89924210e+00,  9.51082259e-01,
       -4.24030519e+00, -1.82862255e+00,  2.07878932e+00, -2.84403688e+00,
        1.35127848e+00,  2.20980537e+00, -1.01530156e+00, -2.50619477e+00,
       -1.24228951e+00,  2.83974278e+00, -2.49912632e+00,  7.80600643e+00,
       -3.25398493e+00, -3.42055041e+00,  4.60465461e-01, -2.30263114e+00,
        3.05323136e+00, -2.64776400e+00, -2.01686535e+00, -2.33434984e+00,
       -1.70201486e+00,  

In [116]:
# Persist the assembled array for the modelling step. np.save does not create
# missing directories, so ./np_regression must already exist.
np.save('./np_regression/np_MI.npy',np_MI)