In [1]:
import sys
import cPickle as pickle
from datetime import datetime
import numpy as np
import random
import math

In [2]:
# icd-9编码处理

# 还原icd，保留最后两位小数点
def convert_to_icd9(dxStr):
    if dxStr.startswith('E'):
        if len(dxStr) > 4:
            return dxStr[:4] + '.' + dxStr[4:]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3] + '.' + dxStr[3:]
        else:
            return dxStr
        
# 只取前三位数据，不保留小数点
def convert_to_3digit_icd9(dxStr):
    if dxStr.startswith('E'):
        if len(dxStr) > 4:
            return dxStr[:4]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3]
        else:
            return dxStr

In [3]:
dataFile = 'MIMICIII'
outFile = 'SRA'

In [5]:
# pidAdmMap映射pid到admid，admDateMap映射admid到admtime
print 'Building pid-admission mapping, pid-death mapping, admission-date mapping'
all_death_num=0
pidAdmMap = {}
admDateMap = {}
pidDeathMap= {}
infd = open(dataFile+'/ADMISSIONS.csv', 'r')
infd.readline()
for line in infd:
    tokens = line.strip().split(',')
    pid = int(tokens[1])
    admId = int(tokens[2])
    admTime = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')
    admDateMap[admId] = admTime
    if pid in pidAdmMap:
        pidAdmMap[pid].append(admId)
    else:
        pidAdmMap[pid] = [admId]
    if tokens[5] != '':
        pidDeathMap[pid]=1
        all_death_num+=1
    else:
        if pid not in pidDeathMap:
            pidDeathMap[pid]=0      
infd.close()

Building pid-admission mapping, pid-death mapping, admission-date mapping


In [7]:
# pidGenBirthMap人的基本信息（性别、年龄、婚姻、人种、宗教），先使用性别和年龄
print 'Building patient information'
pidGenBirthMap={}
infd = open(dataFile+'/PATIENTS.csv', 'r')
infd.readline();
for line in infd:
    tokens = line.strip().split(',')
    pid = int(tokens[1])
    if tokens[2]=='F':gender = 0
    else: gender = 1
    birth = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')
    pidGenBirthMap[pid] = [gender,birth]
infd.close()

Building patient information


In [10]:
# 创建admission-dxList mapping，admDxMap映射admid到在这个住院中的诊断的icd9码
print 'Building admission-dxList mapping'
admDxMap = {}
admDxMap_3digit = {}
dx_icd={} #icd和dx的映射
dx_icd2={} #前面是dx码，后面是icd码
infd = open(dataFile+'/DIAGNOSES_ICD.csv', 'r')
infd.readline()
raw_all_codes_num=0
for line in infd:
    raw_all_codes_num+=1
    tokens = line.strip().split(',')
    admId = int(tokens[2])
    dxStr = 'D_' + convert_to_icd9(tokens[4])
    dxStr_3digit = 'D_' + convert_to_3digit_icd9(tokens[4])
    
    if admId in admDxMap:
        admDxMap[admId].append(dxStr)
    else:
        admDxMap[admId] = [dxStr]

    if admId in admDxMap_3digit:
        admDxMap_3digit[admId].append(dxStr_3digit)
    else:
        admDxMap_3digit[admId] = [dxStr_3digit]
        
    if dxStr in dx_icd:
        continue
    else:
        dx_icd[dxStr]=tokens[4]
        dx_icd2[tokens[4]]=dxStr
infd.close()

Building admission-dxList mapping


In [23]:
# sortedList按照时间储存了[时间，icd9诊断]对，pidSeqMap存入每个病人的sortedList
print 'Building pid-sortedVisits mapping'
pidSeqMap = {}
pidSeqMap_3digit = {}
for pid, admIdList in pidAdmMap.iteritems():
    sortedList = sorted([(admDateMap[admId], admDxMap[admId]) for admId in admIdList])
    pidSeqMap[pid] = sortedList

    sortedList_3digit = sorted([(admDateMap[admId], admDxMap_3digit[admId]) for admId in admIdList])
    pidSeqMap_3digit[pid] = sortedList_3digit

Building pid-sortedVisits mapping


In [52]:
# 心力衰竭的病人
HF_patient={}
HF_patient_code=[]#含有HF的病人编号
for i in pidSeqMap:
    for j in range(len(pidSeqMap[i])):
        code=pidSeqMap[i][j][1]
        for dx in code:
            if (dx == 'D_"42.80"') or (dx == 'D_"42.81"')or(dx == 'D_"42.82"')or(dx == 'D_"42.83"')or(dx == 'D_"42.84"')or (dx == 'D_"42.89"')or(dx == 'D_"42.820"')or(dx == 'D_"42.821"')or(dx == 'D_"42.822"')or(dx == 'D_"42.823"')or(dx == 'D_"42.830"')or(dx == 'D_"42.831"')or(dx == 'D_"42.832"')or(dx == 'D_"42.833"')or(dx == 'D_"42.840"')or(dx == 'D_"42.841"')or(dx == 'D_"42.842"')or(dx == 'D_"42.843"'):
                if i in HF_patient:
                    continue
                else:
                    HF_patient[i]=pidSeqMap[i]
                    HF_patient_code.append(i)
                    

In [46]:
# 心力衰竭的记录
HF_record={}
record_code=[] #含有HF的住院编号
lastad_record_code=[]
n=0
for i in pidSeqMap:
    for j in range(len(pidSeqMap[i])):
        code=pidSeqMap[i][j][1]
        if j==len(pidSeqMap[i])-1:
            lastad_record_code.append(i)
        for dx in code:
            if (dx == 'D_"42.80"') or (dx == 'D_"42.81"')or(dx == 'D_"42.82"')or(dx == 'D_"42.83"')or(dx == 'D_"42.84"')or (dx == 'D_"42.89"')or(dx == 'D_"42.820"')or(dx == 'D_"42.821"')or(dx == 'D_"42.822"')or(dx == 'D_"42.823"')or(dx == 'D_"42.830"')or(dx == 'D_"42.831"')or(dx == 'D_"42.832"')or(dx == 'D_"42.833"')or(dx == 'D_"42.840"')or(dx == 'D_"42.841"')or(dx == 'D_"42.842"')or(dx == 'D_"42.843"'):
                HF_record[n]=pidSeqMap[i][j]
                n=n+1
                record_code.append(pidAdmMap[i][j])
                break

In [47]:
#HF患者总数
n

13608

In [32]:
# 死亡的HF患者总数
death_code_num=0
for i in range(len(lastad_record_code)):
    if pidDeathMap[lastad_record_code[i]]==1:
        death_code_num+=1

In [33]:
death_code_num

5813

In [66]:
# pidAdmMap映射pid到admid，admDateMap映射admid到admtime
print 'Building admission-BP mapping'
admSBPMap = {}
admDBPMap= {}
infd = open(dataFile+'/CHARTEVENTS.csv', 'r')
infd.readline()
for line in infd:
    tokens = line.strip().split(',')
    admId = int(tokens[2])
    if (admId in record_code):
        if (int(tokens[4])==455):
            if admId in admSBPMap:
                admSBPMap[admId].append(int(tokens[8]))
            else:
                admSBPMap[admId] = [int(tokens[8])]  
        if (int(tokens[4])==8441):
            if admId in admSBPMap:
                admDBPMap[admId].append(int(tokens[8]))
            else:
                admDBPMap[admId] = [int(tokens[8])]  
                 
infd.close()

Building admission-BP mapping


KeyboardInterrupt: 

d_itm/d_labitem

Creatine Kinase: CK (CPK) 225634   /50910
Creatinine: Creatinine 220615/1525   /50912
Lactate Dehydrogenase: LDH 817/220632    /50954
Asparate Aminotransferase: AST 770/220587
CK_MB index: CK-MB 227445  /50908
Alanine Aminotransferase(ALT): urine for AA 6225   /50861
Serum Glutamic-Pyruvic Transaminase : SGPT 3802
Serum Glutamic-Oxaloacetic Transaminase: SGOT 3801
Hematocrit: Hematocrit 813     /51221
Hemoglobin: Hemoglobin 814   /51222
Platelet Count: Platelet Count 6256    /51265
White Blood Cells: WBC 1542/220546    /51301
Red Blood Cells: RBC 833    /51279
mean corpuscular hemoglobin (MCH): 51248
Magnesium: Magnesium 1532/40645/44088/220635   /50960
Potassium: Potassium 1535/41956/44711/42728   /50971
Calcium: Calcium 44441/43747/44855/1522   /50893
Glucose: Glucose 1529    /50931
Chloride: Chloride 1523   /50902
Bicarbonate: Bicarbonate 46362   /50882
Sodium: Sodium 1536   /50983
free calcium: Ionized calcium 816/1350/8177/8325/225667   /50808
Daily Weight: Daily Weight: 163/224639
Heart Rate: Heart Rate 211/220045
Arterial pH: Arterial pH 780
Arterial PaCO2: Arterial PaCO2 778
Arterial PaO2: Arterial PaO2 779