Loading Data in pandas (no parallel)

In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score

In [2]:
patients = pd.read_csv('../../mimic3/data/PATIENTS.csv')
admissions = pd.read_csv('../../mimic3/data/ADMISSIONS.csv')
# icustays = pd.read_csv('../mimic3/data/ICUSTAYS.csv')
# servs = pd.read_csv('../mimic3/data/SERVICES.csv')
# trans = pd.read_csv('../mimic3/data/TRANSFERS.csv')
# drgCodes = pd.read_csv('../mimic3/data/DRGCODES.csv')
dias = pd.read_csv('../../mimic3/data/DIAGNOSES_ICD.csv')
diag_dict = pd.read_csv('../../mimic3/data/D_ICD_DIAGNOSES.csv')
pres = pd.read_csv('../../mimic3/data/PRESCRIPTIONS.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
features=pd.read_csv('../../mimic3/data/ADMISSIONS.csv',nrows=1)

In [4]:
features.columns.tolist()

['ROW_ID',
 'SUBJECT_ID',
 'HADM_ID',
 'ADMITTIME',
 'DISCHTIME',
 'DEATHTIME',
 'ADMISSION_TYPE',
 'ADMISSION_LOCATION',
 'DISCHARGE_LOCATION',
 'INSURANCE',
 'LANGUAGE',
 'RELIGION',
 'MARITAL_STATUS',
 'ETHNICITY',
 'EDREGTIME',
 'EDOUTTIME',
 'DIAGNOSIS',
 'HOSPITAL_EXPIRE_FLAG',
 'HAS_CHARTEVENTS_DATA']

Get ICD9 codes including "Unspecified".

In [5]:
unspicified_codes = diag_dict[diag_dict.SHORT_TITLE.str.contains('NOS')].ICD9_CODE.tolist()

Get ICD9 codes which happened at least 100 times.

In [6]:
code_values = dias.ICD9_CODE.value_counts().to_frame()
freq_codes = code_values.loc[code_values.ICD9_CODE >= 100].index.tolist()

In [7]:
code_values[code_values.columns.tolist()[0]].tolist()

[20703,
 13111,
 12891,
 12429,
 9119,
 9058,
 8690,
 7497,
 6555,
 6326,
 5930,
 5779,
 5519,
 5406,
 4917,
 4839,
 4552,
 4528,
 4431,
 3912,
 3806,
 3725,
 3680,
 3566,
 3435,
 3431,
 3421,
 3358,
 3278,
 3065,
 3056,
 3055,
 3039,
 2926,
 2811,
 2758,
 2734,
 2725,
 2630,
 2586,
 2550,
 2538,
 2453,
 2380,
 2343,
 2287,
 2272,
 2264,
 2195,
 2169,
 2165,
 2148,
 2121,
 2082,
 2051,
 2016,
 1947,
 1934,
 1926,
 1811,
 1807,
 1709,
 1663,
 1612,
 1584,
 1580,
 1535,
 1511,
 1502,
 1494,
 1490,
 1478,
 1445,
 1444,
 1425,
 1401,
 1397,
 1390,
 1385,
 1384,
 1374,
 1367,
 1361,
 1350,
 1348,
 1327,
 1314,
 1298,
 1289,
 1287,
 1277,
 1272,
 1259,
 1240,
 1224,
 1220,
 1218,
 1217,
 1211,
 1207,
 1202,
 1198,
 1186,
 1171,
 1170,
 1154,
 1143,
 1138,
 1126,
 1109,
 1100,
 1097,
 1093,
 1072,
 1072,
 1068,
 1067,
 1049,
 1043,
 1039,
 1020,
 1020,
 1016,
 1012,
 1007,
 1000,
 997,
 994,
 984,
 978,
 978,
 977,
 977,
 967,
 961,
 954,
 950,
 929,
 924,
 893,
 891,
 881,
 875,
 874,
 865,


Get ICD9 codes which happened at least 100 times and excluding codes with "Unspecified"

In [8]:
freq_codes_ls = list(set(freq_codes) - set(unspicified_codes))

Get Patients' demographic information.

In [13]:
profiles = admissions.drop_duplicates(subset='SUBJECT_ID', keep='last')
profiles = profiles[['SUBJECT_ID','INSURANCE','LANGUAGE','RELIGION','MARITAL_STATUS','ETHNICITY']]
profs = pd.merge(profiles,patients,how='inner',on='SUBJECT_ID').\
            drop(['ROW_ID','DOB','DOD','DOD_HOSP','DOD_SSN','EXPIRE_FLAG'], axis=1)
    
profs = profs.fillna('NoneValue')

In [14]:
profs.head()

Unnamed: 0,SUBJECT_ID,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,GENDER
0,22,Private,NoneValue,UNOBTAINABLE,MARRIED,WHITE,F
1,23,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,M
2,24,Private,NoneValue,PROTESTANT QUAKER,SINGLE,WHITE,M
3,25,Private,NoneValue,UNOBTAINABLE,MARRIED,WHITE,M
4,26,Medicare,NoneValue,CATHOLIC,SINGLE,UNKNOWN/NOT SPECIFIED,M


In [11]:
pid = profs.SUBJECT_ID
profs = profs.drop(['SUBJECT_ID'], axis=1)

One-Hot Data Preprocessing.

In [101]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
profs_2 = profs.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()
enc.fit(profs_2)
profs_encode = enc.transform(profs_2)

In [102]:
processed_data = pd.DataFrame(profs_encode.todense())
profs_df = pd.concat([pid,processed_data],axis=1)

In [105]:
def classification(dis1, dis2,profs_df):
    
    #data preprocessing
    pat1 = dias[dias.ICD9_CODE==dis1].SUBJECT_ID.unique()
    pat2 = dias[dias.ICD9_CODE==dis2].SUBJECT_ID.unique()
    pat1 = list(set(pat1) - set(pat2))
    pat1_df = pd.DataFrame({'SUBJECT_ID':pat1})
    pat2_df = pd.DataFrame({'SUBJECT_ID':pat2})
    
    #get patients' one-hot data
    profs1 = pd.merge(profs_df,pat1_df,how='inner',on='SUBJECT_ID').drop(['SUBJECT_ID'], axis=1)
    profs2 = pd.merge(profs_df,pat2_df,how='inner',on='SUBJECT_ID').drop(['SUBJECT_ID'], axis=1)
    profs = pd.concat([profs1,profs2],axis=0)
    
    #Assign label to each patient
    pat1 = pd.DataFrame({'label':np.ones(profs1.shape[0])})
    pat2 = pd.DataFrame({'label':np.zeros(profs2.shape[0])})
    labels = pd.concat([pat1,pat2],axis=0)
    
    #data scaling
    data = preprocessing.StandardScaler().fit_transform(profs)
    
    #logistic Classifiction
    logreg = linear_model.LogisticRegression(C=1e5)

    X_train, X_test, y_train, y_test = \
            model_selection.train_test_split(data, labels, test_size=.2, random_state=42)

    logreg.fit(X_train, y_train)
    score = logreg.score(X_test, y_test)
   
    y_pred = logreg.predict(X_test)
    f_score = f1_score(y_test,y_pred)
    
#     print ("Disease: {} and {}, patients' number: {} and {}, accuracy:{}, F1 Score:{}".\
#            format(dis1,dis2,len(profs1),len(profs2),format(score,'.4f'),format(f_score,'.4f')))
    return [dis1,dis2,len(profs1),len(profs2),format(score,'.4f'),format(f_score,'.4f')]

In [15]:
def populateData(dis1, dis2,profs_df):
    
    #data preprocessing
    pat1 = dias[dias.ICD9_CODE==dis1].SUBJECT_ID.unique()
    pat2 = dias[dias.ICD9_CODE==dis2].SUBJECT_ID.unique()
    pat1 = list(set(pat1) - set(pat2))
    pat1_df = pd.DataFrame({'SUBJECT_ID':pat1})
    pat2_df = pd.DataFrame({'SUBJECT_ID':pat2})
    
    #get patients' one-hot data
    profs1 = pd.merge(profs_df,pat1_df,how='inner',on='SUBJECT_ID')
    profs2 = pd.merge(profs_df,pat2_df,how='inner',on='SUBJECT_ID')
    profs = pd.concat([profs1,profs2],axis=0)
    
    #Assign label to each patient
    pat1 = pd.DataFrame({'label':np.ones(profs1.shape[0])})
    pat2 = pd.DataFrame({'label':np.zeros(profs2.shape[0])})
    labels = pd.concat([pat1,pat2],axis=0)
    
    profs_label = pd.concat([profs,labels],axis=1)
    profs_label.to_csv('../../mimic3/data/distinctDisease_'+dis1+'_'+dis2+'.csv',index=False) 

In [19]:
distinctPats = pd.read_csv('../../mimic3/data/processed_combo_f1_1.csv')

for index, row in distinctPats.iterrows():
     populateData(row['code1'], row['code2'],profs)

In [106]:
results = []
length = len(freq_codes_ls)
for idx1 in range(length):
    code1 = freq_codes_ls[idx1]
    for idx2 in range(idx1+1,length):
        code2 = freq_codes_ls[idx2]
        results.append(classification(code1, code2,profs_df))
results_df = pd.DataFrame(results, columns=['code1','code2','no1','no2','accuracy','f1_score'])  
results_df.to_csv('../../mimic3/data/processed_combo.csv',index=False) 

  'recall', 'true', average, warn_for)


In [118]:
f1_1_df = results_df[results_df.f1_score=='1.0000']
# f1_1_df.to_csv('../../mimic3/data/processed_combo_f1_1.csv',index=False) 

In [17]:
f1_1_gap50_df = results_df[(results_df.f1_score=='1.0000') & (abs(results_df.no1-results_df.no2) <= 100)]
# f1_1_gap50.to_csv('../../mimic3/data/processed_combo_f1_1_gap50.csv',index=False) 

NameError: name 'results_df' is not defined

In [138]:
f1_1_gap50_df = f1_1_df

In [139]:
f1_1_gap50_desc_df = pd.merge(f1_1_gap50_df,diag_dict,how='left',\
                              left_on='code1',right_on='ICD9_CODE').drop(['ROW_ID','ICD9_CODE'], axis=1)
f1_1_gap50_desc_df = f1_1_gap50_desc_df.rename(columns={"SHORT_TITLE": "SHORT_TITLE_1", "LONG_TITLE": "LONG_TITLE_1"})

f1_1_gap50_desc_df = pd.merge(f1_1_gap50_desc_df,diag_dict,how='left',\
                              left_on='code2',right_on='ICD9_CODE').drop(['ROW_ID','ICD9_CODE'], axis=1)
f1_1_gap50_desc_df = f1_1_gap50_desc_df.rename(columns={"SHORT_TITLE": "SHORT_TITLE_2", "LONG_TITLE": "LONG_TITLE_2"})

In [140]:
f1_1_gap50_df.shape, f1_1_gap50_desc_df.shape

((1434, 6), (1434, 10))

In [142]:
f1_1_gap50_desc_df.to_csv('../../mimic3/data/processed_combo_f1_1.csv',index=False) 

In [24]:
#Assign label to each patient
pat1 = pd.DataFrame({'label':np.ones(500),'Gender':np.ones(500),'Age': np.random.randint(70,100, size=500)})
pat2 = pd.DataFrame({'label':np.zeros(500),'Gender':np.zeros(500),'Age': np.random.randint(20,40, size=500)})
labels = pd.concat([pat1,pat2],axis=0)
labels.to_csv('../../mimic3/data/sythetic_data.csv',index=False) 

In [23]:
 np.random.randint(70,100, size=500)

array([79, 96, 82, 70, 70, 83, 81, 88, 84, 90, 84, 73, 80, 83, 89, 94, 90,
       88, 79, 72, 70, 79, 78, 83, 76, 84, 70, 87, 88, 88, 73, 82, 72, 79,
       99, 70, 90, 88, 98, 87, 85, 99, 81, 71, 80, 86, 70, 81, 82, 71, 99,
       72, 80, 81, 76, 95, 92, 73, 70, 80, 88, 96, 87, 72, 98, 89, 84, 70,
       81, 98, 91, 93, 95, 87, 98, 86, 79, 91, 79, 85, 95, 95, 73, 78, 72,
       91, 99, 77, 76, 70, 74, 86, 71, 83, 84, 80, 90, 93, 76, 92, 91, 77,
       91, 78, 99, 91, 94, 86, 83, 98, 79, 72, 77, 92, 99, 82, 85, 76, 81,
       92, 92, 78, 89, 92, 73, 77, 86, 74, 74, 89, 70, 80, 78, 99, 82, 81,
       93, 70, 98, 75, 73, 98, 80, 83, 91, 88, 98, 97, 73, 71, 71, 98, 93,
       73, 74, 94, 88, 88, 98, 74, 78, 75, 75, 78, 71, 72, 83, 86, 74, 75,
       87, 97, 82, 73, 87, 85, 93, 70, 81, 99, 72, 73, 86, 89, 78, 71, 82,
       76, 93, 81, 74, 90, 71, 89, 95, 78, 76, 87, 95, 93, 91, 85, 84, 79,
       75, 95, 71, 87, 90, 78, 82, 87, 98, 90, 81, 89, 97, 81, 92, 96, 76,
       82, 85, 93, 76, 87