Loading Data in pandas (no parallel)

In [94]:
import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn import linear_model
from sklearn.metrics import f1_score

In [95]:
patients = pd.read_csv('../../mimic3/data/PATIENTS.csv')
admissions = pd.read_csv('../../mimic3/data/ADMISSIONS.csv')
# icustays = pd.read_csv('../mimic3/data/ICUSTAYS.csv')
# servs = pd.read_csv('../mimic3/data/SERVICES.csv')
# trans = pd.read_csv('../mimic3/data/TRANSFERS.csv')
# drgCodes = pd.read_csv('../mimic3/data/DRGCODES.csv')
dias = pd.read_csv('../../mimic3/data/DIAGNOSES_ICD.csv')
diag_dict = pd.read_csv('../../mimic3/data/D_ICD_DIAGNOSES.csv')
pres = pd.read_csv('../../mimic3/data/PRESCRIPTIONS.csv')

Get ICD9 codes including "Unspecified".

In [96]:
unspicified_codes = diag_dict[diag_dict.SHORT_TITLE.str.contains('NOS')].ICD9_CODE.tolist()

Get ICD9 codes which happened at least 100 times.

In [97]:
code_values = dias.ICD9_CODE.value_counts().to_frame()
freq_codes = code_values.loc[code_values.ICD9_CODE >= 100].index.tolist()

Get ICD9 codes which happened at least 100 times and excluding codes with "Unspecified"

In [98]:
freq_codes_ls = list(set(freq_codes) - set(unspicified_codes))

Get Patients' demographic information.

In [99]:
profiles = admissions.drop_duplicates(subset='SUBJECT_ID', keep='last')
profiles = profiles[['SUBJECT_ID','INSURANCE','LANGUAGE','RELIGION','MARITAL_STATUS','ETHNICITY']]
profs = pd.merge(profiles,patients,how='inner',on='SUBJECT_ID').\
            drop(['ROW_ID','DOB','DOD','DOD_HOSP','DOD_SSN','EXPIRE_FLAG'], axis=1)
    
profs = profs.fillna('NoneValue')

In [100]:
pid = profs.SUBJECT_ID
profs = profs.drop(['SUBJECT_ID'], axis=1)

One-Hot Data Preprocessing.

In [101]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
profs_2 = profs.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()
enc.fit(profs_2)
profs_encode = enc.transform(profs_2)

In [102]:
processed_data = pd.DataFrame(profs_encode.todense())
profs_df = pd.concat([pid,processed_data],axis=1)

In [105]:
def classification(dis1, dis2,profs_df):
    
    #data preprocessing
    pat1 = dias[dias.ICD9_CODE==dis1].SUBJECT_ID.unique()
    pat2 = dias[dias.ICD9_CODE==dis2].SUBJECT_ID.unique()
    pat1 = list(set(pat1) - set(pat2))
    pat1_df = pd.DataFrame({'SUBJECT_ID':pat1})
    pat2_df = pd.DataFrame({'SUBJECT_ID':pat2})
    
    #get patients' one-hot data
    profs1 = pd.merge(profs_df,pat1_df,how='inner',on='SUBJECT_ID').drop(['SUBJECT_ID'], axis=1)
    profs2 = pd.merge(profs_df,pat2_df,how='inner',on='SUBJECT_ID').drop(['SUBJECT_ID'], axis=1)
    profs = pd.concat([profs1,profs2],axis=0)
    
    #Assign label to each patient
    pat1 = pd.DataFrame({'label':np.ones(profs1.shape[0])})
    pat2 = pd.DataFrame({'label':np.zeros(profs2.shape[0])})
    labels = pd.concat([pat1,pat2],axis=0)
    
    #data scaling
    data = preprocessing.StandardScaler().fit_transform(profs)
    
    #logistic Classifiction
    logreg = linear_model.LogisticRegression(C=1e5)

    X_train, X_test, y_train, y_test = \
            model_selection.train_test_split(data, labels, test_size=.2, random_state=42)

    logreg.fit(X_train, y_train)
    score = logreg.score(X_test, y_test)
   
    y_pred = logreg.predict(X_test)
    f_score = f1_score(y_test,y_pred)
    
#     print ("Disease: {} and {}, patients' number: {} and {}, accuracy:{}, F1 Score:{}".\
#            format(dis1,dis2,len(profs1),len(profs2),format(score,'.4f'),format(f_score,'.4f')))
    return [dis1,dis2,len(profs1),len(profs2),format(score,'.4f'),format(f_score,'.4f')]

In [106]:
results = []
length = len(freq_codes_ls)
for idx1 in range(length):
    code1 = freq_codes_ls[idx1]
    for idx2 in range(idx1+1,length):
        code2 = freq_codes_ls[idx2]
        results.append(classification(code1, code2,profs_df))
results_df = pd.DataFrame(results, columns=['code1','code2','no1','no2','accuracy','f1_score'])  
results_df.to_csv('../../mimic3/data/processed_combo.csv',index=False) 

  'recall', 'true', average, warn_for)


In [118]:
f1_1_df = results_df[results_df.f1_score=='1.0000']
# f1_1_df.to_csv('../../mimic3/data/processed_combo_f1_1.csv',index=False) 

In [133]:
f1_1_gap50_df = results_df[(results_df.f1_score=='1.0000') & (abs(results_df.no1-results_df.no2) <= 100)]
# f1_1_gap50.to_csv('../../mimic3/data/processed_combo_f1_1_gap50.csv',index=False) 

In [138]:
f1_1_gap50_df = f1_1_df

In [139]:
f1_1_gap50_desc_df = pd.merge(f1_1_gap50_df,diag_dict,how='left',\
                              left_on='code1',right_on='ICD9_CODE').drop(['ROW_ID','ICD9_CODE'], axis=1)
f1_1_gap50_desc_df = f1_1_gap50_desc_df.rename(columns={"SHORT_TITLE": "SHORT_TITLE_1", "LONG_TITLE": "LONG_TITLE_1"})

f1_1_gap50_desc_df = pd.merge(f1_1_gap50_desc_df,diag_dict,how='left',\
                              left_on='code2',right_on='ICD9_CODE').drop(['ROW_ID','ICD9_CODE'], axis=1)
f1_1_gap50_desc_df = f1_1_gap50_desc_df.rename(columns={"SHORT_TITLE": "SHORT_TITLE_2", "LONG_TITLE": "LONG_TITLE_2"})

In [140]:
f1_1_gap50_df.shape, f1_1_gap50_desc_df.shape

((1434, 6), (1434, 10))

In [142]:
f1_1_gap50_desc_df.to_csv('../../mimic3/data/processed_combo_f1_1.csv',index=False) 