In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict

Read data into pandas DataFrames

In [7]:
# Load only the admission columns this notebook works with.

# Directory that contains the CSV files; change it to match your own setup.
path = "./"
admission = pd.read_csv(
    path + 'ADMISSIONS.csv',
    usecols=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
             'DEATHTIME', 'ADMISSION_TYPE', 'DISCHARGE_LOCATION', 'DIAGNOSIS'],
)
# Parse the admission, discharge and death timestamps. Values that do not match the
# expected format become NaT (errors='coerce') instead of raising.
admission = admission.assign(**{
    time_col: pd.to_datetime(admission[time_col], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')
    for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME')
})

In [8]:
# Order the rows by patient and admission time, renumber them, and then attach to every
# admission the time and type of the same patient's following admission.
# shift(-1) inside each SUBJECT_ID group leaves NaT / NaN on a patient's last admission.
admission = (
    admission
    .sort_values(['SUBJECT_ID', 'ADMITTIME'])
    .reset_index(drop = True)
    .assign(
        NEXT_ADMITTIME=lambda adm: adm.groupby('SUBJECT_ID').ADMITTIME.shift(-1),
        NEXT_ADMISSION_TYPE=lambda adm: adm.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1),
    )
)

In [9]:
# Order rows by patient and admission time; the back fill below depends on the row
# order inside each patient group.
admission = admission.sort_values(['SUBJECT_ID','ADMITTIME'])
# Back fill the "next admission" columns within each patient: a missing value is
# replaced by the next non-missing value further down in the same SUBJECT_ID group.
# groupby(...).bfill() is the direct replacement for the deprecated
# groupby(...).fillna(method='bfill') and produces the same result.
admission[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = admission.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].bfill()
# Days elapsed between this discharge and the next admission (NaN when there is none).
admission['DAYS_NEXT_ADMIT']=  (admission.NEXT_ADMITTIME - admission.DISCHTIME).dt.total_seconds()/(24*60*60)
# Admissions followed by another admission within 30 days.
# Count noted by the original author: 3390 (not re-verified here).
records = admission[admission.DAYS_NEXT_ADMIT <= 30]

In [10]:
# Load the notes table and keep only the discharge summaries.
# NOTE(review): the stray text printed under this cell looks like the tail of a pandas
# warning raised while reading the CSV - confirm, and consider passing explicit dtypes.
notes = pd.read_csv(path + "NOTEEVENTS.csv")
discharge_sum = notes[notes.CATEGORY.eq('Discharge summary')]
# One summary per (patient, admission): the last row of each group, in the row order
# of the notes table.
notes_dis_sum_last = discharge_sum.groupby(['SUBJECT_ID','HADM_ID']).nth(-1).reset_index()

# Left join onto the admissions, then drop NEWBORN admissions (many of them have no
# discharge summary). Other admissions without a summary are NOT dropped: their TEXT
# stays NaN at this point.
dt_table = (
    admission
    .merge(notes_dis_sum_last[['SUBJECT_ID','HADM_ID','TEXT']], how = 'left', on = ['SUBJECT_ID','HADM_ID'])
    .loc[lambda merged: merged.ADMISSION_TYPE != 'NEWBORN']
)

# Binary label: 1 when the next admission starts within 30 days of discharge, else 0
# (a missing DAYS_NEXT_ADMIT compares False and therefore yields 0).
dt_table['LABEL'] = dt_table.DAYS_NEXT_ADMIT.le(30).astype('int')

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Readmission-type target for a three-class problem:
# [No Readmission, Emergency, Elective] = [0, 1, 2]
# labelDict tallies how many times each class has been returned.
labelDict = defaultdict(int)
def label_readmission_type(row):
    """Return the readmission-type class of one row and count it in labelDict.

    row: mapping-like object with a 'NEXT_ADMISSION_TYPE' entry.
    Returns 1 for 'EMERGENCY', 2 for 'ELECTIVE' and 0 for anything else
    (including a missing value).
    Side effect: increments labelDict['class <n>'] for the returned class.
    """
    tally_keys = ('class 0', 'class 1', 'class 2')
    next_type = row['NEXT_ADMISSION_TYPE']
    label = 1 if next_type == 'EMERGENCY' else (2 if next_type == 'ELECTIVE' else 0)
    labelDict[tally_keys[label]] += 1
    return label
# Label every row; the function is applied row-wise (axis=1) and is passed directly,
# no wrapper needed.
dt_table['LABELTYPE'] = dt_table.apply(label_readmission_type, axis=1)

Train Test Split

In [12]:
# Shuffle all rows (frac=1 draws every row once) and renumber them, then carve out
# 80% for training and split the remaining 20% evenly into validation and test.
dt_table_shuffled = dt_table.sample(frac=1, random_state=42).reset_index(drop=True)
dt_train = dt_table_shuffled.sample(frac=0.80, random_state=42)
dt_val_test = dt_table_shuffled.drop(index=dt_train.index)
dt_val = dt_val_test.sample(frac=0.50, random_state=42)
dt_test = dt_val_test.drop(index=dt_val.index)

# Balance the training set: keep every positive row and draw an equally sized random
# sample of negative rows.
posRow = dt_train.LABEL.eq(1)
dt_train_pos = dt_train[posRow]
dt_train_neg = dt_train[~posRow]
dt_train_sub = pd.concat([dt_train_pos, dt_train_neg.sample(n=len(dt_train_pos), random_state=42)])
# Shuffle again so positives and negatives are interleaved, and renumber the rows.
dt_train_sub = dt_train_sub.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
# Data preprocessing:
# Fill missing notes with a single space (line breaks inside the notes are left as-is)
# Tokenize free-text
import string
import nltk
from nltk import word_tokenize
def my_tokenizer(text):
    """Lower-case `text`, blank out punctuation and split it into word tokens.

    Every character of string.punctuation is replaced by a space before the text is
    handed to word_tokenize. Digits are kept.
    Returns whatever word_tokenize returns for the cleaned text.
    """
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    return word_tokenize(text.lower().translate(punctuation_to_space))

def preprocess_text(df):
    """Replace missing note text with a single space.

    Only missing values in the TEXT column are touched; existing text, including any
    new-line or carriage-return characters, is left exactly as it is.

    df: DataFrame with a TEXT column. It is modified in place.
    Returns the same DataFrame object that was passed in.
    """
    text_without_gaps = df.TEXT.fillna(' ')
    df.TEXT = text_without_gaps
    return df

# Fill the missing notes in all three splits; the test-set shape is printed before and
# after to show that the step changes neither the row nor the column count.
print(dt_test.shape)
dt_train_sub, dt_val, dt_test = map(preprocess_text, (dt_train_sub, dt_val, dt_test))
print(dt_test.shape)

(5111, 15)
(5111, 15)


In [24]:
# Demo subset: the first 200 rows of the test split.
dt_demo = dt_test.head(200)
print(dt_demo.shape)

(200, 15)


## Predicting readmission

In [15]:
# convert tokens into word vectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
# NLTK's English stop words. `stop_words` itself stays a set so any other code that
# relies on it is unaffected.
stop_words = set(stopwords.words('english'))
# scikit-learn documents `stop_words` as 'english', a list, or None, and releases that
# validate constructor parameters reject a set. A sorted list carries the same words
# and is accepted by every version.
# Bag-of-words counts over 1- to 3-grams, limited to 3000 features.
rawCountVec = CountVectorizer(max_features=3000, tokenizer=my_tokenizer, ngram_range=(1, 3), stop_words=sorted(stop_words))
# TF-IDF over the same n-gram range, also limited to 3000 features; terms seen in fewer
# than 3 documents or in more than 90% of documents are ignored.
tfidfVec = TfidfVectorizer(max_features=3000, ngram_range=(1, 3), tokenizer=my_tokenizer, min_df=3, max_df=0.9, 
                           strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words=sorted(stop_words))
# Fit each vocabulary on the sub-sampled training notes only, then reuse it unchanged
# for the validation and test notes.
rawX_train = rawCountVec.fit_transform(dt_train_sub.TEXT.values)
rawX_val = rawCountVec.transform(dt_val.TEXT.values)
rawX_test = rawCountVec.transform(dt_test.TEXT.values)
# `tfidftX_train` (sic): the spelling is kept because a later cell refers to this name.
tfidftX_train = tfidfVec.fit_transform(dt_train_sub.TEXT.values)
tfidfX_val = tfidfVec.transform(dt_val.TEXT.values)
tfidfX_test = tfidfVec.transform(dt_test.TEXT.values)

In [35]:
# Shape of the TF-IDF matrix fitted on the sub-sampled training notes
# (one row per note; tfidfVec caps the number of columns at 3000).
tfidftX_train.shape

(5076, 3000)

Predict Readmission Using SVM

In [22]:
# Load the two SVM models from pickle files in the current working directory.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# pickles that come from a trusted source.
# Model 1: SVM with BoW
with open('svm_model.pickle', 'rb') as handle:
    svm_bow = pickle.load(handle)
# Model 2: SVM with TFIDF
with open('svm_model_tfidf.pickle', 'rb') as handle:
    svm_tfidf= pickle.load(handle)

In [30]:
# Extract the TF-IDF features that are fed to the TF-IDF SVM.
# Same settings as the 3000-feature TF-IDF vectorizer except for max_features.
# NOTE(review): 380249 presumably matches the feature count the pickled model was
# trained with - confirm before changing it.
# `stop_words` is passed as a sorted list because scikit-learn documents the parameter
# as 'english', a list, or None, and releases that validate parameters reject a set.
tf_feat = TfidfVectorizer(max_features=380249,ngram_range=(1, 3), tokenizer=my_tokenizer, min_df=3, max_df=0.9, 
                           strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words=sorted(stop_words))
# Fit on the sub-sampled training notes; validation and test reuse that vocabulary.
tf_train = tf_feat.fit_transform(dt_train_sub.TEXT.values)
tf_valid = tf_feat.transform(dt_val.TEXT.values)
tf_test = tf_feat.transform(dt_test.TEXT.values)

In [31]:
# Shape of the test-set matrix produced by tf_feat (one row per test note).
tf_test.shape

(5111, 380249)

In [40]:
# get some metrics for the first 200 samples 
from sklearn.metrics import accuracy_score, f1_score
# Score both SVMs on the demo subset.
# Labels come from the demo frame; features are the first 200 rows of the test
# matrices, which line up with the first 200 test rows used for dt_demo.
y_demo = dt_demo.LABEL.values
x_demo_bow, x_demo_tfidf = rawX_test[:200], tf_test[:200]

# SVM on bag-of-words counts: predict, score, then report.
y_pred = svm_bow.predict(x_demo_bow)
svm_bow_accuracy = accuracy_score(y_demo, y_pred)
macroF1, microF1 = (f1_score(y_demo, y_pred, average=averaging) for averaging in ('macro', 'micro'))
print("Accuracy for SVM using BOW Feature", svm_bow_accuracy)
print("Macro F1 for SVM using BOW Feature", macroF1)
print("Micro F1 for SVM using BOW Feature", microF1)

# SVM on TF-IDF features: y_pred, macroF1 and microF1 are overwritten with this
# model's values.
y_pred = svm_tfidf.predict(x_demo_tfidf)
svm_tfidf_accuracy = accuracy_score(y_demo, y_pred)
print("Accuracy for SVM using TFIDF Feature", svm_tfidf_accuracy)
macroF1, microF1 = (f1_score(y_demo, y_pred, average=averaging) for averaging in ('macro', 'micro'))
print("Macro F1 for SVM using TFIDF Feature", macroF1)
print("Micro F1 for SVM using TFIDF Feature", microF1)

Accuracy for SVM using BOW Feature 0.335
Macro F1 for SVM using BOW Feature 0.2847731978166761
Micro F1 for SVM using BOW Feature 0.335
Accuracy for SVM using TFIDF Feature 0.94
Macro F1 for SVM using TFIDF Feature 0.4845360824742268
Micro F1 for SVM using TFIDF Feature 0.94


## Predicting readmission type

In [None]:
# # train a baseline Logistic Regression:
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(C=0.0002, class_weight='balanced', solver='lbfgs', penalty='l2', random_state=0, tol=1e-6)
# # using raw count vectors
# model.fit(rawX_train, y_trainMul)

In [36]:
# Load the multiclass logistic-regression model from its pickle file.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load
# pickles that come from a trusted source.
with open("LogisticRegression_Multiclass.pkl", "rb") as handle:
    lr_multi = pickle.load(handle)
print("Pickle loaded!!")

Pickle loaded!!


In [38]:
from sklearn.metrics import accuracy_score, f1_score
# Score the multiclass logistic regression on the demo subset.
# Labels are the three-class readmission type; only the bag-of-words features
# (first 200 rows of the test matrix) are evaluated for this model.
y_demo = dt_demo.LABELTYPE.values
x_demo_bow = rawX_test[:200]

y_pred = lr_multi.predict(x_demo_bow)
lr_multi_accuracy = accuracy_score(y_demo, y_pred)
print("Accuracy for Logistic Regression using BOW Feature", lr_multi_accuracy)

macroF1, microF1 = (f1_score(y_demo, y_pred, average=averaging) for averaging in ('macro', 'micro'))
print("Macro F1 score: %02f, Micro F1 score: %02f" % (macroF1, microF1))

Accuracy for Logistic Regression using BOW Feature 0.64
Macro F1 score: 0.338005, Micro F1 score: 0.640000


In [41]:
# Three-class readmission-type labels for the sub-sampled training split and for the
# validation split.
y_trainMul, y_valMul = (split.LABELTYPE.values for split in (dt_train_sub, dt_val))
# Class distribution of the training labels: {class label: number of rows}.
unique, counts = np.unique(y_trainMul, return_counts=True)
uniqueLabelsDict = {label: count for label, count in zip(unique, counts)}
print(uniqueLabelsDict)

{0: 2104, 1: 2785, 2: 187}
