In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn import ensemble
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import pandas as pd

Read in discharge summary notes and labels for general readmission from .txt files 

In [None]:
# Load the general-readmission corpus. Each file is split on '\n', giving one
# list entry per line; notes and labels are assumed to be line-aligned.
# NOTE(review): split('\n') yields a trailing '' entry when a file ends with a
# newline -- confirm both files are written without one.
with open('CS598_DATA/x.txt') as notes_file:
    note = notes_file.read().split('\n')

with open('CS598_DATA/y.txt') as labels_file:
    label = labels_file.read().split('\n')

Read in discharge summary notes and labels for 30-day readmission from .txt files 

In [None]:
# Load the 30-day-readmission corpus. Each file is split on '\n', giving one
# list entry per line; notes and labels are assumed to be line-aligned.
# NOTE(review): split('\n') yields a trailing '' entry when a file ends with a
# newline -- confirm both files are written without one.
with open('CS598_DATA/x_30.txt') as notes_file_30:
    note_30 = notes_file_30.read().split('\n')

with open('CS598_DATA/y_30.txt') as labels_file_30:
    label_30 = labels_file_30.read().split('\n')

Remove numbers and special characters from each discharge summary note for general readmission (only purely alphabetic tokens are kept)

In [None]:
# Rewrite every note in place, keeping only whitespace-separated tokens that
# consist entirely of letters. A token containing any digit or punctuation
# (e.g. "pain," or "5mg") is dropped as a whole, not stripped.
for idx, text in enumerate(note):
    note[idx] = ' '.join(token for token in text.split() if token.isalpha())

Remove numbers and special characters from each discharge summary note for 30-day readmission (only purely alphabetic tokens are kept)

In [None]:
# Rewrite every 30-day note in place, keeping only whitespace-separated tokens
# that consist entirely of letters. A token containing any digit or
# punctuation is dropped as a whole, not stripped.
for idx, text in enumerate(note_30):
    note_30[idx] = ' '.join(token for token in text.split() if token.isalpha())

Create Dataframe for general readmission

In [None]:
# Pair each cleaned note with its label in a two-column frame
# (columns: 'note', 'label'). Labels stay as the strings read from y.txt.
data = pd.DataFrame({'note': note, 'label': label})

Create Dataframe for 30-day readmission

In [None]:
# Pair each cleaned 30-day note with its label in a two-column frame
# (columns: 'note', 'label'). Labels stay as the strings read from y_30.txt.
data_30 = pd.DataFrame({'note': note_30, 'label': label_30})

Split train and test data with train_size=0.9 for general readmission

In [None]:
# Hold out 10% of the general-readmission notes for testing.
# random_state is fixed so that Restart & Run All reproduces the same split,
# and therefore the same reported scores; without it every run shuffles
# differently.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data['note'], data['label'], train_size=0.9, random_state=42
)

Split train and test data with train_size=0.9 for 30-day readmission

In [None]:
# Hold out 10% of the 30-day-readmission notes for testing.
# random_state is fixed so that Restart & Run All reproduces the same split,
# and therefore the same reported scores; without it every run shuffles
# differently.
train_x_30, test_x_30, train_y_30, test_y_30 = model_selection.train_test_split(
    data_30['note'], data_30['label'], train_size=0.9, random_state=42
)

Construct features using term frequency-inverse document frequency (TF-IDF) for general readmission

In [None]:
# Learn the vocabulary and IDF weights from the TRAINING notes only, then apply
# the fitted vectorizer to both splits. Fitting on the full corpus
# (data['note'], i.e. train + test) leaks test-set statistics into the
# features and makes the reported scores optimistic.
# token_pattern=r'\w{1,}' also keeps single-character tokens.
# NOTE(review): the vocabulary is now built from 90% of the notes, so it may be
# slightly smaller; the max_features sweep further down uses values up to
# 25000 -- check train_x.shape[1] against that.
TFIDF = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
train_x = TFIDF.fit_transform(train_x)
test_x = TFIDF.transform(test_x)

Construct features using term frequency-inverse document frequency (TF-IDF) for 30-day readmission

In [None]:
# Learn the vocabulary and IDF weights from the TRAINING notes only, then apply
# the fitted vectorizer to both splits. Fitting on the full corpus
# (data_30['note'], i.e. train + test) leaks test-set statistics into the
# features and makes the reported scores optimistic.
# token_pattern=r'\w{1,}' also keeps single-character tokens.
# NOTE(review): the vocabulary is now built from 90% of the notes, so it may be
# slightly smaller; the max_features sweep further down uses values up to
# 20000 -- check train_x_30.shape[1] against that.
TFIDF_30 = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
train_x_30 = TFIDF_30.fit_transform(train_x_30)
test_x_30 = TFIDF_30.transform(test_x_30)

Train a Random Forest classifier and predict on the test data for general readmission

In [None]:
# Baseline Random Forest with default hyperparameters, fitted on the TF-IDF
# training matrix. random_state is fixed because the forest's bootstrap
# sampling and per-split feature sampling are random; unseeded, the scores
# below change on every run.
model = ensemble.RandomForestClassifier(random_state=42).fit(train_x, train_y)
# Predicted labels for the held-out notes.
y_hat = model.predict(test_x)

Train a Random Forest classifier and predict on the test data for 30-day readmission

In [None]:
# Baseline Random Forest with default hyperparameters, fitted on the 30-day
# TF-IDF training matrix. random_state is fixed because the forest's bootstrap
# sampling and per-split feature sampling are random; unseeded, the scores
# below change on every run.
model_30 = ensemble.RandomForestClassifier(random_state=42).fit(train_x_30, train_y_30)
# Predicted labels for the held-out 30-day notes.
y_hat_30 = model_30.predict(test_x_30)

Show model performance on test data for general readmission prediction 

In [None]:
# Precision, recall and F1 are computed with the string label '1' as the
# positive class; accuracy needs no positive class.
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_name} Score: {metric_fn(test_y, y_hat, pos_label='1')}")
print(f"Accuracy Score: {accuracy_score(test_y, y_hat)}")

Show model performance on test data for 30-day readmission prediction

In [None]:
# Precision, recall and F1 are computed with the string label '1' as the
# positive class; accuracy needs no positive class.
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_name} Score: {metric_fn(test_y_30, y_hat_30, pos_label='1')}")
print(f"Accuracy Score: {accuracy_score(test_y_30, y_hat_30)}")

Evaluate the Random Forest with `max_features` (the number of features considered at each split) set from 10000 to 25000 in steps of 5000 for general readmission

In [None]:
# Sweep max_features (number of features considered at each split) over
# 10000, 15000, 20000, 25000 and report test-set metrics for each setting.
# random_state is fixed so that differences between settings come from
# max_features rather than from the forest's random sampling.
# Note: `model` and `y_hat` are rebound on every iteration, so after the loop
# they refer to the last setting, not to the baseline model fitted earlier.
for features in range(10000, 25001, 5000):
    model = ensemble.RandomForestClassifier(
        max_features=features, random_state=42
    ).fit(train_x, train_y)
    y_hat = model.predict(test_x)
    print(features)
    print(f"Precision Score: {precision_score(test_y, y_hat, pos_label='1')}")
    print(f"Recall Score: {recall_score(test_y, y_hat, pos_label='1')}")
    print(f"F1 Score: {f1_score(test_y, y_hat, pos_label='1')}")
    print(f"Accuracy Score: {accuracy_score(test_y, y_hat)}")

Evaluate the Random Forest with `max_features` (the number of features considered at each split) set from 10000 to 20000 in steps of 5000 for 30-day readmission

In [None]:
# Sweep max_features (number of features considered at each split) over
# 10000, 15000, 20000 and report test-set metrics for each setting.
# random_state is fixed so that differences between settings come from
# max_features rather than from the forest's random sampling.
# Note: `model_30` and `y_hat_30` are rebound on every iteration, so after the
# loop they refer to the last setting, not to the baseline model fitted earlier.
for features in range(10000, 20001, 5000):
    model_30 = ensemble.RandomForestClassifier(
        max_features=features, random_state=42
    ).fit(train_x_30, train_y_30)
    y_hat_30 = model_30.predict(test_x_30)
    print(features)
    print(f"Precision Score: {precision_score(test_y_30, y_hat_30, pos_label='1')}")
    print(f"Recall Score: {recall_score(test_y_30, y_hat_30, pos_label='1')}")
    print(f"F1 Score: {f1_score(test_y_30, y_hat_30, pos_label='1')}")
    print(f"Accuracy Score: {accuracy_score(test_y_30, y_hat_30)}")