In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('clean_admission.csv')

In [3]:
# creating training and testing datasets

# shuffle the samples
df = df.sample(n = len(df), random_state = 24)
df = df.reset_index(drop = True)

# Save 20% of the data as test data 
df_test=df.sample(frac=0.20,random_state=24)

# use the rest of the data as training data
df_train=df.drop(df_test.index)

In [4]:
df_train.OUTPUT_LABEL.value_counts()

0    38476
1     2414
Name: OUTPUT_LABEL, dtype: int64

In [5]:
df_test.OUTPUT_LABEL.value_counts()

0    9633
1     590
Name: OUTPUT_LABEL, dtype: int64

In [6]:
# This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')

def preprocess_text(df):
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT =df.TEXT.str.replace('\n',' ')
    df.TEXT =df.TEXT.str.replace('\r',' ')
    return df

# preprocess the text to deal with known issues

df_train = preprocess_text(df_train)
df_test = preprocess_text(df_test)

In [9]:
df = preprocess_text(df)

In [29]:
# building a tokenizer

import nltk
from nltk import word_tokenize
import string

# tokenize the text by replacing punctuation and numbers with spaces and lowercase all words

def tokenizer(text):
    
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

# building a vectorizer

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer = tokenizer)

# vectorizer on clinical data

my_stop_words = ['the','and','to','of','was','with','a','on','in','for','name',
                 'is','patient','s','he','at','as','or','one','she','his','her','am',
                 'were','you','pt','pm','by','be','had','your','this','date',
                'from','there','an','that','p','are','have','has','h','but','o',
                'namepattern','which','every','also']

vect = CountVectorizer(max_features = 3000, 
                       tokenizer = tokenizer, 
                       stop_words = my_stop_words)

# this could take a while

vect.fit(df.TEXT.values)



In [30]:
# Transform notes into the vector format

X_train_tf = vect.transform(df_train.TEXT.values)
X_test_tf = vect.transform(df_test.TEXT.values)

In [31]:
# Get labels

y_train = df_train.OUTPUT_LABEL
y_test = df_test.OUTPUT_LABEL

### Logistic Regression:

In [14]:
# logistic regression

from sklearn.linear_model import LogisticRegression

log_reg_clf=LogisticRegression(C = 0.0001, solver = 'newton-cg', penalty = 'l2', random_state = 24, class_weight='balanced', max_iter = 1000)

log_reg_clf.fit(X_train_tf, y_train)

In [15]:
y_train_preds = log_reg_clf.predict_proba(X_train_tf)[:,1]
y_test_preds = log_reg_clf.predict_proba(X_test_tf)[:,1]

In [16]:
y_train_preds

array([0.37690956, 0.31588395, 0.49416569, ..., 0.38653701, 0.39207667,
       0.47027307])

In [17]:
y_test_preds

array([0.26754029, 0.2403216 , 0.63783476, ..., 0.63022949, 0.48842231,
       0.48697858])

In [18]:
y_train_preds2 = log_reg_clf.predict(X_train_tf)

from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_preds2))

              precision    recall  f1-score   support

           0       0.97      0.77      0.86     38476
           1       0.16      0.67      0.25      2414

    accuracy                           0.77     40890
   macro avg       0.56      0.72      0.56     40890
weighted avg       0.93      0.77      0.83     40890



In [20]:
y_test_preds3 = log_reg_clf.predict(X_test_tf)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_preds3))

              precision    recall  f1-score   support

           0       0.97      0.76      0.85      9633
           1       0.13      0.56      0.21       590

    accuracy                           0.75     10223
   macro avg       0.55      0.66      0.53     10223
weighted avg       0.92      0.75      0.81     10223



### Decision Tree:

In [32]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(criterion='gini', random_state=1)
tree_model.fit(X_train_tf, y_train)

In [33]:
y_train_preds_dt = tree_model.predict_proba(X_train_tf)[:,1]
y_test_preds_dt = tree_model.predict_proba(X_test_tf)[:,1]

In [34]:
y_train_preds_dt

array([0.        , 0.        , 0.        , ..., 0.        , 0.03000612,
       0.        ])

In [35]:
y_test_preds_dt

array([0., 0., 1., ..., 0., 0., 0.])

In [36]:
y_train_preds2_dt = tree_model.predict(X_train_tf)

from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_preds2_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     38476
           1       1.00      0.98      0.99      2414

    accuracy                           1.00     40890
   macro avg       1.00      0.99      0.99     40890
weighted avg       1.00      1.00      1.00     40890



In [37]:
y_test_preds3_dt = tree_model.predict(X_test_tf)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_preds3_dt))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94      9633
           1       0.11      0.12      0.11       590

    accuracy                           0.89     10223
   macro avg       0.53      0.53      0.53     10223
weighted avg       0.90      0.89      0.90     10223



### User Input:

In [38]:
report = '''ALLERGIES: ACCUTANE.

SOCIAL HISTORY: Denies smoking or drinking.

PHYSICAL EXAMINATION: Palpable pedal pulses noted bilaterally. Capillary refill time less than 3 seconds, digits 1 through 5 bilateral. Skin supple and intact with positive hair growth. Epicritic sensation intact bilateral. Muscle strength +5/5, dorsiflexors, plantar flexors, invertors, evertors. Left foot with erythema, edema, positive tenderness noted, left forefoot area.

LABORATORY: White blood cell count never was abnormal. The remaining within normal limits. X-ray is negative for osteomyelitis. On 06/14/07, the patient was taken to the OR for incision and drainage of left foot abscess. The patient tolerated the procedure well and was admitted and placed on vancomycin 1 g q.12h after surgery and later changed Ancef 2 g IV every 8 hours. Postop wound care consists of Aquacel Ag and dry dressing to the surgical site everyday and the patient remains nonweightbearing on the left foot. The patient progressively improved with IV antibiotics and local wound care and was discharged from the hospital on 06/19/07 in excellent condition.

DISCHARGE MEDICATIONS: Lorcet 10/650 mg, dispense 24 tablets, one tablet to be taken by mouth q.6h as needed for pain. The patient was continued on Ancef 2 g IV via PICC line and home health administration of IV antibiotics.

DISCHARGE INSTRUCTIONS: Included keeping the foot elevated with long periods of rest. The patient is to wear surgical shoe at all times for ambulation and to avoid excessive ambulation. The patient to keep dressing dry and intact, left foot. The patient to contact Dr. X for all followup care, if any problems arise. The patient was given written and oral instruction about wound care before discharge. Prior to discharge, the patient was noted to be afebrile. All vitals were stable. The patient's questions were answered and the patient was discharged in apparent satisfactory condition. Followup care was given via Dr. X' office.'''

In [40]:
processed_report = report.replace('\n',' ')
processed_report = processed_report.replace('\r',' ')
processed_report

"ALLERGIES: ACCUTANE.  SOCIAL HISTORY: Denies smoking or drinking.  PHYSICAL EXAMINATION: Palpable pedal pulses noted bilaterally. Capillary refill time less than 3 seconds, digits 1 through 5 bilateral. Skin supple and intact with positive hair growth. Epicritic sensation intact bilateral. Muscle strength +5/5, dorsiflexors, plantar flexors, invertors, evertors. Left foot with erythema, edema, positive tenderness noted, left forefoot area.  LABORATORY: White blood cell count never was abnormal. The remaining within normal limits. X-ray is negative for osteomyelitis. On 06/14/07, the patient was taken to the OR for incision and drainage of left foot abscess. The patient tolerated the procedure well and was admitted and placed on vancomycin 1 g q.12h after surgery and later changed Ancef 2 g IV every 8 hours. Postop wound care consists of Aquacel Ag and dry dressing to the surgical site everyday and the patient remains nonweightbearing on the left foot. The patient progressively improve

In [45]:
report_list = []
report_list.append(processed_report)

In [46]:
X_report = vect.transform(report_list)

In [47]:
y_report = log_reg_clf.predict_proba(X_report)[:,1]

In [48]:
y_report

array([0.43859129])

In [52]:
print('The patient has a ' + str(y_report[0]*100) + '% of readmission within 30 days.')

The patient has a 43.859128535316216% of readmission within 30 days.


In [53]:
import pickle

# Save the Modle to file in the current working directory

pkl_Filename = "log_reg_model.pkl"  

with open(pkl_Filename, 'wb') as file:  
    pickle.dump(log_reg_clf, file)

In [54]:
# Load the Model back from file
with open("log_reg_model.pkl", 'rb') as file:  
    loaded_model = pickle.load(file)

loaded_model