In [1]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

In [2]:
## Load data
mtsamples_df = pd.read_csv("../data/raw/mtsamples.csv")

In [3]:
## General Data Cleaning
mtsamples_df = mtsamples_df.dropna()
mtsamples_df = mtsamples_df.drop_duplicates() 

In [4]:
## Data Preprocessing
data_categories  = mtsamples_df.groupby(mtsamples_df['medical_specialty'])
# Only use medical specialties with more than 100 samples
filtered_data_categories = data_categories.filter(lambda x:x.shape[0] > 100)
final_data_categories = filtered_data_categories.groupby(filtered_data_categories['medical_specialty'])
data = filtered_data_categories[['transcription', 'medical_specialty']]

In [5]:
# Feature Engineering functions for text (taken from Kaggle)
def clean_text(text ): 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text1 = ''.join([w for w in text if not w.isdigit()]) 
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    #BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    
    text2 = text1.lower()
    text2 = REPLACE_BY_SPACE_RE.sub('', text2) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text2 = BAD_SYMBOLS_RE.sub('', text2)
    return text2

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    
    intial_sentences= sentences[0:1]
    final_sentences = sentences[len(sentences)-2: len(sentences)-1]
    
    for sentence in intial_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))
    for sentence in final_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))       
    return ' '.join(wordlist) 

In [6]:
# Apply Feature Engineering (taken from Kaggle)
data['transcription'] = data['transcription'].apply(lemmatize_text)
data['transcription'] = data['transcription'].apply(clean_text)

In [7]:
data['transcription']

3       d mmode     mild mitral and tricuspid regurgit...
4         there is a color doppler suggestive of a pat...
7       d echocardiogram  multiple view of the heart a...
9             description    normal lv systolic function 
11                d study  moderate biatrial enlargement 
                              ...                        
4967    exam   left heart cath  selective coronary ang...
4968    indication   acute coronary syndrome  consent ...
4971    angina  is chest pain due to a lack of oxygen ...
4972    indication   chest pain  type of test   adenos...
4975    history of present illness   mr abc is a yearo...
Name: transcription, Length: 2947, dtype: object

In [8]:
# Vectorize text and create sparse matrix
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(data['transcription'].tolist())
tfIdfMat.shape

(2947, 1000)

In [11]:
# Create X and y and apply PCA to reduce dimensionality of features
pca = PCA(n_components=0.95)
tfIdfMat_reduced = pca.fit_transform(tfIdfMat.toarray())
labels = data['medical_specialty'].tolist()
category_list = data.medical_specialty.unique()
category_list

array([' Cardiovascular / Pulmonary', ' Urology', ' General Medicine',
       ' Surgery', ' SOAP / Chart / Progress Notes', ' Radiology',
       ' Orthopedic', ' Obstetrics / Gynecology', ' Neurology',
       ' Gastroenterology', ' Consult - History and Phy.'], dtype=object)

In [12]:
tfIdfMat_reduced

array([[ 0.05420611,  0.01511574,  0.10898833, ...,  0.00325415,
         0.00301091, -0.01435424],
       [ 0.0316351 ,  0.01493548,  0.08139049, ..., -0.02015478,
        -0.01930347,  0.00995442],
       [ 0.05076146,  0.00193637,  0.17429783, ...,  0.01664893,
         0.01834395,  0.02163482],
       ...,
       [ 0.0862155 ,  0.01382344,  0.12266943, ...,  0.00074459,
         0.01424753, -0.0073192 ],
       [ 0.06285067, -0.00218799,  0.15498886, ..., -0.01984566,
        -0.02771554,  0.01695366],
       [ 0.27695904,  0.06378962, -0.04942733, ..., -0.01779754,
        -0.00940408,  0.00705204]])

In [12]:
# Create a class to select numerical or categorical columns 
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [None]:
# include all data cleaning steps in pipeline (?)
#cat_attribs = tfIdfMat_reduced

#cat_pipeline = Pipeline([
#        ('step1', DataFrameSelector(cat_attribs)),
#        ('step2', StandardScaler()),
#        ('selector', DataFrameSelector(cat_attribs)),
#        ('cat_encoder', OneHotEncoder(sparse=False)),])

In [None]:
#mt_features = cat_pipeline.fit_transform(mtsamples_df)
#mt_features.shape    

In [14]:
# Split data into train and test

X_train, X_test, y_train, y_test = train_test_split(tfIdfMat_reduced, labels, test_size=0.2, random_state=42)

In [15]:
# build and train model
# default solver would be lbfgs, but it does not support l1_ratio which combines l1 and l2 regularization
clf = LogisticRegression(random_state=0, penalty= 'elasticnet', solver= 'saga', l1_ratio=0.5, multi_class='multinomial')
lr = Pipeline(steps=[('classifier', clf)]).fit(X_train, y_train)

In [41]:
# predict and evaluate
y_pred = lr.predict(X_test)
print('Very poor performance of model with an accuracy of only 0.45 and very low f1-scores')
print(classification_report(y_test, y_pred, target_names=category_list))

Very poor performance of model with an accuracy of only 0.45 and very low f1-scores
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.32      0.22      0.26        50
                       Urology       0.42      0.51      0.46        47
              General Medicine       0.33      0.08      0.12        39
                       Surgery       0.33      0.21      0.26        33
 SOAP / Chart / Progress Notes       0.35      0.36      0.35        25
                     Radiology       0.00      0.00      0.00        27
                    Orthopedic       0.24      0.12      0.16        52
       Obstetrics / Gynecology       0.48      0.43      0.45        54
                     Neurology       0.29      0.34      0.32        29
              Gastroenterology       0.55      0.87      0.68       207
    Consult - History and Phy.       1.00      0.11      0.20        27

                      accuracy                    

In [85]:
# finetune model and choosing best C, solver, penalty with grid search
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'classifier__solver': ['saga', 'lbfgs', 'liblinear'], 'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet']}
grid_search = GridSearchCV(lr, param_grid, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}

In [16]:
clf = LogisticRegression(random_state=42, penalty= 'l1', solver= 'saga', multi_class='multinomial', C=1)
lr = Pipeline(steps=[('classifier', clf)]).fit(X_train, y_train)
# predict and evaluate
y_pred = lr.predict(X_test)
print('Model performance only improves slightly in weighted average after finetuning')
print(classification_report(y_test, y_pred, target_names=category_list))

# classfifcation report function
def classification_report_with_accuracy_score(y_test, y_pred):
    category_list = data.medical_specialty.unique()
    return classification_report(y_test, y_pred, target_names=category_list)

Model performance only improves slightly in weighted average after finetuning
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.32      0.20      0.25        50
                       Urology       0.35      0.47      0.40        47
              General Medicine       0.45      0.13      0.20        39
                       Surgery       0.32      0.24      0.28        33
 SOAP / Chart / Progress Notes       0.35      0.32      0.33        25
                     Radiology       0.00      0.00      0.00        27
                    Orthopedic       0.28      0.17      0.21        52
       Obstetrics / Gynecology       0.44      0.41      0.42        54
                     Neurology       0.32      0.34      0.33        29
              Gastroenterology       0.56      0.86      0.68       207
    Consult - History and Phy.       1.00      0.11      0.20        27

                      accuracy                          

In [17]:
# applying some domain knowlegde to improve model performance
# show categories with count of samples
data_categories = data.groupby(data['medical_specialty'])
print('There are certain generic categories like surgery, SOAP, or Consult which are not very specific and the dataset is very unbalanced')
data_categories.count()

There are certain generic categories like surgery, SOAP, or Consult which are not very specific and the dataset is very unbalanced


Unnamed: 0_level_0,transcription
medical_specialty,Unnamed: 1_level_1
Cardiovascular / Pulmonary,276
Consult - History and Phy.,220
Gastroenterology,193
General Medicine,137
Neurology,162
Obstetrics / Gynecology,128
Orthopedic,296
Radiology,248
SOAP / Chart / Progress Notes,140
Surgery,1008


In [46]:
# balance out dataset with SMOTE, creates synthetic samples of the minority classes
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(tfIdfMat_reduced, labels)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Model performance improves significantly after balancing dataset with SMOTE')
print('However some categories like Gastrientology or Obstetrics / Gynecology are still not predicted well')
print(classification_report(y_test, y_pred, target_names=category_list))

Model performance improves significantly after balancing dataset with SMOTE
However some categories like Gastrientology are still not predicted well
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.67      0.71      0.69       189
                       Urology       0.65      0.60      0.63       207
              General Medicine       0.74      0.79      0.76       198
                       Surgery       0.64      0.75      0.69       218
 SOAP / Chart / Progress Notes       0.59      0.70      0.64       177
                     Radiology       0.78      0.92      0.84       191
                    Orthopedic       0.68      0.64      0.66       206
       Obstetrics / Gynecology       0.59      0.49      0.53       218
                     Neurology       0.67      0.65      0.66       214
              Gastroenterology       0.36      0.23      0.28       197
    Consult - History and Phy.       0.79      0.82      0

In [47]:
# ROC curve
# Generate class membership probabilities
y_preb_probs = lr.predict_proba(X_test)
score = roc_auc_score(y_test, y_preb_probs, average="weighted", multi_class="ovr")
print('Average AUROC score of', round(score,4))

# ROC score function
def roc_auc_score_multiclass():
    y_preb_probs = lr.predict_proba(X_test)
    score = roc_auc_score(y_test, y_preb_probs, average="weighted", multi_class="ovr")
    return('Average AUROC score of', round(score,4))


Average AUROC score of 0.9552


In [49]:
# version of accuracy that also integrates measurements of chance and class imbalance
# Generally, a score above 0.8 is considered excellent
cohen_kappa_score(y_test, y_pred)

0.6294970150158223

In [50]:
# table with predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=category_list, columns=category_list)
index = cm_df.index
index.name = 'Actual'
cm_df['Total'] = cm_df.apply(lambda x: x.sum(), axis=1)
print('Predictions for each category')
cm_df

Predictions for each category


Unnamed: 0_level_0,Cardiovascular / Pulmonary,Urology,General Medicine,Surgery,SOAP / Chart / Progress Notes,Radiology,Orthopedic,Obstetrics / Gynecology,Neurology,Gastroenterology,Consult - History and Phy.,Total
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Cardiovascular / Pulmonary,135,7,2,4,0,3,1,14,6,17,0,189
Urology,0,125,2,40,13,3,5,0,13,0,6,207
General Medicine,3,3,157,1,0,3,1,13,1,14,2,198
Surgery,1,19,0,164,3,0,0,0,31,0,0,218
SOAP / Chart / Progress Notes,1,8,1,3,124,2,6,18,2,11,1,177
Radiology,3,1,4,0,0,175,0,2,2,2,2,191
Orthopedic,0,4,1,4,13,4,132,19,2,25,2,206
Obstetrics / Gynecology,35,1,8,0,48,10,4,106,0,5,1,218
Neurology,2,17,1,36,4,0,8,0,140,0,6,214
Gastroenterology,21,0,37,1,4,22,34,4,3,46,25,197


In [54]:
# Predict probabilities for a sample from test set
prob_array = lr.predict_proba(X_test)[0,:]
prob_df = pd.DataFrame(prob_array, index=category_list, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df

Unnamed: 0,Probability
Surgery,0.566344
Urology,0.266607
Neurology,0.066584
Consult - History and Phy.,0.023439
Orthopedic,0.01887
SOAP / Chart / Progress Notes,0.017109
General Medicine,0.015937
Cardiovascular / Pulmonary,0.012332
Radiology,0.006744
Gastroenterology,0.005774


In [18]:
# function to predict category for a given text
def predict_category(sample):
    sample = lemmatize_text(sample)
    sample = clean_text(sample)
    sample = vectorizer.transform([sample])
    sample = pca.transform(sample.toarray())
    prob_array = lr.predict_proba(sample)
    prob_df = pd.DataFrame(prob_array, index=['Probability'], columns=category_list).T
    prob_df = prob_df.sort_values(by='Probability', ascending=False)
    return prob_df

In [25]:
sample = 'My heart is beating fast and I have pain in my chest'
sample = lemmatize_text(sample)
sample = clean_text(sample)
sample = vectorizer.transform([sample])
sample = pca.transform(sample.toarray())
sample

array([[ 6.62437373e-02,  8.70115132e-03,  1.33169231e-01,
         6.52219711e-03, -7.92291789e-02, -3.78873601e-02,
        -1.23757525e-02, -9.16603387e-03,  4.41445611e-02,
         1.32172361e-01,  6.31916868e-02,  9.03447229e-02,
         1.08195971e-02, -8.55883079e-02,  5.55792166e-02,
         5.21243906e-02,  7.62554003e-02,  1.31871610e-02,
        -4.48038399e-02, -1.87052122e-01, -4.08183221e-02,
         1.42240020e-01, -5.21270689e-02, -1.04256054e-02,
         1.06589725e-01, -1.21761904e-01,  2.34237743e-01,
        -4.51387990e-02, -8.73822572e-04, -1.68867153e-01,
         7.10848529e-02, -4.77374408e-02, -8.96081274e-03,
         8.44615631e-02,  6.38032394e-02, -1.83272090e-02,
        -6.30535452e-02,  3.15095472e-03,  1.47555052e-02,
         2.93582218e-02, -1.03476614e-02, -3.32065312e-02,
         3.86234753e-02, -5.18095788e-02,  1.55240483e-02,
         6.78599619e-02,  4.16129786e-02, -7.96199642e-02,
        -4.66461036e-02, -2.24610690e-02,  5.62661781e-0

In [19]:
# Predict probabilities for a new sample
sample = 'My heart is beating fast and I have pain in my chest'
predict_category(sample)

Unnamed: 0,Probability
Cardiovascular / Pulmonary,0.509004
Obstetrics / Gynecology,0.243868
Gastroenterology,0.076381
Orthopedic,0.034537
Urology,0.031957
SOAP / Chart / Progress Notes,0.029296
Surgery,0.025919
Radiology,0.024154
Neurology,0.008656
General Medicine,0.008636


In [57]:
sample_2 = "Multiple views of the heart and great vessels reveal normal intracardiac and great vessel relationships."
predict_category(sample_2)

Unnamed: 0,Probability
Obstetrics / Gynecology,0.646012
Cardiovascular / Pulmonary,0.138018
Surgery,0.096524
Gastroenterology,0.036741
Urology,0.021423
SOAP / Chart / Progress Notes,0.019092
Orthopedic,0.017372
General Medicine,0.009357
Neurology,0.006584
Radiology,0.004471


In [38]:
# save the model
import pickle
pickle.dump(lr, open('model_lr.pkl', 'wb'))