In [101]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

import warnings
warnings.filterwarnings("ignore")

In [102]:
## Load data
mtsamples_df = pd.read_csv("../data/raw/mtsamples.csv")

In [103]:
## Data Cleaning
mtsamples_df = mtsamples_df.dropna()
mtsamples_df = mtsamples_df.drop_duplicates() 

In [104]:
## Data Preprocessing
data_categories  = mtsamples_df.groupby(mtsamples_df['medical_specialty'])
# only use medical specialties with more than 100 samples
filtered_data_categories = data_categories.filter(lambda x:x.shape[0] > 100)
final_data_categories = filtered_data_categories.groupby(filtered_data_categories['medical_specialty'])
data = filtered_data_categories[['transcription', 'medical_specialty']]

In [105]:
# Feature Engineering functions for text (taken from Kaggle)
def clean_text(text ): 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text1 = ''.join([w for w in text if not w.isdigit()]) 
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    #BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    
    text2 = text1.lower()
    text2 = REPLACE_BY_SPACE_RE.sub('', text2) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text2 = BAD_SYMBOLS_RE.sub('', text2)
    return text2

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    
    intial_sentences= sentences[0:1]
    final_sentences = sentences[len(sentences)-2: len(sentences)-1]
    
    for sentence in intial_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))
    for sentence in final_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))       
    return ' '.join(wordlist) 

In [106]:
# Apply Feature Engineering (taken from Kaggle)
data['transcription'] = data['transcription'].apply(lemmatize_text)
data['transcription'] = data['transcription'].apply(clean_text)

In [107]:
# Vectorize text and create sparse matrix
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(data['transcription'].tolist() )
tfIdfMat

<3010x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 67838 stored elements in Compressed Sparse Row format>

In [108]:
# Create X and y and apply PCA to reduce dimensionality of features
import gc
gc.collect()
pca = PCA(n_components=0.95)
tfIdfMat_reduced = pca.fit_transform(tfIdfMat.toarray())
labels = data['medical_specialty'].tolist()
category_list = data.medical_specialty.unique()
labels

[' Cardiovascular / Pulmonary',
 ' Cardiovascular / Pulmonary',
 ' Cardiovascular / Pulmonary',
 ' Cardiovascular / Pulmonary',
 ' Cardiovascular / Pulmonary',
 ' Cardiovascular / Pulmonary',
 ' Urology',
 ' General Medicine',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Urology',
 ' Uro

In [109]:
# Create a class to select numerical or categorical columns 
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [None]:
#cat_attribs = tfIdfMat_reduced

#cat_pipeline = Pipeline([
#        ('selector', DataFrameSelector(cat_attribs)),
#        ('cat_encoder', OneHotEncoder(sparse=False)),])

In [None]:
#full_pipeline = FeatureUnion(transformer_list=[
#        ("cat_pipeline", cat_pipeline),])

In [None]:
#mt_features = full_pipeline.fit_transform(mtsamples_df)
#mt_features.shape    

In [110]:
# Split data into train and test

X_train, X_test, y_train, y_test = train_test_split(tfIdfMat_reduced, labels, test_size=0.2, random_state=42)

In [138]:
# build model
# default solver would be lbfgs, but it does not support l1_ratio which combines l1 and l2 regularization
clf = LogisticRegression(random_state=0, penalty= 'elasticnet', solver= 'saga', l1_ratio=0.5, multi_class='multinomial')
lr = Pipeline(steps=[('classifier', clf)])

In [139]:
# train model
lr.fit(X_train, y_train)

In [140]:
# predict and evaluate
y_pred = lr.predict(X_test)
print('Very poor performance of model with an accuracy of only 0.45 and very low f1-scores')
print(classification_report(y_test, y_pred, target_names=category_list))

Very poor performance of model with an accuracy of only 0.45 and very low f1-scores
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.36      0.25      0.30        63
                       Urology       0.33      0.55      0.41        38
              General Medicine       0.31      0.10      0.15        42
                       Surgery       0.11      0.06      0.08        33
 SOAP / Chart / Progress Notes       0.40      0.17      0.24        35
                     Radiology       0.40      0.09      0.15        22
                    Orthopedic       0.31      0.15      0.21        52
       Obstetrics / Gynecology       0.27      0.32      0.29        38
                     Neurology       0.44      0.44      0.44        32
              Gastroenterology       0.55      0.85      0.67       216
    Consult - History and Phy.       0.40      0.06      0.11        31

                      accuracy                    

In [141]:
# finetune model and choosing best C and l1 ratio with grid search
from sklearn.model_selection import GridSearchCV
param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
grid_search = GridSearchCV(lr, param_grid, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'classifier__C': 1, 'classifier__l1_ratio': 0.9}

In [142]:
clf = LogisticRegression(random_state=0, penalty= 'elasticnet', solver= 'saga', l1_ratio=0.9, multi_class='multinomial', C=1)
lr = Pipeline(steps=[('classifier', clf)]).fit(X_train, y_train)
# predict and evaluate
y_pred = lr.predict(X_test)
print('Model performance only improves slightly after finetuning')
print(classification_report(y_test, y_pred, target_names=category_list))

Model performance only improves slightly after finetuning
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.41      0.30      0.35        63
                       Urology       0.32      0.55      0.41        38
              General Medicine       0.33      0.12      0.18        42
                       Surgery       0.21      0.12      0.15        33
 SOAP / Chart / Progress Notes       0.38      0.17      0.24        35
                     Radiology       0.29      0.09      0.14        22
                    Orthopedic       0.30      0.17      0.22        52
       Obstetrics / Gynecology       0.27      0.32      0.29        38
                     Neurology       0.43      0.38      0.40        32
              Gastroenterology       0.56      0.84      0.68       216
    Consult - History and Phy.       0.50      0.13      0.21        31

                      accuracy                           0.46       602
    

In [148]:
# applying some domain knowlegde to improve model performance
# show categories with count of samples
data_categories = data.groupby(data['medical_specialty'])
print('there are certain generic categories like surgery, SOAP, or Consult which are not very specific and the dataset is very unbalanced')
data_categories.count()

there are certain generic categories like surgery, SOAP, or Consult which are not very specific and the dataset is very unbalanced


Unnamed: 0_level_0,transcription
medical_specialty,Unnamed: 1_level_1
Cardiovascular / Pulmonary,280
Consult - History and Phy.,234
Gastroenterology,195
General Medicine,146
Neurology,168
Obstetrics / Gynecology,130
Orthopedic,303
Radiology,251
SOAP / Chart / Progress Notes,142
Surgery,1021


In [150]:
# balance out dataset with SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(tfIdfMat_reduced, labels)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Model performance improves significantly after balancing dataset with SMOTE')
print('However some categories like Gastrientology are still not predicted well')
print(classification_report(y_test, y_pred, target_names=category_list))

Model performance improves significantly after balancing dataset with SMOTE
However some categories like Gastrientology are still not predicted well
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.75      0.71      0.73       208
                       Urology       0.61      0.57      0.59       206
              General Medicine       0.76      0.82      0.79       214
                       Surgery       0.67      0.73      0.70       211
 SOAP / Chart / Progress Notes       0.63      0.68      0.65       209
                     Radiology       0.76      0.89      0.82       190
                    Orthopedic       0.70      0.65      0.67       197
       Obstetrics / Gynecology       0.62      0.56      0.59       204
                     Neurology       0.65      0.76      0.70       207
              Gastroenterology       0.50      0.30      0.38       201
    Consult - History and Phy.       0.79      0.88      0