In [65]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [118]:
# load data
df = pd.read_csv('../data/raw/mtsamples.csv')
df.transcription=df.transcription.astype(str)

In [67]:
# remove rows with missing values
def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 100 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
    print("Number of rows after removing medical specialties with less than 100 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription_c"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    data["transcription_c"] = data["transcription_c"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 


#apply on dataset
df_m = clean_df(df)
df_test = remove_punct_lower(df_m)
df_test = lemmatize_words(df_test)

Number of rows after removing medical specialties with less than 100 samples: 2976


In [68]:
# NLP with Spacy
import spacy
import en_ner_bionlp13cg_md
nlp = en_ner_bionlp13cg_md.load()
def medical_entities( text):
    entities = []
    doc = nlp(text)
    for ent in doc.ents:
        entities.append(ent.text)
    return set(entities)#' ,'.join(entities)


df_test['transcription_c'] = [','.join(map(str, l)) for l in df_test['transcription_c']]
df_test['transcription_f'] = df_test['transcription_c'].apply(medical_entities)
df_test.head()

Unnamed: 0,transcription,medical_specialty,transcription_c,transcription_f
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"2d,mmode,1,left,atrial,enlargement,left,atrial...","{pulmonary, ventricular, left, valve, mitral}"
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"1,left,ventricular,cavity,size,wall,thickness,...","{pulmonary, lipomatous, leaflet, ventricular, ..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"2d,echocardiogrammultiple,view,heart,great,ves...","{pulmonary, arch, coronary, inflow, aorta, lef..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"description1,normal,cardiac,chamber,size2,norm...","{ventricular, left, valve, cardiac, mitral}"
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"2d,study1,mild,aortic,stenosis,widely,calcifie...","{ventricular, left, heart, ventricle, mitral}"


In [83]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)

df_test_X = df_test['transcription_f'].astype(str)


In [87]:
# split data into train and test set 
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)

In [101]:
model_pipeline = imbPipeline([
        ('preprocessing',CountVectorizer()),
        #('svd', TruncatedSVD(n_components=100)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(random_state=42, multi_class='multinomial')), # remainder="passthrough"
])

In [102]:
lr = model_pipeline.fit(X_train, y_train)
y_pred = lr.predict(X_test)
category_list = df_test.medical_specialty.unique()
print(classification_report(y_test, y_pred, target_names=category_list))


                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.23      0.25      0.24        52
                       Urology       0.16      0.15      0.15        40
              General Medicine       0.25      0.33      0.29        42
                       Surgery       0.05      0.08      0.06        25
 SOAP / Chart / Progress Notes       0.04      0.04      0.04        23
                     Radiology       0.23      0.33      0.27        30
                    Orthopedic       0.12      0.16      0.14        57
       Obstetrics / Gynecology       0.07      0.06      0.06        51
                     Neurology       0.15      0.14      0.14        35
              Gastroenterology       0.35      0.26      0.30       212
    Consult - History and Phy.       0.21      0.24      0.23        29

                      accuracy                           0.21       596
                     macro avg       0.17      0.19      0.18 

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [103]:
# take best model from grid search and perform evaluation

param_grid = [
    { 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
      'classifier': [LogisticRegression(multi_class='multinomial', random_state=42)],
      'classifier__solver': ['saga', 'lbfgs', 'liblinear'],
      'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    }
]

def grid_search(X_train, y_train, model_pipeline, param_grid):
    search = GridSearchCV(model_pipeline, param_grid, cv=5)
    search.fit(X_train, y_train)
    print("Best parameter", search.best_params_)
    return search.best_estimator_
    
best_model = grid_search(X_train, y_train, model_pipeline, param_grid)
y_pred = best_model.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=category_list))

Setting penalty='none' will ignore the C and l1_ratio parameters
The max_iter was reached which means the coef_ did not converge
Setting penalty='none' will ignore the C and l1_ratio parameters
The max_iter was reached which means the coef_ did not converge
Setting penalty='none' will ignore the C and l1_ratio parameters
The max_iter was reached which means the coef_ did not converge
Setting penalty='none' will ignore the C and l1_ratio parameters
The max_iter was reached which means the coef_ did not converge
Setting penalty='none' will ignore the C and l1_ratio parameters
The max_iter was reached which means the coef_ did not converge
Setting penalty='none' will ignore the C and l1_ratio parameters
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

Best parameter {'classifier': LogisticRegression(C=0.001, multi_class='multinomial', random_state=42,
                   solver='saga'), 'classifier__C': 0.001, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.45      0.56      0.50        52
                       Urology       0.40      0.20      0.27        40
              General Medicine       0.42      0.45      0.44        42
                       Surgery       0.17      0.16      0.17        25
 SOAP / Chart / Progress Notes       0.17      0.13      0.15        23
                     Radiology       0.33      0.60      0.42        30
                    Orthopedic       0.32      0.40      0.35        57
       Obstetrics / Gynecology       0.16      0.10      0.12        51
                     Neurology       0.16      0.23      0.19        35
              Gastroenterology       0.53      0.47      0.

The max_iter was reached which means the coef_ did not converge


In [136]:
# Prediction for sample from test set
sample = X_test.iloc[13]
print("Prediction:", best_model.predict([sample]))
# Actual category of first sample from test set
print("Actual:", y_test[13])

Prediction: [' Obstetrics / Gynecology']
Actual:  Obstetrics / Gynecology


In [137]:
sample

"{'lip', 'abdomen', 'needle', 'endometrial', 'wall', 'anterior', 'cervix', 'tissue', 'fallopian', 'decidual', 'patient', 'tube', 'unclotted', 'liver', 'umbilicus', 'mesosalpinx', 'saline', 'vulsellum', 'vagina', 'left', 'midline', 'curettings', 'clot', 'fundus', 'peritoneum', 'omental', 'vesicouterine', 'uterus', 'uterine', 'blood'}"

In [139]:
# Predict probabilities for a sample from test set
prob_array = best_model.predict_proba(X_test)[13,:]
prob_df = pd.DataFrame(prob_array, index=category_list, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df

Unnamed: 0,Probability
Radiology,0.257612
Gastroenterology,0.167588
Cardiovascular / Pulmonary,0.082872
General Medicine,0.076178
Orthopedic,0.070553
Obstetrics / Gynecology,0.062319
Consult - History and Phy.,0.059928
SOAP / Chart / Progress Notes,0.057917
Urology,0.057241
Neurology,0.056038


In [142]:
X_train_df = X_train.to_frame()

vectorizer = CountVectorizer()

bowmatrix = vectorizer.fit_transform(X_train_df["transcription_f"])

feat_df = pd.DataFrame(bowmatrix.toarray(), columns=vectorizer.get_feature_names())
feat = column_names = list(feat_df.columns)
len(feat)

Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


2574

In [117]:
import lime
import lime.lime_tabular
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=vectorizer.get_feature_names(), class_names=category_list)
# num features is the number of features to be shown
# top lables is the number of labels with the highest probability to be shown
exp = explainer.explain_instance(X_test[4], lr.predict_proba, num_features=5, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=False)


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


IndexError: tuple index out of range

In [99]:
import shap
from pydoc import classname

clf = LogisticRegression(random_state=42, multi_class='multinomial', penalty='l2', solver='lbfgs', C=0.01)
explainer = shap.LinearExplainer(clf, X_train, feature_dependence="independent")
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, class_names= category_list, feature_names=vectorizer.get_feature_names())

The option feature_dependence has been renamed to feature_perturbation!
The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!
The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


InvalidModelError: An unknown model type was passed: <class 'sklearn.linear_model._logistic.LogisticRegression'>