In [5]:
from array import array
import pandas as pd
import numpy as np
import pickle
import os
import imblearn
import sklearn
import scipy

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from traitlets import List

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE

from imblearn.over_sampling import SMOTE
from lime.lime_text import LimeTextExplainer

In [6]:
# load data
df = pd.read_csv('../data/processed/mtsamples_nlp.csv')

In [7]:
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
def build_pipeline() -> imblearn.pipeline.Pipeline:
    """
    Build pipeline for model
    """
    model_pipeline = imbPipeline(
        [
            ("preprocessing", CountVectorizer()),
            ("smote", SMOTE(random_state=42)),
            (
                "classifier",
                LogisticRegression(
                    random_state=42,
                ),
            ),  # remainder="passthrough"
        ]
    )
    return model_pipeline

In [9]:
model_pipeline = build_pipeline()

In [10]:
model = model_pipeline.fit(training_data.transcription_f, training_data.medical_specialty)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
def predict_probability(model: imblearn.pipeline.Pipeline, value: str) -> pd.DataFrame:
    """
    get probabilities for sample
    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample
    category_list: list[str]
        list of unique labels
    Returns
    -------
    pd.DataFrame
        Probabilities for labels
    """

    prob_array = model.predict_proba(value)
    prob_df = (
        pd.DataFrame(prob_array, index=["Probability"], columns=model.classes_)
        .transpose()
        .sort_values(by="Probability", ascending=False)
    )
    return prob_df

In [12]:
# Predict probability
to_pred = "coronary nitroglycerin muscle heart breast oxygen valve artery"
res_df = predict_probability(model, [to_pred])
print(res_df)

                                Probability
 Cardiovascular / Pulmonary        0.725316
 Radiology                         0.153696
 General Medicine                  0.088011
 Obstetrics / Gynecology           0.016509
 Consult - History and Phy.        0.011008
 Surgery                           0.004094
 Neurology                         0.000660
 Gastroenterology                  0.000326
 SOAP / Chart / Progress Notes     0.000313
 Urology                           0.000049
 Orthopedic                        0.000018


In [13]:
def lime_explainer(model: imblearn.pipeline.Pipeline, value: str):
    """
    Get features the model used for top predicted classes

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample

    Returns
    -------
    dict
        Features from sample the model used to predict classes
    """
    explainer = LimeTextExplainer(class_names=model.classes_)
    num_features = len(value.split())
    exp = explainer.explain_instance(
        value, model.predict_proba, num_features=num_features, top_labels=3
    )
    feat_importance = exp.as_map()
    feat_importance = {model.classes_[k]: v for k, v in feat_importance.items()}
    feat_importance = {
        k: [(value.split()[i], v) for i, v in v] for k, v in feat_importance.items()
    }
    feat_importance_pos = {
        k: [v for v in v if v[1] > 0] for k, v in feat_importance.items()
    }
    return feat_importance_pos

In [14]:
# features the model used for top predicted classes
feat_importance = lime_explainer(model, to_pred)
print(feat_importance)

{' General Medicine': [('breast', 0.08721571703500162), ('muscle', 0.07988169304133039), ('heart', 0.03135017615158257), ('oxygen', 0.030225364661143464)], ' Radiology': [('artery', 0.14760497121135482), ('nitroglycerin', 0.07510531220124371), ('muscle', 0.043301535530355345)], ' Cardiovascular / Pulmonary': [('oxygen', 0.25242070018155083), ('artery', 0.22901780568996302), ('heart', 0.21310866418142863), ('coronary', 0.19967080647956545), ('valve', 0.13323602638065268)]}


In [16]:
x = ' General Medicine'

# if key is in dict return value (first value in list)
def get_value(x):
    if x in feat_importance:
        return feat_importance[x]

In [18]:
value = get_value(x)

In [22]:
# only access words in list
words = [v[0] for v in value]
words

['breast', 'muscle', 'heart', 'oxygen']

In [23]:
# combine get_value and words into one function
def get_words(x):
    if x in feat_importance:
        value = feat_importance[x]
        words = [v[0] for v in value]
        return words
        

In [24]:
get_words(x)

['breast', 'muscle', 'heart', 'oxygen']

In [4]:
np.logspace(-4, 4, 20)

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

In [16]:
explainer = LimeTextExplainer(class_names=model.classes_)
num_features = len(to_pred.split())
exp = explainer.explain_instance(
    to_pred, model.predict_proba, num_features=num_features, top_labels=3
)
feat_importance = exp.as_map()
feat_importance = {model.classes_[k]: v for k, v in feat_importance.items()}
feat_importance = {
    k: [(to_pred.split()[i], v) for i, v in v] for k, v in feat_importance.items()
}
feat_importance_pos = {
    k: [v for v in v if v[1] > 0] for k, v in feat_importance.items()
}

In [17]:
feat_importance

{' General Medicine': [('artery', -0.1869880324487356),
  ('breast', 0.0880783857407666),
  ('muscle', 0.07814815334554133),
  ('valve', -0.060311035364575015),
  ('heart', 0.03292414708991069),
  ('oxygen', 0.026857580194068784),
  ('nitroglycerin', -0.021849195996523633),
  ('coronary', -0.006822275744941747)],
 ' Radiology': [('oxygen', -0.19120402466101397),
  ('artery', 0.14181291398386572),
  ('nitroglycerin', 0.07048150048241235),
  ('muscle', 0.04382698549680477),
  ('coronary', -0.023307206099310092),
  ('heart', -0.022521716599489632),
  ('breast', -0.019836767093437305),
  ('valve', -0.008045373440971927)],
 ' Cardiovascular / Pulmonary': [('oxygen', 0.256451435035843),
  ('artery', 0.23452081596977184),
  ('heart', 0.21094362769758024),
  ('coronary', 0.1995309298470315),
  ('breast', -0.19744731236806046),
  ('muscle', -0.13769584310116986),
  ('valve', 0.1374394013956226),
  ('nitroglycerin', -0.065400545820238)]}

In [18]:
predict_probability(model, [to_pred])

Unnamed: 0,Probability
Cardiovascular / Pulmonary,0.725316
Radiology,0.153696
General Medicine,0.088011
Obstetrics / Gynecology,0.016509
Consult - History and Phy.,0.011008
Surgery,0.004094
Neurology,0.00066
Gastroenterology,0.000326
SOAP / Chart / Progress Notes,0.000313
Urology,4.9e-05


In [None]:
# how to explain lime output
# https://stackoverflow.com/questions/53895007/how-to-interpret-lime-output