In [10]:
from array import array
import pandas as pd
import numpy as np
import pickle
import os
import imblearn
import sklearn
import scipy

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from traitlets import List

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE

from imblearn.over_sampling import SMOTE
from lime.lime_text import LimeTextExplainer

In [2]:
# load data
df = pd.read_csv('../data/processed/mtsamples_nlp.csv')

In [3]:
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
def build_pipeline() -> imblearn.pipeline.Pipeline:
    """
    Build pipeline for model
    """
    model_pipeline = imbPipeline(
        [
            ("preprocessing", CountVectorizer()),
            ("smote", SMOTE(random_state=42)),
            (
                "classifier",
                LogisticRegression(
                    random_state=42,
                ),
            ),  # remainder="passthrough"
        ]
    )
    return model_pipeline

In [6]:
model_pipeline = build_pipeline()

In [7]:
model = model_pipeline.fit(training_data.transcription_f, training_data.medical_specialty)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
def predict_probability(model: imblearn.pipeline.Pipeline, value: str) -> pd.DataFrame:
    """
    get probabilities for sample
    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample
    category_list: list[str]
        list of unique labels
    Returns
    -------
    pd.DataFrame
        Probabilities for labels
    """

    prob_array = model.predict_proba(value)
    prob_df = (
        pd.DataFrame(prob_array, index=["Probability"], columns=model.classes_)
        .transpose()
        .sort_values(by="Probability", ascending=False)
    )
    return prob_df

In [9]:
# Predict probability
to_pred = "coronary nitroglycerin muscle heart breast oxygen valve artery"
res_df = predict_probability(model, [to_pred])
print(res_df)

                                Probability
 Cardiovascular / Pulmonary        0.725316
 Radiology                         0.153696
 General Medicine                  0.088011
 Obstetrics / Gynecology           0.016509
 Consult - History and Phy.        0.011008
 Surgery                           0.004094
 Neurology                         0.000660
 Gastroenterology                  0.000326
 SOAP / Chart / Progress Notes     0.000313
 Urology                           0.000049
 Orthopedic                        0.000018


In [11]:
def lime_explainer(model: imblearn.pipeline.Pipeline, value: str):
    """
    Get features the model used for top predicted classes

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample

    Returns
    -------
    dict
        Features from sample the model used to predict classes
    """
    explainer = LimeTextExplainer(class_names=model.classes_)
    num_features = len(value.split())
    exp = explainer.explain_instance(
        value, model.predict_proba, num_features=num_features, top_labels=3
    )
    feat_importance = exp.as_map()
    feat_importance = {model.classes_[k]: v for k, v in feat_importance.items()}
    feat_importance = {
        k: [(value.split()[i], v) for i, v in v] for k, v in feat_importance.items()
    }
    feat_importance_pos = {
        k: [v for v in v if v[1] > 0] for k, v in feat_importance.items()
    }
    return feat_importance_pos

In [12]:
# features the model used for top predicted classes
feat_importance = lime_explainer(model, to_pred)
print(feat_importance)

{' General Medicine': [('breast', 0.08538663930853976), ('muscle', 0.08129614124637603), ('heart', 0.03257568466818079), ('oxygen', 0.027563602319390052)], ' Radiology': [('artery', 0.14826870849067347), ('nitroglycerin', 0.07722799931520544), ('muscle', 0.04454801614169169)], ' Cardiovascular / Pulmonary': [('oxygen', 0.2561777240885622), ('artery', 0.23414528182539765), ('heart', 0.21199254218646277), ('coronary', 0.20022344949631096), ('valve', 0.13659521118496687)]}


In [16]:
x = ' General Medicine'

# if key is in dict return value (first value in list)
def get_value(x):
    if x in feat_importance:
        return feat_importance[x]

In [18]:
value = get_value(x)

In [22]:
# only access words in list
words = [v[0] for v in value]
words

['breast', 'muscle', 'heart', 'oxygen']

In [23]:
# combine get_value and words into one function
def get_words(x):
    if x in feat_importance:
        value = feat_importance[x]
        words = [v[0] for v in value]
        return words
        

In [24]:
get_words(x)

['breast', 'muscle', 'heart', 'oxygen']