In [66]:
from array import array
import pandas as pd
import numpy as np
import pickle
import os
import imblearn
import sklearn
import scipy

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from traitlets import List

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE

from imblearn.over_sampling import SMOTE
from lime.lime_text import LimeTextExplainer

In [20]:
# load data
df = pd.read_csv('../data/processed/mtsamples_nlp.csv')

In [21]:
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=42)

In [41]:
def build_pipeline() -> imblearn.pipeline.Pipeline:
    """
    Assemble the unfitted text-classification pipeline.

    The steps run in this order: bag-of-words vectorisation, SMOTE
    oversampling, then an L1-penalised multinomial logistic regression
    fitted with the saga solver.

    Returns
    -------
    imblearn.pipeline.Pipeline
        unfitted pipeline with the steps "preprocessing", "smote" and
        "classifier"
    """
    vectorizer = CountVectorizer()
    oversampler = SMOTE(random_state=42)
    classifier = LogisticRegression(
        C=1,
        penalty="l1",
        solver="saga",
        multi_class="multinomial",
        random_state=42,
    )
    steps = [
        ("preprocessing", vectorizer),
        ("smote", oversampler),
        ("classifier", classifier),
    ]
    return imbPipeline(steps)

In [42]:
model_pipeline = build_pipeline()

### Without grid search

In [43]:
model = model_pipeline.fit(training_data.transcription_f, training_data.medical_specialty)



In [44]:
# accuracy @3: is the true label among the three most probable classes?
y_preb_probs = model.predict_proba(testing_data.transcription_f)
# argsort is ascending, so the last three columns hold the top-3 class indices
top = np.argsort(y_preb_probs, axis=1)[:, -3:]
# fancy indexing turns the whole index matrix into class labels at once
top = model.classes_[top]
# column vector so the comparison broadcasts against the three candidates per row
actual = np.asarray(testing_data.medical_specialty)[:, np.newaxis]
np.any(top == actual, axis=1).mean()

0.8456375838926175

In [51]:
# NOTE(review): this cell repeats the accuracy @3 computation of the previous cell
# on the same model and test split, and its recorded output is the same value.
y_preb_probs = model.predict_proba(testing_data.transcription_f)
# argsort is ascending, so the last three columns hold the top-3 class indices
top = np.argsort(y_preb_probs, axis=1)[:, -3:]
# translate class indices into class labels, row by row
top = np.apply_along_axis(lambda x: model.classes_[x], 1, top)
# column vector so the comparison broadcasts against the three candidates per row
actual = np.array(testing_data.medical_specialty).reshape(-1, 1)
# share of rows whose true label is among the three candidates
np.any(top == actual, axis=1).mean()

0.8456375838926175

### With grid search

In [45]:
def grid_search(
    X_train: pd.core.series.Series,
    y_train: list,
    model_pipeline: imblearn.pipeline.Pipeline,
    param_grid: list,
    cv: int = 5,
    scoring=None,
) -> imblearn.pipeline.Pipeline:
    """
    Grid search for best model

    Parameters
    ----------
    X_train : pd.core.series.Series
        train data
    y_train : list
        train labels (any array-like accepted by GridSearchCV.fit)
    model_pipeline : imblearn.pipeline.Pipeline
        pipeline for model
    param_grid : list
        list of parameter dictionaries for grid search
    cv : int, optional
        cross-validation setting forwarded to GridSearchCV, by default 5
    scoring : str, callable or None, optional
        selection metric forwarded to GridSearchCV. None (the default) keeps
        the pipeline's own score method; pass a scorer here when the model
        should be tuned for the metric it is later evaluated with.

    Returns
    -------
    imblearn.pipeline.Pipeline
        best model, refitted on the full training data
    """
    search = GridSearchCV(model_pipeline, param_grid, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)
    print("Best parameters:", search.best_params_)
    return search.best_estimator_

In [None]:
def custom_accuracy_function(model, X_test, y_test, k=3):
    """
    Custom scorer with accuracy @k

    A sample counts as correct when its true label is among the k classes
    with the highest predicted probability. With the default k the function
    can be called as scorer(estimator, X, y).

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        fitted model exposing predict_proba and classes_
    X_test: pd.core.series.Series
        evaluation data
    y_test: list
        true labels for X_test
    k : int, optional
        number of top-ranked classes to accept, by default 3

    Returns
    -------
    float
        accuracy @k

    Raises
    ------
    ValueError
        if k is smaller than 1
    """
    if k < 1:
        # [:, -0:] selects every column, so k=0 would count every row as a hit
        raise ValueError(f"k must be >= 1, got {k}")
    probabilities = model.predict_proba(X_test)
    # argsort is ascending, so the last k columns hold the k most probable classes
    top_idx = np.argsort(probabilities, axis=1)[:, -k:]
    top_labels = np.asarray(model.classes_)[top_idx]
    # column vector so the comparison broadcasts against the k candidates per row
    actual = np.asarray(y_test).reshape(-1, 1)
    return np.any(top_labels == actual, axis=1).mean()

In [46]:
# candidate values for the C parameter of the pipeline's "classifier" step
param_grid = [{"classifier__C": [0.01, 0.1, 1, 10]}]

best_model = grid_search(
    X_train=training_data.transcription_f,
    y_train=training_data.medical_specialty,
    model_pipeline=model_pipeline,
    param_grid=param_grid,
)



Best parameters: {'classifier__C': 0.1}




In [47]:
# accuracy @3 of the tuned model; class labels are taken from best_model itself,
# i.e. from the same estimator that produced the probabilities
custom_accuracy_function(
    best_model,
    testing_data.transcription_f,
    testing_data.medical_specialty,
    k=3,
)

0.7835570469798657

In [25]:
def predict_probability(model: imblearn.pipeline.Pipeline, value: str) -> pd.DataFrame:
    """
    Get class probabilities for a single sample

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        fitted pipeline exposing predict_proba and classes_
    value : str
        sample text; a one-element list holding the text is accepted as well

    Returns
    -------
    pd.DataFrame
        one "Probability" column indexed by class label, sorted from the
        most to the least probable class
    """
    # predict_proba is given a collection of documents, so a bare string is
    # wrapped into a one-element list; lists are passed through unchanged
    samples = [value] if isinstance(value, str) else value
    prob_array = model.predict_proba(samples)
    prob_df = (
        # the single index label means exactly one sample is expected
        pd.DataFrame(prob_array, index=["Probability"], columns=model.classes_)
        .transpose()
        .sort_values(by="Probability", ascending=False)
    )
    return prob_df

In [26]:
# Predict probability
to_pred = "coronary nitroglycerin muscle heart breast oxygen valve artery"
res_df = predict_probability(model, [to_pred])
print(res_df)

                                Probability
 Cardiovascular / Pulmonary        0.725316
 Radiology                         0.153696
 General Medicine                  0.088011
 Obstetrics / Gynecology           0.016509
 Consult - History and Phy.        0.011008
 Surgery                           0.004094
 Neurology                         0.000660
 Gastroenterology                  0.000326
 SOAP / Chart / Progress Notes     0.000313
 Urology                           0.000049
 Orthopedic                        0.000018


In [27]:
def lime_explainer(model: imblearn.pipeline.Pipeline, value: str):
    """
    Get features the model used for top predicted classes

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        fitted pipeline exposing predict_proba and classes_
    value : str
        sample

    Returns
    -------
    dict
        class label -> list of (word, weight) pairs, restricted to the words
        with a positive weight for that class, for the three top classes
    """
    explainer = LimeTextExplainer(class_names=model.classes_)
    # allow one explained feature per whitespace-separated word of the sample
    num_features = len(value.split())
    exp = explainer.explain_instance(
        value, model.predict_proba, num_features=num_features, top_labels=3
    )
    feat_importance_pos = {}
    for label in exp.as_map():
        # as_list resolves LIME's feature ids to words through the explanation's
        # own vocabulary; the ids are not positions in value.split(), so indexing
        # the split text gives wrong words once a word repeats
        weighted_words = exp.as_list(label=label)
        feat_importance_pos[model.classes_[label]] = [
            (word, weight) for word, weight in weighted_words if weight > 0
        ]
    return feat_importance_pos

In [14]:
# features the model used for top predicted classes
feat_importance = lime_explainer(model, to_pred)
print(feat_importance)

{' General Medicine': [('breast', 0.08721571703500162), ('muscle', 0.07988169304133039), ('heart', 0.03135017615158257), ('oxygen', 0.030225364661143464)], ' Radiology': [('artery', 0.14760497121135482), ('nitroglycerin', 0.07510531220124371), ('muscle', 0.043301535530355345)], ' Cardiovascular / Pulmonary': [('oxygen', 0.25242070018155083), ('artery', 0.22901780568996302), ('heart', 0.21310866418142863), ('coronary', 0.19967080647956545), ('valve', 0.13323602638065268)]}


In [16]:
x = ' General Medicine'

# look up the entry stored in feat_importance for a class label
def get_value(x):
    """Return feat_importance[x], or None when x is not a key of feat_importance."""
    if x not in feat_importance:
        return None
    return feat_importance[x]

In [18]:
value = get_value(x)

In [22]:
# only access words in list
words = [v[0] for v in value]
words

['breast', 'muscle', 'heart', 'oxygen']

In [23]:
# get_value and the word extraction folded into one function
def get_words(x):
    """
    Return the first element of every entry stored under feat_importance[x],
    or None when x is not a key of feat_importance.
    """
    if x not in feat_importance:
        return None
    return [pair[0] for pair in feat_importance[x]]
        

In [24]:
get_words(x)

['breast', 'muscle', 'heart', 'oxygen']

In [4]:
np.logspace(-4, 4, 20)

array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04])

In [16]:
# inline LIME run for to_pred that keeps the negative weights as well
explainer = LimeTextExplainer(class_names=model.classes_)
num_features = len(to_pred.split())
exp = explainer.explain_instance(
    to_pred, model.predict_proba, num_features=num_features, top_labels=3
)
# as_list resolves LIME's feature ids to words through the explanation's own
# vocabulary; the ids are not positions in to_pred.split()
feat_importance = {
    model.classes_[label]: exp.as_list(label=label) for label in exp.as_map()
}
feat_importance_pos = {
    label: [(word, weight) for word, weight in pairs if weight > 0]
    for label, pairs in feat_importance.items()
}

In [17]:
feat_importance

{' General Medicine': [('artery', -0.1869880324487356),
  ('breast', 0.0880783857407666),
  ('muscle', 0.07814815334554133),
  ('valve', -0.060311035364575015),
  ('heart', 0.03292414708991069),
  ('oxygen', 0.026857580194068784),
  ('nitroglycerin', -0.021849195996523633),
  ('coronary', -0.006822275744941747)],
 ' Radiology': [('oxygen', -0.19120402466101397),
  ('artery', 0.14181291398386572),
  ('nitroglycerin', 0.07048150048241235),
  ('muscle', 0.04382698549680477),
  ('coronary', -0.023307206099310092),
  ('heart', -0.022521716599489632),
  ('breast', -0.019836767093437305),
  ('valve', -0.008045373440971927)],
 ' Cardiovascular / Pulmonary': [('oxygen', 0.256451435035843),
  ('artery', 0.23452081596977184),
  ('heart', 0.21094362769758024),
  ('coronary', 0.1995309298470315),
  ('breast', -0.19744731236806046),
  ('muscle', -0.13769584310116986),
  ('valve', 0.1374394013956226),
  ('nitroglycerin', -0.065400545820238)]}

In [18]:
predict_probability(model, [to_pred])

Unnamed: 0,Probability
Cardiovascular / Pulmonary,0.725316
Radiology,0.153696
General Medicine,0.088011
Obstetrics / Gynecology,0.016509
Consult - History and Phy.,0.011008
Surgery,0.004094
Neurology,0.00066
Gastroenterology,0.000326
SOAP / Chart / Progress Notes,0.000313
Urology,4.9e-05


In [None]:
# how to explain lime output
# https://stackoverflow.com/questions/53895007/how-to-interpret-lime-output

### New mtsamples NLP data

In [313]:
# load new data 
df_new = pd.read_csv('../data/processed/nlp/mtsamples/mtsamples_unsupervised_both.csv')

In [314]:
# (disabled) drop medical specialties with 10 or fewer samples; the filter on the next line is commented out
#df_new = df_new.groupby('medical_specialty').filter(lambda x: len(x) > 10)
df_new.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,keywords_list,location,keywords_outcome_weights,keywords_outcome_weights_unsupervised,transcription_f_unsupervised,transcription_f_semisupervised
0,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mmode leave atrial enlargement leave atrial di...,"cardiovascular / pulmonary, 2-d m-mode, dopple...","['cardiovascular / pulmonary', ' 2-d m-mode', ...","dict_values([[221, 233], [11, 29], [163, 181],...","[('lv diastolic', 0.7221), ('ventricular eject...","[('pericardial effusion', 0.7809), ('function ...","['pericardial effusion', 'function pericardial...","['[', '(', ""'"", 'l', 'v', ' ', 'd', 'i', 'a', ..."
1,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,leave ventricular cavity size wall thickness a...,"cardiovascular / pulmonary, 2-d, doppler, echo...","['cardiovascular / pulmonary', ' 2-d', ' doppl...","dict_values([[409, 416], [680, 687], [506, 517...","[('consistent hyperdynamic', 0.7339), ('estima...","[('ventricular systolic', 0.8047), ('ventricul...","['ventricular systolic', 'ventricular outflow'...","['[', '(', ""'"", 'c', 'o', 'n', 's', 'i', 's', ..."
2,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,echocardiogrammultiple view heart great vessel...,"cardiovascular / pulmonary, 2-d echocardiogram...","['cardiovascular / pulmonary', ' 2-d echocardi...","dict_values([[101, 117], [206, 213], [453, 465...","[('echocardiogrammultiple', 0.741), ('pulmonar...","[('hypertrophy pericardial', 0.806), ('normal ...","['hypertrophy pericardial', 'normal intracardi...","['[', '(', ""'"", 'e', 'c', 'h', 'o', 'c', 'a', ..."
3,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,description normal cardiac chamber size normal...,"cardiovascular / pulmonary, ejection fraction,...","['cardiovascular / pulmonary', ' ejection frac...","dict_values([[97, 114], [76, 96], [282, 295], ...","[('regurgitationimpression normal', 0.6944), (...","[('tricuspid regurgitationimpression', 0.7798)...","['tricuspid regurgitationimpression', 'pericar...","['[', '(', ""'"", 'r', 'e', 'g', 'u', 'r', 'g', ..."
4,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,study mild aortic stenosis widely calcify mini...,"cardiovascular / pulmonary, 2-d study, doppler...","['cardiovascular / pulmonary', ' 2-d study', '...","dict_values([[334, 357], [17, 25], [71, 82], [...","[('restrict mild', 0.6688), ('ventricle modera...","[('ventricular hypertrophy', 0.8169), ('aortic...","['ventricular hypertrophy', 'aortic regurgitat...","['[', '(', ""'"", 'r', 'e', 's', 't', 'r', 'i', ..."


In [316]:
def keywords_without_weights(df):
    """
    Parse the weighted keywords and derive a weight-free keyword column

    The "keywords_outcome_weights" column is converted from its string form
    into Python objects, and "transcription_f_semisupervised" is (re)written
    with the first element of every entry, i.e. the keyword without its
    weight. The dataframe is modified in place and also returned.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with a "keywords_outcome_weights" column

    Returns
    -------
    pd.DataFrame
        the same dataframe with both columns updated
    """
    import ast  # local import: the notebook only imports ast in a later cell

    def _parse(cell):
        # cells that are no longer strings were parsed already; leaving them
        # alone keeps a re-run of the calling cell from failing
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    df["keywords_outcome_weights"] = df["keywords_outcome_weights"].apply(_parse)
    df["transcription_f_semisupervised"] = df["keywords_outcome_weights"].apply(
        lambda pairs: [pair[0] for pair in pairs]
    )
    return df

df_new = keywords_without_weights(df_new)
df_new.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,keywords_list,location,keywords_outcome_weights,keywords_outcome_weights_unsupervised,transcription_f_unsupervised,transcription_f_semisupervised
0,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mmode leave atrial enlargement leave atrial di...,"cardiovascular / pulmonary, 2-d m-mode, dopple...","['cardiovascular / pulmonary', ' 2-d m-mode', ...","dict_values([[221, 233], [11, 29], [163, 181],...","[(lv diastolic, 0.7221), (ventricular ejection...","[('pericardial effusion', 0.7809), ('function ...","['pericardial effusion', 'function pericardial...","[lv diastolic, ventricular ejection, effusion ..."
1,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,leave ventricular cavity size wall thickness a...,"cardiovascular / pulmonary, 2-d, doppler, echo...","['cardiovascular / pulmonary', ' 2-d', ' doppl...","dict_values([[409, 416], [680, 687], [506, 517...","[(consistent hyperdynamic, 0.7339), (estimate ...","[('ventricular systolic', 0.8047), ('ventricul...","['ventricular systolic', 'ventricular outflow'...","[consistent hyperdynamic, estimate ejection, v..."
2,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,echocardiogrammultiple view heart great vessel...,"cardiovascular / pulmonary, 2-d echocardiogram...","['cardiovascular / pulmonary', ' 2-d echocardi...","dict_values([[101, 117], [206, 213], [453, 465...","[(echocardiogrammultiple, 0.741), (pulmonary o...","[('hypertrophy pericardial', 0.806), ('normal ...","['hypertrophy pericardial', 'normal intracardi...","[echocardiogrammultiple, pulmonary outflow, co..."
3,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,description normal cardiac chamber size normal...,"cardiovascular / pulmonary, ejection fraction,...","['cardiovascular / pulmonary', ' ejection frac...","dict_values([[97, 114], [76, 96], [282, 295], ...","[(regurgitationimpression normal, 0.6944), (go...","[('tricuspid regurgitationimpression', 0.7798)...","['tricuspid regurgitationimpression', 'pericar...","[regurgitationimpression normal, good motion, ..."
4,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,study mild aortic stenosis widely calcify mini...,"cardiovascular / pulmonary, 2-d study, doppler...","['cardiovascular / pulmonary', ' 2-d study', '...","dict_values([[334, 357], [17, 25], [71, 82], [...","[(restrict mild, 0.6688), (ventricle moderate,...","[('ventricular hypertrophy', 0.8169), ('aortic...","['ventricular hypertrophy', 'aortic regurgitat...","[restrict mild, ventricle moderate, moderate b..."


In [317]:
# save as new csv
df_new.to_csv('../data/processed/nlp/mtsamples/mtsamples_unsupervised_both_v2.csv', index=False)

In [332]:
df_new = pd.read_csv('../data/processed/nlp/mtsamples/mtsamples_unsupervised_both_v2.csv')

In [333]:
df_new.shape

(2947, 12)

In [334]:
df_new.transcription_f_unsupervised[0]

"['pericardial effusion', 'function pericardial', 'ventricular ejection', 'atrial enlargement', 'leave ventricular', 'systolic pressure', 'aortic pulmonary', 'mitral tricuspid', 'pulmonary regurgitation', 'pressure mmhgdoppler', 'valve pa', 'ejection fraction', 'leave atrial', 'lv diastolic', 'mmode leave']"

In [335]:
df_new.transcription_f_semisupervised[0]

"['lv diastolic', 'ventricular ejection', 'effusion normal', 'valve pulmonary', 'mild']"

In [336]:
import ast

In [337]:
def replace_tab(x: list) -> list:
    """
    Replace every space with an underscore in each string of x

    Despite the name, spaces (not tab characters) are replaced.

    Parameters
    ----------
    x : list
        list of strings, e.g. multi-word key phrases

    Returns
    -------
    list
        new list with the converted strings, in the same order
    """
    return [phrase.replace(" ", "_") for phrase in x]


In [338]:
def transform_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Parse a stringified list column and normalise its entries

    Every cell of the column is first evaluated with ast.literal_eval and
    then passed through replace_tab. The column is overwritten in place after
    each of the two steps.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe with labels and NLP features
    column_name : str
        name of the column to transform

    Returns
    -------
    pd.DataFrame
        the same dataframe with the transformed column
    """
    for step in (ast.literal_eval, replace_tab):
        df[column_name] = df[column_name].apply(step)
    return df

In [339]:
df_new = transform_column(df_new, 'transcription_f_unsupervised')
df_new.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,keywords_list,location,keywords_outcome_weights,keywords_outcome_weights_unsupervised,transcription_f_unsupervised,transcription_f_semisupervised
0,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,mmode leave atrial enlargement leave atrial di...,"cardiovascular / pulmonary, 2-d m-mode, dopple...","['cardiovascular / pulmonary', ' 2-d m-mode', ...","dict_values([[221, 233], [11, 29], [163, 181],...","[('lv diastolic', 0.7221), ('ventricular eject...","[('pericardial effusion', 0.7809), ('function ...","[pericardial_effusion, function_pericardial, v...","['lv diastolic', 'ventricular ejection', 'effu..."
1,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,leave ventricular cavity size wall thickness a...,"cardiovascular / pulmonary, 2-d, doppler, echo...","['cardiovascular / pulmonary', ' 2-d', ' doppl...","dict_values([[409, 416], [680, 687], [506, 517...","[('consistent hyperdynamic', 0.7339), ('estima...","[('ventricular systolic', 0.8047), ('ventricul...","[ventricular_systolic, ventricular_outflow, le...","['consistent hyperdynamic', 'estimate ejection..."
2,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,echocardiogrammultiple view heart great vessel...,"cardiovascular / pulmonary, 2-d echocardiogram...","['cardiovascular / pulmonary', ' 2-d echocardi...","dict_values([[101, 117], [206, 213], [453, 465...","[('echocardiogrammultiple', 0.741), ('pulmonar...","[('hypertrophy pericardial', 0.806), ('normal ...","[hypertrophy_pericardial, normal_intracardiac,...","['echocardiogrammultiple', 'pulmonary outflow'..."
3,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,description normal cardiac chamber size normal...,"cardiovascular / pulmonary, ejection fraction,...","['cardiovascular / pulmonary', ' ejection frac...","dict_values([[97, 114], [76, 96], [282, 295], ...","[('regurgitationimpression normal', 0.6944), (...","[('tricuspid regurgitationimpression', 0.7798)...","[tricuspid_regurgitationimpression, pericardia...","['regurgitationimpression normal', 'good motio..."
4,11,"Normal left ventricle, moderate biatrial enla...",Cardiovascular / Pulmonary,2-D Doppler,study mild aortic stenosis widely calcify mini...,"cardiovascular / pulmonary, 2-d study, doppler...","['cardiovascular / pulmonary', ' 2-d study', '...","dict_values([[334, 357], [17, 25], [71, 82], [...","[('restrict mild', 0.6688), ('ventricle modera...","[('ventricular hypertrophy', 0.8169), ('aortic...","[ventricular_hypertrophy, aortic_regurgitation...","['restrict mild', 'ventricle moderate', 'moder..."


In [357]:
keywords = df_new.transcription_f_unsupervised.to_list()
keywords = [" ".join(i) for i in keywords]
keywords[0]

'pericardial_effusion function_pericardial ventricular_ejection atrial_enlargement leave_ventricular systolic_pressure aortic_pulmonary mitral_tricuspid pulmonary_regurgitation pressure_mmhgdoppler valve_pa ejection_fraction leave_atrial lv_diastolic mmode_leave'

In [326]:
def func(x): 
    """Identity function: return x unchanged."""
    return x

In [356]:
cv = CountVectorizer()
transcription = cv.fit_transform(keywords)
transcription

<2947x19938 sparse matrix of type '<class 'numpy.int64'>'
	with 44007 stored elements in Compressed Sparse Row format>

In [328]:
type(func)

function

In [329]:
# apply smote to new data
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(transcription, df_new.medical_specialty)

In [330]:
df_new.medical_specialty.shape

(2947,)

In [331]:
transcription.shape

(2947, 19938)