In [22]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk
import imblearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from imblearn.over_sampling import SMOTE

In [23]:
# load data
df_test = pd.read_csv('../data/processed/mtsamples_nlp.csv')
df_test.transcription=df_test.transcription.astype(str)
df_test.head()

Unnamed: 0,transcription,medical_specialty,transcription_c,transcription_f
0,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"2d,mmode,1,left,atrial,enlargement,left,atrial...","{'pulmonary', 'ventricular', 'left', 'valve', ..."
1,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"1,left,ventricular,cavity,size,wall,thickness,...","{'pulmonary', 'lipomatous', 'leaflet', 'ventri..."
2,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"2d,echocardiogrammultiple,view,heart,great,ves...","{'pulmonary', 'arch', 'coronary', 'inflow', 'a..."
3,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"description1,normal,cardiac,chamber,size2,norm...","{'ventricular', 'left', 'valve', 'cardiac', 'm..."
4,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"2d,study1,mild,aortic,stenosis,widely,calcifie...","{'ventricular', 'left', 'heart', 'ventricle', ..."


In [24]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)

df_test_X = df_test['transcription_f'].astype(str)


In [25]:
# split data into train and test set (first split data into train and test set to only transform the train set)
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)



In [15]:
# vectorize df_test_X for correlation matrix between features
vectorizer = CountVectorizer()
df_test_X_vec = vectorizer.fit_transform(df_test_X)
df_test_X_vec = df_test_X_vec.toarray()
print(df_test_X_vec.shape)

df_test_X_vec = pd.DataFrame(df_test_X_vec, columns=vectorizer.get_feature_names_out ())
df_test_X_vec.corr()

(2976, 2702)


In [10]:
corr_unstack = df_test_X_vec.corr().unstack()
corr_unstack = corr_unstack[corr_unstack != 1.0]
corr_unstack = corr_unstack.sort_values().drop_duplicates()
print("Strong negative correlations:")
print(corr_unstack.nsmallest(10))
print()
print("Strong positive correlations:")
print(corr_unstack.nlargest(10))

Strong negative correlations:
patient         set        -0.294474
intervertebral  patient    -0.165295
heart           tissue     -0.136525
wound           heart      -0.127911
patient         pancreas   -0.122850
set             left       -0.119657
adrenal         patient    -0.117364
patient         hilar      -0.113338
joint           tube       -0.109961
coronary        tissue     -0.109453
dtype: float64

Strong positive correlations:
elevated      290                   0.953302
mmhg          290                   0.912563
homograft     hydroxychloroquine    0.894367
carboplatin   taxol                 0.894277
mmhg          elevated              0.869889
phosphorus    126000                0.865880
qam           phosphorus            0.865880
bicuspid      qam                   0.865880
hypodense     intramural            0.865880
semicircular  intermuscular         0.865880
dtype: float64


In [26]:
model_pipeline = imbPipeline([
        ('preprocessing',CountVectorizer()),
        #('svd', TruncatedSVD(n_components=100)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(random_state=42, multi_class='multinomial')), # remainder="passthrough"
])

In [27]:
# poor preformance without fine tuning
lr = model_pipeline.fit(X_train, y_train)
y_pred = lr.predict(X_test)
category_list = df_test.medical_specialty.unique()
print(classification_report(y_test, y_pred, target_names=category_list))


                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.23      0.25      0.24        52
                       Urology       0.16      0.15      0.15        40
              General Medicine       0.25      0.33      0.29        42
                       Surgery       0.05      0.08      0.06        25
 SOAP / Chart / Progress Notes       0.04      0.04      0.04        23
                     Radiology       0.23      0.33      0.27        30
                    Orthopedic       0.12      0.16      0.14        57
       Obstetrics / Gynecology       0.07      0.06      0.06        51
                     Neurology       0.15      0.14      0.14        35
              Gastroenterology       0.35      0.26      0.30       212
    Consult - History and Phy.       0.21      0.24      0.23        29

                      accuracy                           0.21       596
                     macro avg       0.17      0.19      0.18 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# take best model from grid search and perform evaluation

param_grid = [
    { 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
      'classifier': [LogisticRegression(multi_class='multinomial', random_state=42)],
      'classifier__solver': ['saga', 'lbfgs', 'liblinear'],
      'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    }
]

def grid_search(X_train, y_train, model_pipeline, param_grid):
    search = GridSearchCV(model_pipeline, param_grid, cv=5)
    search.fit(X_train, y_train)
    print("Best parameter", search.best_params_)
    return search.best_estimator_
    
best_model = grid_search(X_train, y_train, model_pipeline, param_grid)
y_pred = best_model.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=category_list))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyboardInterrupt: 

In [38]:
# Prediction for sample from test set
sample = X_test.iloc[10]
print("Prediction:", lr.predict([sample]))
# Actual category of first sample from test set
print("Actual:", y_test[10])

Prediction: [' General Medicine']
Actual:  SOAP / Chart / Progress Notes


In [54]:
sample = X_test.iloc[13]
sample

"{'lip', 'abdomen', 'needle', 'endometrial', 'wall', 'anterior', 'cervix', 'tissue', 'fallopian', 'decidual', 'patient', 'tube', 'unclotted', 'liver', 'umbilicus', 'mesosalpinx', 'saline', 'vulsellum', 'vagina', 'left', 'midline', 'curettings', 'clot', 'fundus', 'peritoneum', 'omental', 'vesicouterine', 'uterus', 'uterine', 'blood'}"

In [28]:
lr.classes_

array([' Cardiovascular / Pulmonary', ' Consult - History and Phy.',
       ' Gastroenterology', ' General Medicine', ' Neurology',
       ' Obstetrics / Gynecology', ' Orthopedic', ' Radiology',
       ' SOAP / Chart / Progress Notes', ' Surgery', ' Urology'],
      dtype='<U30')

In [43]:
# Prediction for sample from test set
sample = X_test.iloc[30]
print("Prediction:", lr.predict([sample]))
# Predict probabilities for a sample from test set
prob_array = lr.predict_proba(X_test)[30,:]
prob_df = pd.DataFrame(prob_array, index=lr.classes_, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df.Probability = prob_df.Probability.round(3)
print(prob_df)

Prediction: [' Radiology']
                                Probability
 Radiology                            0.611
 Cardiovascular / Pulmonary           0.338
 Consult - History and Phy.           0.035
 General Medicine                     0.008
 SOAP / Chart / Progress Notes        0.004
 Gastroenterology                     0.002
 Obstetrics / Gynecology              0.001
 Orthopedic                           0.001
 Surgery                              0.000
 Urology                              0.000
 Neurology                            0.000


In [50]:
to_pred = "heart racing chest pain"
def predict_probability(model: imblearn.pipeline.Pipeline, value) -> pd.DataFrame:
    """
    get probabilities for sample

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample
    category_list: list[str]
        list of unique labels

    Returns
    -------
    pd.DataFrame
        Probabilities for labels
    """

    prob_array = model.predict_proba(value)
    prob_df = pd.DataFrame(
        prob_array, index=["Probability"], columns=model.classes_
    ).transpose().sort_values(by="Probability", ascending=False)
    return prob_df

res_df = predict_probability(lr, [to_pred])
res_df

Unnamed: 0,Probability
General Medicine,0.27283
Cardiovascular / Pulmonary,0.241453
Radiology,0.1939
Consult - History and Phy.,0.144196
SOAP / Chart / Progress Notes,0.03622
Obstetrics / Gynecology,0.035474
Gastroenterology,0.026937
Orthopedic,0.020467
Neurology,0.017649
Urology,0.007424


In [16]:
# just for exploration of features, not needed for further code
X_train_df = X_train.to_frame()
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(X_train_df["transcription_f"])
feat_df = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
feat = list(feat_df.columns)
print(len(feat))
feat

2574


['0007',
 '005',
 '01',
 '0125',
 '020',
 '025',
 '03',
 '0395',
 '05',
 '050',
 '075',
 '092assessment1',
 '10',
 '100',
 '1001',
 '1007',
 '100complications',
 '102',
 '103',
 '1032',
 '104',
 '107',
 '108',
 '10872',
 '109',
 '10drains',
 '11',
 '11000',
 '1100000',
 '11070',
 '12',
 '120',
 '1200000',
 '12161',
 '122',
 '126000',
 '129',
 '12959',
 '12yearold',
 '13',
 '131',
 '13172',
 '136',
 '137',
 '13878',
 '13975',
 '13gauge',
 '14',
 '14078',
 '14080',
 '143',
 '14383',
 '15',
 '150',
 '1500',
 '155',
 '158',
 '16',
 '16080',
 '168',
 '16870',
 '17',
 '17291',
 '18',
 '180110',
 '1937',
 '19373',
 '1cc',
 '1well',
 '20',
 '200',
 '2004',
 '2008',
 '2020',
 '2030',
 '204',
 '20meql',
 '21',
 '22',
 '223',
 '22591',
 '23',
 '24',
 '24hour',
 '25',
 '250',
 '25mg',
 '26',
 '269000',
 '27',
 '290',
 '297',
 '298',
 '2hypercholesterolemia',
 '30',
 '300',
 '313',
 '31493',
 '32',
 '32095',
 '32593',
 '32mm',
 '32yearold',
 '3431anesthesia',
 '35',
 '35000',
 '355c',
 '358fmsa',
 

## Test lime

In [18]:
import lime
import lime.lime_tabular
print("lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")

explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=vectorizer.get_feature_names_out(), class_names=category_list)
# num features is the number of features to be shown
# top lables is the number of labels with the highest probability to be shown
exp = explainer.explain_instance(X_test[4], lr.predict_proba, num_features=5, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=False)


lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


IndexError: tuple index out of range

## Test Shap 

In [21]:
import shap
from pydoc import classname
print("lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")
explainer = shap.LinearExplainer(best_model, X_train, feature_dependence="interventional")
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, class_names= category_list, feature_names=vectorizer.get_feature_names_out())

lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


The option feature_dependence has been renamed to feature_perturbation!
The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


InvalidModelError: An unknown model type was passed: <class 'imblearn.pipeline.Pipeline'>