In [88]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk
import imblearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from imblearn.over_sampling import SMOTE

In [89]:
# load data
df_test = pd.read_csv('../data/processed/mtsamples_nlp.csv')
df_test.transcription=df_test.transcription.astype(str)
df_test.tail()
df_test.transcription_f[2973]

"{'muscle', 'coronary', 'fat', 'nitroglycerin', 'teeth', 'andor', 'blood', 'heart', 'breast', 'oxygen', 'valve', 'ear', 'thyroid', 'men', 'bone', 'tablet', 'salt', 'artery'}"

In [90]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)

df_test_X = df_test['transcription_f'].astype(str)

In [91]:
# split data into train and test set (first split data into train and test set to only transform the train set)
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)

In [92]:
# vectorize df_test_X for correlation matrix between features
vectorizer = CountVectorizer()
df_test_X_vec = vectorizer.fit_transform(df_test_X)
df_test_X_vec = df_test_X_vec.toarray()
print(df_test_X_vec.shape)

df_test_X_vec = pd.DataFrame(df_test_X_vec, columns=vectorizer.get_feature_names_out ())
#df_test_X_vec.corr()

(2976, 2702)


In [93]:
# show highly correlated features (what is the best threshold for correlation?)
corr_unstack = df_test_X_vec.corr().unstack()
corr_unstack = corr_unstack[corr_unstack != 1.0]
corr_unstack = corr_unstack.sort_values().drop_duplicates()
print("Strong negative correlations:")
print(corr_unstack.nsmallest(10))
print("Strong positive correlations:")
print(corr_unstack.nlargest(10))

Strong negative correlations:
patient         set        -0.294474
intervertebral  patient    -0.165295
heart           tissue     -0.136525
wound           heart      -0.127911
patient         pancreas   -0.122850
set             left       -0.119657
adrenal         patient    -0.117364
patient         hilar      -0.113338
joint           tube       -0.109961
coronary        tissue     -0.109453
dtype: float64
Strong positive correlations:
elevated      290                   0.953302
mmhg          290                   0.912563
homograft     hydroxychloroquine    0.894367
carboplatin   taxol                 0.894277
mmhg          elevated              0.869889
phosphorus    126000                0.865880
qam           phosphorus            0.865880
bicuspid      qam                   0.865880
hypodense     intramural            0.865880
semicircular  intermuscular         0.865880
dtype: float64


In [81]:
# The “saga” solver is a variant of “sag” that also supports the non-smooth penalty="l1" 
# This is therefore the solver of choice for sparse multinomial logistic regression
model_pipeline = imbPipeline([
        ('preprocessing',CountVectorizer()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(random_state=42, multi_class='multinomial', solver='saga')),
])

In [82]:
# poor performance without fine tuning
lr = model_pipeline.fit(X_train, y_train)
y_pred = lr.predict(X_test)
category_list = df_test.medical_specialty.unique()
print(classification_report(y_test, y_pred, target_names=lr.classes_))

                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.25      0.27      0.26        52
    Consult - History and Phy.       0.16      0.15      0.15        40
              Gastroenterology       0.25      0.31      0.27        42
              General Medicine       0.05      0.08      0.06        25
                     Neurology       0.04      0.04      0.04        23
       Obstetrics / Gynecology       0.24      0.33      0.28        30
                    Orthopedic       0.11      0.14      0.12        57
                     Radiology       0.07      0.06      0.06        51
 SOAP / Chart / Progress Notes       0.15      0.14      0.14        35
                       Surgery       0.35      0.26      0.30       212
                       Urology       0.20      0.24      0.22        29

                      accuracy                           0.21       596
                     macro avg       0.17      0.18      0.17 

The max_iter was reached which means the coef_ did not converge


In [30]:
# take best model from grid search and perform evaluation

param_grid = [
    { 'classifier__C': [0.01, 0.1, 1, 10],
      'classifier': [LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')],
      'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    }
]

def grid_search(X_train, y_train, model_pipeline, param_grid):
    search = GridSearchCV(model_pipeline, param_grid, cv=5)
    search.fit(X_train, y_train)
    print("Best parameter", search.best_params_)
    return search.best_estimator_
    
best_model = grid_search(X_train, y_train, model_pipeline, param_grid)
y_pred = best_model.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=best_model.classes_))

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/hannahpetry/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/hannahpetry/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/imblearn/pipeline.py", line 272, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/Users/hannahpetry/opt/anaconda3/envs/nlp_masterthesis/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1101, in fit
    raise ValueError(
ValueError: l

Best parameter {'classifier': LogisticRegression(C=0.01, multi_class='multinomial', random_state=42,
                   solver='saga'), 'classifier__C': 0.01, 'classifier__penalty': 'l2'}
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.44      0.60      0.50        52
    Consult - History and Phy.       0.36      0.30      0.33        40
              Gastroenterology       0.46      0.55      0.50        42
              General Medicine       0.17      0.20      0.19        25
                     Neurology       0.21      0.30      0.25        23
       Obstetrics / Gynecology       0.37      0.57      0.45        30
                    Orthopedic       0.35      0.51      0.41        57
                     Radiology       0.15      0.10      0.12        51
 SOAP / Chart / Progress Notes       0.18      0.23      0.20        35
                       Surgery       0.58      0.37      0.45       212
                   



In [31]:
# sample from test set
sample = X_test.iloc[13]
sample

"{'lip', 'abdomen', 'needle', 'endometrial', 'wall', 'anterior', 'cervix', 'tissue', 'fallopian', 'decidual', 'patient', 'tube', 'unclotted', 'liver', 'umbilicus', 'mesosalpinx', 'saline', 'vulsellum', 'vagina', 'left', 'midline', 'curettings', 'clot', 'fundus', 'peritoneum', 'omental', 'vesicouterine', 'uterus', 'uterine', 'blood'}"

In [51]:
# Prediction for sample from test set
sample = X_test.iloc[13]
print("Prediction:", best_model.predict([sample]))
# Actual category of first sample from test set
print("Actual:", y_test[13])
# Predict probabilities for a sample from test set
prob_array = best_model.predict_proba(X_test)[13,:]
prob_df = pd.DataFrame(prob_array, index=best_model.classes_, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df.Probability = prob_df.Probability.round(3)
print(prob_df)

Prediction: [' Obstetrics / Gynecology']
Actual:  Obstetrics / Gynecology
                                Probability
 Obstetrics / Gynecology              0.605
 Surgery                              0.283
 Cardiovascular / Pulmonary           0.032
 Gastroenterology                     0.020
 Radiology                            0.017
 Orthopedic                           0.011
 Neurology                            0.008
 Consult - History and Phy.           0.008
 SOAP / Chart / Progress Notes        0.006
 Urology                              0.005
 General Medicine                     0.004


In [34]:
to_pred = "lip abdomen needle endometrial wall anterior cervix"
def predict_probability(model: imblearn.pipeline.Pipeline, value) -> pd.DataFrame:
    """
    get probabilities for sample

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample
    category_list: list[str]
        list of unique labels

    Returns
    -------
    pd.DataFrame
        Probabilities for labels
    """

    prob_array = model.predict_proba(value)
    prob_df = pd.DataFrame(
        prob_array, index=["Probability"], columns=model.classes_
    ).transpose().sort_values(by="Probability", ascending=False)
    return prob_df

res_df = predict_probability(best_model, [to_pred])
res_df

Unnamed: 0,Probability
Obstetrics / Gynecology,0.29932
Radiology,0.148991
Gastroenterology,0.123282
Cardiovascular / Pulmonary,0.070136
Surgery,0.069127
General Medicine,0.062495
Consult - History and Phy.,0.061271
Neurology,0.050716
Urology,0.03904
SOAP / Chart / Progress Notes,0.038013


In [52]:
# just for exploration of features, not needed for further code
X_train_df = X_train.to_frame()
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(X_train_df["transcription_f"])
feat_df = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
feat = list(feat_df.columns)
print(len(feat))
feat

2574


['0007',
 '005',
 '01',
 '0125',
 '020',
 '025',
 '03',
 '0395',
 '05',
 '050',
 '075',
 '092assessment1',
 '10',
 '100',
 '1001',
 '1007',
 '100complications',
 '102',
 '103',
 '1032',
 '104',
 '107',
 '108',
 '10872',
 '109',
 '10drains',
 '11',
 '11000',
 '1100000',
 '11070',
 '12',
 '120',
 '1200000',
 '12161',
 '122',
 '126000',
 '129',
 '12959',
 '12yearold',
 '13',
 '131',
 '13172',
 '136',
 '137',
 '13878',
 '13975',
 '13gauge',
 '14',
 '14078',
 '14080',
 '143',
 '14383',
 '15',
 '150',
 '1500',
 '155',
 '158',
 '16',
 '16080',
 '168',
 '16870',
 '17',
 '17291',
 '18',
 '180110',
 '1937',
 '19373',
 '1cc',
 '1well',
 '20',
 '200',
 '2004',
 '2008',
 '2020',
 '2030',
 '204',
 '20meql',
 '21',
 '22',
 '223',
 '22591',
 '23',
 '24',
 '24hour',
 '25',
 '250',
 '25mg',
 '26',
 '269000',
 '27',
 '290',
 '297',
 '298',
 '2hypercholesterolemia',
 '30',
 '300',
 '313',
 '31493',
 '32',
 '32095',
 '32593',
 '32mm',
 '32yearold',
 '3431anesthesia',
 '35',
 '35000',
 '355c',
 '358fmsa',
 

## Test lime

In [53]:
import lime
import lime.lime_tabular
print("lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")

explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=vectorizer.get_feature_names_out(), class_names=best_model.classes_)
# num features is the number of features to be shown
# top labels is the number of labels with the highest probability to be shown
exp = explainer.explain_instance(X_test[4], best_model.predict_proba, num_features=5, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=False)


lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


IndexError: tuple index out of range

## Test Shap 

In [84]:
import shap
from pydoc import classname
X_train = model_pipeline.named_steps['preprocessing'].fit_transform(X_train)
X_train = model_pipeline.named_steps['smote'].fit_transform(X_train)

AttributeError: lower not found

In [87]:
explainer = shap.KernelExplainer(model_pipeline.named_steps['classifier'].predict_proba, X_train)

Using 2380 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [75]:
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar", class_names= lr.classes_, feature_names=vectorizer.get_feature_names_out())

TypeError: 'NoneType' object is not subscriptable

In [60]:
import shap
from pydoc import classname
print("lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")
explainer = shap.Explainer(lr.predict, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, class_names= lr.classes_, feature_names=vectorizer.get_feature_names_out())

lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


AttributeError: module 'warnings' has no attribute 'DeprecationWarning'