In [1]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk
import imblearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from imblearn.over_sampling import SMOTE

In [2]:
# load data
df_test = pd.read_csv('../data/processed/mtsamples_nlp.csv')
df_test.transcription=df_test.transcription.astype(str)
df_test.tail()
df_test.transcription_f[2973]

"{'muscle', 'coronary', 'fat', 'nitroglycerin', 'teeth', 'andor', 'blood', 'heart', 'breast', 'oxygen', 'valve', 'ear', 'thyroid', 'men', 'bone', 'tablet', 'salt', 'artery'}"

In [3]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)

df_test_X = df_test['transcription_f'].astype(str)
df_test_X.head()

0    {'pulmonary', 'ventricular', 'left', 'valve', ...
1    {'pulmonary', 'lipomatous', 'leaflet', 'ventri...
2    {'pulmonary', 'arch', 'coronary', 'inflow', 'a...
3    {'ventricular', 'left', 'valve', 'cardiac', 'm...
4    {'ventricular', 'left', 'heart', 'ventricle', ...
Name: transcription_f, dtype: object

In [4]:
# split data into train and test set (first split data into train and test set to only transform the train set)
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)

### Feature Exploration

In [5]:
# transform df_test_label to dataframe
df_label = pd.DataFrame(df_test_label)
df_label.columns = ['label']
df_label

Unnamed: 0,label
0,Cardiovascular / Pulmonary
1,Cardiovascular / Pulmonary
2,Cardiovascular / Pulmonary
3,Cardiovascular / Pulmonary
4,Cardiovascular / Pulmonary
...,...
2971,Cardiovascular / Pulmonary
2972,Cardiovascular / Pulmonary
2973,Cardiovascular / Pulmonary
2974,Cardiovascular / Pulmonary


In [6]:
# vectorize df_test_X to dataframe
vectorizer = CountVectorizer()
df_test_X_vec = vectorizer.fit_transform(df_test_X)
df_test_X_vec = df_test_X_vec.toarray()
print(df_test_X_vec.shape)

df_test_X_vec = pd.DataFrame(df_test_X_vec, columns=vectorizer.get_feature_names_out ())
df_test_X_vec

(2976, 2702)


Unnamed: 0,0007,005,01,0125,020,025,03,0395,05,050,...,zithromax,zocor,zofran,zoladex,zoloft,zometa,zone,zygoma,zygomatic,zyprexa
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2971,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2972,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2974,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [241]:
# join df_label and df_test_X_vec on index
df = df_test_X_vec.join(df_label)
# group by label and show which columns appear the most in each label
df.groupby('label').sum()

Unnamed: 0_level_0,0007,005,01,0125,020,025,03,0395,05,050,...,zithromax,zocor,zofran,zoladex,zoloft,zometa,zone,zygoma,zygomatic,zyprexa
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cardiovascular / Pulmonary,0,0,1,0,0,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,0
Consult - History and Phy.,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,1,0,0,1
Gastroenterology,0,0,0,0,0,2,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
General Medicine,0,0,0,1,0,0,1,0,0,0,...,1,0,1,0,1,0,1,0,0,0
Neurology,1,1,0,0,0,0,0,1,1,0,...,1,1,0,0,0,0,0,0,0,1
Obstetrics / Gynecology,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
Orthopedic,1,0,0,0,0,5,0,0,16,1,...,1,0,0,0,0,0,0,0,0,0
Radiology,1,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,1,0
SOAP / Chart / Progress Notes,0,0,2,0,0,0,1,1,2,0,...,0,1,0,0,2,1,0,0,0,0
Surgery,0,3,0,0,1,21,0,0,30,1,...,1,1,0,1,0,0,0,6,1,0


In [242]:
# show top symptoms with word count
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

top_words = get_top_n_words(df_test_X, n=5)
print("Symptoms occuring the most:", top_words)

# show top symptoms per label with word count (on whole dataset, not yet split into train and test set and smoothed out)
top_words_label = df.groupby('label').sum().apply(lambda x: x.sort_values(ascending=False).head(10), axis=1)

top_Cardiovascular = top_words_label.iloc[0].loc[top_words_label.iloc[0]>0].sort_values(ascending=False)
print("Top Words in Cardiovascular/Pulmonary:\n", top_Cardiovascular)

top_Consult = top_words_label.iloc[1].loc[top_words_label.iloc[1]>0].sort_values(ascending=False)
print("Top Words in Consult - History and Phy.	:\n", top_Consult)

top_Gastroenterology = top_words_label.iloc[2].loc[top_words_label.iloc[2]>0].sort_values(ascending=False)
print("Top Words in Gastroenterology:\n", top_Gastroenterology)

top_General_Medicine = top_words_label.iloc[3].loc[top_words_label.iloc[3]>0].sort_values(ascending=False)
print("Top Words in General Medicine:\n", top_General_Medicine)

top_Neurology = top_words_label.iloc[4].loc[top_words_label.iloc[4]>0].sort_values(ascending=False)
print("Top Words in Neurology:\n", top_Neurology)

top_Obstetrics_Gynecology = top_words_label.iloc[5].loc[top_words_label.iloc[5]>0].sort_values(ascending=False)
print("Top Words in Obstetrics/Gynecology:\n", top_Obstetrics_Gynecology)

top_Orthopedic = top_words_label.iloc[6].loc[top_words_label.iloc[6]>0].sort_values(ascending=False)
print("Top Words in Orthopedic:\n", top_Orthopedic)

top_Radiology = top_words_label.iloc[7].loc[top_words_label.iloc[7]>0].sort_values(ascending=False)
print("Top Words in Radiology:\n", top_Radiology)

top_SOAP = top_words_label.iloc[8].loc[top_words_label.iloc[8]>0].sort_values(ascending=False)
print("Top Words in SOAP:\n", top_SOAP)

top_Surgery = top_words_label.iloc[9].loc[top_words_label.iloc[9]>0].sort_values(ascending=False)
print("Top Words in Surgery:\n", top_Surgery)

top_Urology = top_words_label.iloc[10].loc[top_words_label.iloc[10]>0].sort_values(ascending=False)
print("Top Words in Urology:\n", top_Urology)


Symptoms occuring the most: [('patient', 2435), ('left', 1338), ('blood', 1127), ('skin', 1042), ('tissue', 742)]
Top Words in Cardiovascular/Pulmonary:
 patient        218.0
left           169.0
artery         106.0
heart          106.0
blood          105.0
ventricular     81.0
wall            74.0
coronary        72.0
skin            66.0
lung            65.0
Name:  Cardiovascular / Pulmonary, dtype: float64
Top Words in Consult - History and Phy.	:
 patient    167.0
blood       89.0
left        71.0
abdomen     59.0
bowel       56.0
heart       56.0
lung        53.0
skin        47.0
eye         46.0
edema       44.0
Name:  Consult - History and Phy., dtype: float64
Top Words in Gastroenterology:
 patient      162.0
abdominal     83.0
abdomen       79.0
left          60.0
bowel         59.0
colon         59.0
blood         57.0
esophagus     49.0
skin          49.0
stomach       48.0
Name:  Gastroenterology, dtype: float64
Top Words in General Medicine:
 patient    97.0
blood      69

### Feature Selection 

In [7]:
# show highly correlated features (what is the best threshold for correlation?) (stimmt das wirklich?)
corr_unstack = df_test_X_vec.corr(method = "spearman").abs().unstack()
corr_unstack = corr_unstack[corr_unstack != 1.0]
corr_unstack = corr_unstack.sort_values().drop_duplicates()
# how many features are highly correlated?
print(len(corr_unstack[corr_unstack > 0.85]))

print("Strong correlations:")
print(corr_unstack.nlargest(10))

6
Strong correlations:
290       elevated       0.953302
mmhg      290            0.912563
taxol     carboplatin    0.894277
mmhg      elevated       0.869889
bicuspid  129            0.865880
gingiva   salivary       0.865734
brachium  semiovale      0.844870
node      lymph          0.837753
mm        use            0.816359
fraction  602            0.816222
dtype: float64


### Step by Step Model

In [41]:
# vectorize X_train 
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

In [42]:
# smote oversampling
sm = SMOTE(random_state=42)
X_train_vec, y_train = sm.fit_resample(X_train_vec, y_train)
X_train_vec

<8921x2574 sparse matrix of type '<class 'numpy.int64'>'
	with 163353 stored elements in Compressed Sparse Row format>

In [None]:
class Decorrelator(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out ())  
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper.columns if any(upper[column] > self.threshold)]
        
        return self

    def transform(self, X, y=None, **kwargs):
        return (pd.DataFrame(X)).drop(labels=self.correlated_features, axis=1)

In [43]:
# remove highly correlated features
def remove_highly_correlated_features(X, threshold):
    X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out ())
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    X = X.drop(X[to_drop], axis=1)
    return X


remove_highly_correlated_features(X_train_vec, 0.85)


Unnamed: 0,0007,005,01,0125,020,025,03,0395,05,050,...,year,yellowish,yolk,zithromax,zocor,zofran,zoloft,zone,zygoma,zygomatic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model Pipeline

In [5]:
# The “saga” solver is a variant of “sag” that also supports the non-smooth penalty="l1" 
# This is therefore the solver of choice for sparse multinomial logistic regression
model_pipeline = imbPipeline([
        ('preprocessing',CountVectorizer()),
        ('smote', SMOTE(random_state=42)),
        #('decorrelation', remove_highly_correlated_features(0.85)),
        ('classifier', LogisticRegression(random_state=42, multi_class='multinomial', solver='saga')),
])

In [6]:
# poor performance without fine tuning
lr = model_pipeline.fit(X_train, y_train)
y_pred = lr.predict(X_test)
category_list = df_test.medical_specialty.unique()
print(classification_report(y_test, y_pred, target_names=lr.classes_))

                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.25      0.27      0.26        52
    Consult - History and Phy.       0.16      0.15      0.15        40
              Gastroenterology       0.25      0.31      0.27        42
              General Medicine       0.05      0.08      0.06        25
                     Neurology       0.04      0.04      0.04        23
       Obstetrics / Gynecology       0.24      0.33      0.28        30
                    Orthopedic       0.11      0.14      0.12        57
                     Radiology       0.07      0.06      0.06        51
 SOAP / Chart / Progress Notes       0.15      0.14      0.14        35
                       Surgery       0.35      0.26      0.30       212
                       Urology       0.20      0.24      0.22        29

                      accuracy                           0.21       596
                     macro avg       0.17      0.18      0.17 



In [54]:
# take best model from grid search and perform evaluation

param_grid = [
    { 'classifier__C': [0.01, 0.1, 1, 10],
      'classifier': [LogisticRegression(multi_class='multinomial', random_state=42, solver='saga')],
      'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    }
]

def grid_search(X_train, y_train, model_pipeline, param_grid):
    search = GridSearchCV(model_pipeline, param_grid, cv=5)
    search.fit(X_train, y_train)
    print("Best parameter", search.best_params_)
    return search.best_estimator_
    
best_model = grid_search(X_train, y_train, model_pipeline, param_grid)
y_pred = best_model.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=best_model.classes_))



KeyboardInterrupt: 

In [31]:
# sample from test set
sample = X_test.iloc[13]
sample

"{'lip', 'abdomen', 'needle', 'endometrial', 'wall', 'anterior', 'cervix', 'tissue', 'fallopian', 'decidual', 'patient', 'tube', 'unclotted', 'liver', 'umbilicus', 'mesosalpinx', 'saline', 'vulsellum', 'vagina', 'left', 'midline', 'curettings', 'clot', 'fundus', 'peritoneum', 'omental', 'vesicouterine', 'uterus', 'uterine', 'blood'}"

In [8]:
# Prediction for sample from test set
sample = X_test.iloc[13]
print("Prediction:", lr.predict([sample]))
# Actual category of first sample from test set
print("Actual:", y_test[13])
# Predict probabilities for a sample from test set
prob_array = lr.predict_proba(X_test)[13,:]
prob_df = pd.DataFrame(prob_array, index=lr.classes_, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df.Probability = prob_df.Probability.round(3)
print(prob_df)

Prediction: [' Surgery']
Actual:  Obstetrics / Gynecology
                                Probability
 Surgery                              0.686
 Obstetrics / Gynecology              0.314
 Radiology                            0.000
 Gastroenterology                     0.000
 Cardiovascular / Pulmonary           0.000
 Orthopedic                           0.000
 Urology                              0.000
 Consult - History and Phy.           0.000
 Neurology                            0.000
 SOAP / Chart / Progress Notes        0.000
 General Medicine                     0.000


In [34]:
to_pred = "lip abdomen needle endometrial wall anterior cervix"
def predict_probability(model: imblearn.pipeline.Pipeline, value) -> pd.DataFrame:
    """
    get probabilities for sample

    Parameters
    ----------
    model : imblearn.pipeline.Pipeline
        best model from train.py
    value : str
        sample
    category_list: list[str]
        list of unique labels

    Returns
    -------
    pd.DataFrame
        Probabilities for labels
    """

    prob_array = model.predict_proba(value)
    prob_df = pd.DataFrame(
        prob_array, index=["Probability"], columns=model.classes_
    ).transpose().sort_values(by="Probability", ascending=False)
    return prob_df

res_df = predict_probability(best_model, [to_pred])
res_df

Unnamed: 0,Probability
Obstetrics / Gynecology,0.29932
Radiology,0.148991
Gastroenterology,0.123282
Cardiovascular / Pulmonary,0.070136
Surgery,0.069127
General Medicine,0.062495
Consult - History and Phy.,0.061271
Neurology,0.050716
Urology,0.03904
SOAP / Chart / Progress Notes,0.038013


In [10]:
# just for exploration of features, not needed for further code
X_train_df = X_train.to_frame()
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(X_train_df["transcription_f"])
feat_df = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
feat = list(feat_df.columns)
print(len(feat))
feat

2574


['0007',
 '005',
 '01',
 '0125',
 '020',
 '025',
 '03',
 '0395',
 '05',
 '050',
 '075',
 '092assessment1',
 '10',
 '100',
 '1001',
 '1007',
 '100complications',
 '102',
 '103',
 '1032',
 '104',
 '107',
 '108',
 '10872',
 '109',
 '10drains',
 '11',
 '11000',
 '1100000',
 '11070',
 '12',
 '120',
 '1200000',
 '12161',
 '122',
 '126000',
 '129',
 '12959',
 '12yearold',
 '13',
 '131',
 '13172',
 '136',
 '137',
 '13878',
 '13975',
 '13gauge',
 '14',
 '14078',
 '14080',
 '143',
 '14383',
 '15',
 '150',
 '1500',
 '155',
 '158',
 '16',
 '16080',
 '168',
 '16870',
 '17',
 '17291',
 '18',
 '180110',
 '1937',
 '19373',
 '1cc',
 '1well',
 '20',
 '200',
 '2004',
 '2008',
 '2020',
 '2030',
 '204',
 '20meql',
 '21',
 '22',
 '223',
 '22591',
 '23',
 '24',
 '24hour',
 '25',
 '250',
 '25mg',
 '26',
 '269000',
 '27',
 '290',
 '297',
 '298',
 '2hypercholesterolemia',
 '30',
 '300',
 '313',
 '31493',
 '32',
 '32095',
 '32593',
 '32mm',
 '32yearold',
 '3431anesthesia',
 '35',
 '35000',
 '355c',
 '358fmsa',
 

## Test lime

In [11]:
import lime
import lime.lime_tabular
print("lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")

explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=vectorizer.get_feature_names_out(), class_names=lr.classes_)
# num features is the number of features to be shown
# top labels is the number of labels with the highest probability to be shown
exp = explainer.explain_instance(X_test[4], lr.predict_proba, num_features=5, top_labels=2)
exp.show_in_notebook(show_table=True, show_all=False)


lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


IndexError: tuple index out of range

## Test Shap 

In [12]:
import shap
from pydoc import classname
X_train = model_pipeline.named_steps['preprocessing'].fit_transform(X_train)
X_train = model_pipeline.named_steps['smote'].fit_transform(X_train, y_train)

AttributeError: 'SMOTE' object has no attribute 'fit_transform'

In [87]:
explainer = shap.KernelExplainer(model_pipeline.named_steps['classifier'].predict_proba, X_train)

Using 2380 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [75]:
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar", class_names= lr.classes_, feature_names=vectorizer.get_feature_names_out())

TypeError: 'NoneType' object is not subscriptable

In [60]:
import shap
from pydoc import classname
print("shap does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb\ndifference of X_train split before or after pipeline")
explainer = shap.Explainer(lr.predict, X_train)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, class_names= lr.classes_, feature_names=vectorizer.get_feature_names_out())

lime does not work here but works in notebook 02-hp-lrmodel-nlpinput.ipynb
difference of X_train split before or after pipeline


AttributeError: module 'warnings' has no attribute 'DeprecationWarning'