In [1]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from imblearn.over_sampling import SMOTE


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# load data
df = pd.read_csv('../data/raw/mtsamples.csv')
df.transcription=df.transcription.astype(str)

In [3]:
# remove rows with missing values
def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 100 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
    print("Number of rows after removing medical specialties with less than 50 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription_c"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    data["transcription_c"] = data["transcription_c"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 


#apply on dataset
df_m = clean_df(df)
df_test = remove_punct_lower(df_m)
df_test = lemmatize_words(df_test)


df_test.head(2)

Number of rows after removing medical specialties with less than 50 samples: 2976


Unnamed: 0,transcription,medical_specialty,transcription_c
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."


In [1]:
!pip3 install ../data/en_ner_bionlp13cg_md-0.5.1

Processing /Users/hannahpetry/Desktop/Work Project/NLP_Masterthesis/data/en_ner_bionlp13cg_md-0.5.1
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: en-ner-bionlp13cg-md
  Building wheel for en-ner-bionlp13cg-md (setup.py) ... [?25ldone
[?25h  Created wheel for en-ner-bionlp13cg-md: filename=en_ner_bionlp13cg_md-0.5.1-py3-none-any.whl size=120241147 sha256=a129e3af6632c973998a4e4c1ed6847eefa021aa97916119d25140e0c1a8d344
  Stored in directory: /Users/hannahpetry/Library/Caches/pip/wheels/ba/dc/7b/d8d45322e45cc36e3226aef67695c2eaa65d737e0def574ef2
Successfully built en-ner-bionlp13cg-md
Installing collected packages: en-ner-bionlp13cg-md
Successfully installed en-ner-bionlp13cg-md-0.5.1


In [4]:
# NLP with Spacy
import spacy
import en_ner_bionlp13cg_md
nlp = en_ner_bionlp13cg_md.load()
def medical_entities( text):
    entities = []
    doc = nlp(text)
    for ent in doc.ents:
        entities.append(ent.text)
    return set(entities)#' ,'.join(entities)


df_test['transcription_c'] = [','.join(map(str, l)) for l in df_test['transcription_c']]
df_test['transcription_f'] = df_test['transcription_c'].apply(medical_entities)
df_test.head()

Unnamed: 0,transcription,medical_specialty,transcription_c,transcription_f
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"2d,mmode,1,left,atrial,enlargement,left,atrial...","{ventricular, pulmonary, mitral, left, valve}"
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"1,left,ventricular,cavity,size,wall,thickness,...","{ventricular, lipomatous, wall, valve, mitral,..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"2d,echocardiogrammultiple,view,heart,great,ves...","{septum, cardiac, vessel, venous, valve, pulmo..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"description1,normal,cardiac,chamber,size2,norm...","{ventricular, cardiac, mitral, left, valve}"
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"2d,study1,mild,aortic,stenosis,widely,calcifie...","{ventricular, mitral, ventricle, heart, left}"


In [6]:
df_test.transcription_f = df_test.transcription_f.astype(str)
df_test.transcription_f

3       {'ventricular', 'pulmonary', 'mitral', 'left',...
4       {'ventricular', 'lipomatous', 'wall', 'valve',...
7       {'septum', 'cardiac', 'vessel', 'venous', 'val...
9       {'ventricular', 'cardiac', 'mitral', 'left', '...
11      {'ventricular', 'mitral', 'ventricle', 'heart'...
                              ...                        
4967    {'vessel', 'rca', 'pci', 'coronary', 'renal', ...
4968    {'intravenous', 'cerebrovascular', 'patient', ...
4971    {'men', 'teeth', 'coronary', 'thyroid', 'heart...
4972    {'patient', 'wall', 'heart', 'myocardial', 'ad...
4975    {'cardiac', 'patient', 'pci', 'lipid', 'lesion...
Name: transcription_f, Length: 2976, dtype: object

In [7]:
# create sparse matrix from transcription_f
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_test.transcription_f)
X.shape

(2976, 2702)

In [10]:
labels = df_test['medical_specialty'].tolist()
# balance out dataset with SMOTE, creates synthetic samples of the minority classes
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, labels)
X_res, y_res

(<11253x2702 sparse matrix of type '<class 'numpy.int64'>'
 	with 205251 stored elements in Compressed Sparse Row format>,
 [' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Urology',
  ' General Medicine',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology'

In [11]:
# Create X and y and apply PCA to reduce dimensionality of features
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_res.toarray())



In [12]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y_res, test_size=0.2, random_state=42)

In [13]:
X_test[0]

array([-3.55159663e-01, -1.87993093e-01, -2.50896645e-01, -3.43875311e-01,
       -2.29288159e-01, -2.47844659e-02, -9.00129918e-02,  4.90100301e-02,
       -7.96205884e-02, -6.50179952e-02, -3.45370418e-03,  4.90637408e-02,
       -4.05698914e-03,  4.13094473e-03,  1.91466169e-02, -2.91000186e-03,
        2.84509865e-02,  1.02016116e-02,  2.63273191e-02, -3.01821604e-02,
        4.51546950e-02, -3.75697547e-02,  2.69456217e-02,  1.44059780e-02,
       -1.21374859e-02,  1.46489914e-02,  1.42242811e-02, -1.04913812e-02,
        2.33216441e-02, -2.85275087e-02,  2.68553559e-02,  4.75455998e-02,
        1.24693920e-02, -3.25340437e-03,  1.17410516e-02,  1.53446618e-02,
        9.20824613e-03, -6.13779638e-03, -2.51127416e-02,  2.05516387e-02,
       -1.99440655e-02,  1.02549169e-02,  1.93995539e-02,  6.14656955e-03,
       -1.47321466e-02, -1.01622155e-02, -1.63419176e-02, -1.55221060e-02,
       -1.56414544e-02, -5.32948616e-03, -1.00864723e-02, -1.12789195e-02,
       -2.87630015e-03,  

In [24]:
# build model with finetuned hyperparameters
clf = LogisticRegression(random_state=42, penalty= 'l1', solver= 'saga', multi_class='multinomial', C=1)
lr = Pipeline(steps=[('classifier', clf)]).fit(X_train, y_train)



In [25]:
category_list = df_test.medical_specialty.unique()
# predict and evaluate
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred, target_names=category_list))

                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.71      0.75      0.73       208
                       Urology       0.71      0.55      0.62       218
              General Medicine       0.75      0.82      0.79       191
                       Surgery       0.77      0.83      0.80       213
 SOAP / Chart / Progress Notes       0.72      0.81      0.77       193
                     Radiology       0.76      0.89      0.82       202
                    Orthopedic       0.66      0.72      0.69       202
       Obstetrics / Gynecology       0.54      0.50      0.52       191
                     Neurology       0.68      0.82      0.74       225
              Gastroenterology       0.53      0.23      0.32       203
    Consult - History and Phy.       0.87      0.92      0.89       205

                      accuracy                           0.71      2251
                     macro avg       0.70      0.71      0.70 

In [26]:
# ROC curve
# Generate class membership probabilities
y_preb_probs = lr.predict_proba(X_test)
score = roc_auc_score(y_test, y_preb_probs, average="weighted", multi_class="ovr")
print('Average AUROC score of', round(score,4))

Average AUROC score of 0.9517


In [27]:
# version of accuracy that also integrates measurements of chance and class imbalance
# Generally, a score above 0.8 is considered excellent
cohen_kappa_score(y_test, y_pred)

0.6847131496502603

In [36]:
# Predict probabilities for a sample from test set
prob_array = lr.predict_proba(X_test)[0,:]
prob_df = pd.DataFrame(prob_array, index=category_list, columns=['Probability']).sort_values(by='Probability', ascending=False)
prob_df


Unnamed: 0,Probability
Radiology,0.684487
Gastroenterology,0.107089
Obstetrics / Gynecology,0.093361
Consult - History and Phy.,0.037043
General Medicine,0.020825
Urology,0.01847
Neurology,0.014036
Cardiovascular / Pulmonary,0.012928
Orthopedic,0.005452
SOAP / Chart / Progress Notes,0.003385


In [40]:
# need to know which sample was predicted
# get index of sample from test set
index = 0
# get sample from test set
sample = X_test[index]
# get predicted category
category = y_pred[index]
# get probability of predicted category
prob = prob_df.loc[category].values[0]
# get actual category
actual = y_test[index]

print("Predicted category:", category)
print("Probability of predicted category:", prob)
print("Actual category:", actual)


Predicted category:  Obstetrics / Gynecology
Probability of predicted category: 0.09336081527509868
Actual category:  Obstetrics / Gynecology


In [41]:
y_pred

array([' Obstetrics / Gynecology', ' General Medicine', ' Radiology', ...,
       ' SOAP / Chart / Progress Notes', ' SOAP / Chart / Progress Notes',
       ' Consult - History and Phy.'], dtype='<U30')