In [5]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/hannahpetry/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# load data
df = pd.read_csv('../data/raw/mtsamples.csv')
df.transcription=df.transcription.astype(str)

In [7]:
# remove rows with missing values
def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 100 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
    print("Number of rows after removing medical specialties with less than 100 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription_c"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    data["transcription_c"] = data["transcription_c"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 


#apply on dataset
df_m = clean_df(df)
df_test = remove_punct_lower(df_m)
df_test = lemmatize_words(df_test)


df_test.head(2)

Number of rows after removing medical specialties with less than 100 samples: 2976


Unnamed: 0,transcription,medical_specialty,transcription_c
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."


In [8]:
# NLP with Spacy
import spacy
import en_ner_bionlp13cg_md
nlp = en_ner_bionlp13cg_md.load()
def medical_entities( text):
    entities = []
    doc = nlp(text)
    for ent in doc.ents:
        entities.append(ent.text)
    return set(entities)#' ,'.join(entities)


df_test['transcription_c'] = [','.join(map(str, l)) for l in df_test['transcription_c']]
df_test['transcription_f'] = df_test['transcription_c'].apply(medical_entities)
df_test.head()

Unnamed: 0,transcription,medical_specialty,transcription_c,transcription_f
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"2d,mmode,1,left,atrial,enlargement,left,atrial...","{valve, ventricular, mitral, left, pulmonary}"
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"1,left,ventricular,cavity,size,wall,thickness,...","{wall, ventricle, valve, leaflet, artery, root..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"2d,echocardiogrammultiple,view,heart,great,ves...","{arch, venous, valve, septum, artery, vessel, ..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"description1,normal,cardiac,chamber,size2,norm...","{valve, cardiac, ventricular, mitral, left}"
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"2d,study1,mild,aortic,stenosis,widely,calcifie...","{ventricle, heart, ventricular, mitral, left}"


In [12]:
# retrieve labels as function
def get_labels(data):
    return data['medical_specialty'].tolist()

df_test_label = get_labels(df_test)

df_test_X = df_test['transcription_f'].astype(str)

In [11]:
# split data into train and test set as function
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)

In [71]:
# pca for dimensionality reduction
def pca(data):
    pca = PCA(n_components=0.95)
    X_reduced = pca.fit_transform(data.toarray())
    return X_reduced

In [78]:
model_pipeline = imbPipeline([
        ('preprocessing',CountVectorizer()),
        ('svd', TruncatedSVD(n_components=100)),
        #('pca', FunctionTransformer(pca)),
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(random_state=42, multi_class='multinomial')), # remainder="passthrough"
])

In [79]:
lr = model_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [80]:
y_pred = lr.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=category_list))


                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.43      0.52      0.47        52
                       Urology       0.21      0.17      0.19        40
              General Medicine       0.41      0.45      0.43        42
                       Surgery       0.09      0.12      0.11        25
 SOAP / Chart / Progress Notes       0.20      0.35      0.25        23
                     Radiology       0.31      0.57      0.40        30
                    Orthopedic       0.38      0.53      0.44        57
       Obstetrics / Gynecology       0.13      0.16      0.14        51
                     Neurology       0.21      0.26      0.23        35
              Gastroenterology       0.59      0.29      0.39       212
    Consult - History and Phy.       0.35      0.48      0.41        29

                      accuracy                           0.34       596
                     macro avg       0.30      0.35      0.31 

In [81]:
param_grid = [
    { 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
      'classifier': [LogisticRegression(multi_class='multinomial', random_state=42)],
      'classifier__solver': ['saga', 'lbfgs', 'liblinear'],
      'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet'],
    }
]

search = GridSearchCV(model_pipeline, param_grid, cv=5)
search.fit(X_train, y_train)
search.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'classifier': LogisticRegression(C=0.01, multi_class='multinomial', random_state=42),
 'classifier__C': 0.01,
 'classifier__penalty': 'l2',
 'classifier__solver': 'lbfgs'}

In [82]:
# take best model from grid search and perform evaluation
# grid search as function and return best model
def grid_search(X_train, y_train, model_pipeline, param_grid):
    search = GridSearchCV(model_pipeline, param_grid, cv=5)
    search.fit(X_train, y_train)
    return search.best_estimator_
    
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

category_list = df_test.medical_specialty.unique()
# predict and evaluate
print(classification_report(y_test, y_pred, target_names=category_list))


                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.47      0.63      0.54        52
                       Urology       0.22      0.15      0.18        40
              General Medicine       0.44      0.57      0.50        42
                       Surgery       0.19      0.28      0.23        25
 SOAP / Chart / Progress Notes       0.36      0.57      0.44        23
                     Radiology       0.37      0.60      0.46        30
                    Orthopedic       0.40      0.65      0.49        57
       Obstetrics / Gynecology       0.22      0.22      0.22        51
                     Neurology       0.19      0.26      0.22        35
              Gastroenterology       0.66      0.28      0.39       212
    Consult - History and Phy.       0.42      0.66      0.51        29

                      accuracy                           0.40       596
                     macro avg       0.36      0.44      0.38 

In [8]:
# vectorize transcription to receive sparse matrix
def vectorize(data):
    vectorizer = CountVectorizer().fit_transform(data)
    return vectorizer
df_test_X_vectorized = vectorize(df_test_X)
df_test_X_vectorized

<2976x2702 sparse matrix of type '<class 'numpy.int64'>'
	with 50796 stored elements in Compressed Sparse Row format>

In [9]:
# balance out dataset with SMOTE, creates synthetic samples of the minority classes as function
def smote(X, y):
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    return X_res, y_res


X_res, y_res = smote(df_test_X_vectorized, df_test_label)
X_res, y_res

(<11253x2702 sparse matrix of type '<class 'numpy.int64'>'
 	with 205251 stored elements in Compressed Sparse Row format>,
 [' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Cardiovascular / Pulmonary',
  ' Urology',
  ' General Medicine',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology',
  ' Urology'

In [10]:
# pca for dimensionality reduction
def pca(data):
    pca = PCA(n_components=0.95)
    X_reduced = pca.fit_transform(data.toarray())
    return X_reduced

df_test_X_pca = pca(df_test_X_vectorized)

In [27]:
def build_preprocessing_pipeline(data):
    return Pipeline([
        ('vectorize', FunctionTransformer(vectorize, validate=False))
    ])

X_train_prepared = build_preprocessing_pipeline(X_train)

def build_preprocessing_pipeline_2():
    return Pipeline([
        ('vectorize', CountVectorizer())
    ])
X_train_prepared_2 = build_preprocessing_pipeline_2().fit_transform(X_train)
X_train_prepared_2

<2380x2574 sparse matrix of type '<class 'numpy.int64'>'
	with 40649 stored elements in Compressed Sparse Row format>