In [1]:
# List the working directory to check that the NLP_Masterthesis folder is present.
# NOTE: `!cd` runs in a throwaway subshell and would not change the kernel's
# working directory; `%cd` is the IPython magic that does.
# !cd NLP_Masterthesis
!ls

NLP_Masterthesis  quick_start_advanced.ipynb  quick_start_images
README.md	  quick_start_beginner.ipynb


In [None]:
# !pip install scikit-learn
!pip install imblearn

!pip install nltk
# !pip install spacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz

In [38]:
## Import Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.manifold import TSNE    

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# load data
df = pd.read_csv('./NLP_Masterthesis/data/raw/mtsamples.csv')
# Cast transcriptions to str but keep missing values missing: a blanket
# astype(str) turns NaN into the literal text "nan", which then survives the
# dropna() in clean_df and is treated as if it were a real transcription.
df['transcription'] = df['transcription'].where(
    df['transcription'].isna(), df['transcription'].astype(str)
)

In [6]:
# remove rows with missing values
def clean_df(data):
    """Drop incomplete/duplicate rows and rare specialties; keep the model columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``transcription`` and ``medical_specialty`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the ``transcription`` and
        ``medical_specialty`` columns. Returning a copy (not a slice) lets
        callers add columns without a SettingWithCopyWarning and without any
        ambiguity about whether an intermediate frame is being modified.

    Side effects
    ------------
    Prints the number of rows that remain after filtering.
    """
    df = data.dropna().drop_duplicates()
    # keep only specialties that occur more than 100 times in the cleaned rows
    # NOTE(review): a specialty with exactly 100 rows is dropped as well, although
    # the printed message says "less than 100" - confirm which one is intended.
    specialty_size = df.groupby("medical_specialty")["medical_specialty"].transform('size')
    df = df[specialty_size > 100]
    print("Number of rows after removing medical specialties with less than 100 samples:", len(df.index))
    # only the text and the label are needed downstream
    return df[['transcription', 'medical_specialty']].copy()


# English stop-word list and WordNet lemmatizer, created once at module level;
# both are used by lemmatize_words below
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punct_lower(data):
    '''Lower-case the transcriptions and strip ASCII punctuation.

    Reads ``data["transcription"]`` and returns a new DataFrame that
    additionally carries the cleaned text in a ``transcription_c`` column.
    The input frame is left unmodified, so re-running the calling cell
    always starts from the same state.
    '''
    # build the deletion table once instead of once per row
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = data["transcription"].str.lower().str.translate(strip_punct)
    return data.assign(transcription_c=cleaned)

def lemmatize_words(data):
    '''Tokenize, drop stop words and lemmatize each cleaned transcription.

    Reads ``data["transcription_c"]`` (one string per row) and returns a new
    DataFrame in which that column holds the list of lemmatized tokens.
    The input frame is left unmodified.
    '''
    def _lemmatize(text):
        # separate names for the row text and its tokens keep the filter readable
        return [lemmatizer.lemmatize(token) for token in word_tokenize(text) if token not in stop]

    return data.assign(transcription_c=data["transcription_c"].apply(_lemmatize))
 


# run the cleaning steps in sequence on the raw frame
df_m = clean_df(df)
df_test = df_m.pipe(remove_punct_lower).pipe(lemmatize_words)

# preview the first two processed rows
df_test.head(2)

Number of rows after removing medical specialties with less than 100 samples: 2976


Unnamed: 0,transcription,medical_specialty,transcription_c
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."


In [7]:
# NLP with Spacy
# Load the en_ner_bionlp13cg_md NER model (the package installed from the
# scispacy release tarball in the install cell) once; medical_entities uses it.
import spacy
import en_ner_bionlp13cg_md
nlp = en_ner_bionlp13cg_md.load()

In [8]:
def medical_entities(text):
    """Return the set of distinct entity strings the loaded NER model finds in `text`."""
    return {entity.text for entity in nlp(text).ents}


# Join each token list into one comma-separated string for the NER model.
# The isinstance guard keeps this cell idempotent: without it, re-running the
# cell on the already-joined strings would put a comma between every character.
df_test['transcription_c'] = [
    ','.join(map(str, tokens)) if isinstance(tokens, (list, tuple)) else tokens
    for tokens in df_test['transcription_c']
]
# one set of recognised entities per transcription
df_test['transcription_f'] = df_test['transcription_c'].apply(medical_entities)
df_test.head()

Unnamed: 0,transcription,medical_specialty,transcription_c,transcription_f
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"2d,mmode,1,left,atrial,enlargement,left,atrial...","{ventricular, mitral, pulmonary, left, valve}"
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"1,left,ventricular,cavity,size,wall,thickness,...","{ventricle, artery, interatrial, leaflet, root..."
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary,"2d,echocardiogrammultiple,view,heart,great,ves...","{cardiac, vessel, artery, heart, interatrial, ..."
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary,"description1,normal,cardiac,chamber,size2,norm...","{cardiac, ventricular, mitral, left, valve}"
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary,"2d,study1,mild,aortic,stenosis,widely,calcifie...","{ventricle, heart, ventricular, mitral, left}"


In [9]:
# retrieve labels as function
def get_labels(data):
    """Return the ``medical_specialty`` column of `data` as a plain Python list."""
    specialties = data['medical_specialty']
    return specialties.to_list()

# model input: each row's entity set rendered as a string
df_test_X = df_test['transcription_f'].astype(str)

# target: the medical specialty of each row
df_test_label = get_labels(df_test)

In [10]:
# split data into train and test set as function
def split_data(X, y, test_size=0.2, random_state=42, stratify=None):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X, y : array-like
        Features and labels of equal length.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; the default matches the previously
        hard-coded 80/20 split.
    random_state : int or None, default 42
        Seed for the shuffle, so the split is reproducible.
    stratify : array-like or None, default None
        Pass the labels here to preserve class proportions in both partitions.
        None (the default) reproduces the previous unstratified split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify
    )
    return X_train, X_test, y_train, y_test

# partition the entity strings and their labels into train and test sets
X_train, X_test, y_train, y_test = split_data(df_test_X, df_test_label)

In [52]:
# Bag-of-words vectorisation of the entity strings, used here to inspect the
# shape of the resulting feature matrix.
# scikit-learn's Pipeline is imported in this notebook as `skPipeline`; the
# bare name `Pipeline` is never imported and raises NameError on a fresh kernel.
preprocessing_pipeline = skPipeline([
    ('vectorize', CountVectorizer())
])

X_train_prepared = preprocessing_pipeline.fit_transform(X_train)

X_train_prepared

<2380x2574 sparse matrix of type '<class 'numpy.int64'>'
	with 40649 stored elements in Compressed Sparse Row format>

In [67]:
# Full model: bag-of-words counts -> SMOTE oversampling -> multinomial
# logistic regression. The imblearn pipeline is used because SMOTE is a
# resampling step, which a plain scikit-learn Pipeline does not accept.
# (The variable name is kept as-is because the fit cell refers to it.)
model_pipelilne = imbPipeline([
    ('preprocessing', CountVectorizer()),
    ('smote', SMOTE(random_state=42)),
    # max_iter is raised above the default of 100, at which the solver hit its
    # iteration limit before converging on this data; increase further if the
    # convergence warning still appears
    ('classifier', LogisticRegression(random_state=42, multi_class='multinomial', max_iter=1000)),
])

In [68]:
# textclassifier = imbPipeline([
#   ('vect', CountVectorizer()),
# #    ('tfidf', TfidfTransformer()),
#    ('smote', SMOTE(random_state=12)),
#    ('pca', PCA(n_components=0.95))
# #    ('mnb', MultinomialNB(alpha =0.1))
# ])

# textclassifier.fit(X_train, y_train)

In [70]:
# Hyper-parameter grid for the (currently disabled) grid search at the end of
# this cell. It is split per solver so that every combination is one that
# LogisticRegression accepts:
#   * lbfgs only supports the l2 penalty (or no penalty),
#   * saga supports l1, l2 and elasticnet (elasticnet also needs l1_ratio),
#   * liblinear is left out because it cannot fit a multinomial model.
# The unpenalised option is omitted because its spelling differs between
# scikit-learn versions ('none' vs None); C=1000 is a close approximation.
C_VALUES = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = [
    {
        'classifier__solver': ['lbfgs'],
        'classifier__penalty': ['l2'],
        'classifier__C': C_VALUES,
    },
    {
        'classifier__solver': ['saga'],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': C_VALUES,
    },
    {
        'classifier__solver': ['saga'],
        'classifier__penalty': ['elasticnet'],
        'classifier__l1_ratio': [0.5],
        'classifier__C': C_VALUES,
    },
]

# fit the pipeline with its default hyper-parameters
model_pipelilne.fit(X_train, y_train)

# To run the grid search instead (slow), uncomment:
# search = GridSearchCV(model_pipelilne, param_grid, cv=5)
# search.fit(X_train, y_train)
# search.best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
