# Imports and load data

In [1]:
#imports
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer#, ColumnTransformer

from sklearn.compose import ColumnTransformer

[nltk_data] Downloading package stopwords to /Users/tara-
[nltk_data]     sophiatumbraegel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/tara-
[nltk_data]     sophiatumbraegel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/tara-
[nltk_data]     sophiatumbraegel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/tara-
[nltk_data]     sophiatumbraegel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# load data
df = pd.read_csv('../data/raw/mtsamples.csv')
df.transcription=df.transcription.astype(str)
#print(df.columns)
df.head(15)

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
5,5,Morbid obesity. Laparoscopic antecolic anteg...,Bariatrics,Laparoscopic Gastric Bypass,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","bariatrics, gastric bypass, eea anastomosis, r..."
6,6,"Liposuction of the supraumbilical abdomen, re...",Bariatrics,Liposuction,"PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","bariatrics, breast reconstruction, excess, lma..."
7,7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","cardiovascular / pulmonary, 2-d echocardiogram..."
8,8,Suction-assisted lipectomy - lipodystrophy of...,Bariatrics,Lipectomy - Abdomen/Thighs,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","bariatrics, lipodystrophy, abd pads, suction-a..."
9,9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","cardiovascular / pulmonary, ejection fraction,..."


# Clean and Preprocess the data

In [19]:

# remove rows with missing values
def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 50 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 50]
    print("Number of rows after removing medical specialties with less than 50 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer
stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription_c"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    data["transcription_c"] = data["transcription_c"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 


#apply on dataset
df_m = clean_df(df)
df_test = remove_punct_lower(df_m)
df_cleaned = lemmatize_words(df_test)


df_cleaned.head(2)


Number of rows after removing medical specialties with less than 50 samples: 3546


Unnamed: 0,transcription,medical_specialty,transcription_c
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary,"[2d, mmode, 1, left, atrial, enlargement, left..."
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary,"[1, left, ventricular, cavity, size, wall, thi..."


# NLP with Spacy


In [None]:
# install
#!python -m spacy download en_ner_bionlp13cg_md
# !pip3 install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

#!pip3 install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
#!pip3 install ../data/en_ner_bionlp13cg_md-0.5.1

In [3]:
df_small = df.head(100)
pd.set_option('max_colwidth', None)  # Remove any limitation on length 
                                     # of text displayed in a cell
#pd.set_option('max_rows', 300)  # Display up to 300 rows in a dataset

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with complaint of allergies.,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals: Weight was 130 pounds and blood pressure 124/78.,HEENT: Her throat was mildly erythematous without exudate. Nasal mucosa was erythematous and swollen. Only clear drainage was seen. TMs were clear.,Neck: Supple without adenopathy.,Lungs: Clear.,ASSESSMENT:, Allergic rhinitis.,PLAN:,1. She will try Zyrtec instead of Allegra again. Another option will be to use loratadine. She does not think she has prescription coverage so that might be cheaper.,2. Samples of Nasonex two sprays in each nostril given for three weeks. A prescription was written as well.","allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,"
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago. ,SOCIAL HISTORY:, He is currently single. He has about ten drinks a year. He had smoked significantly up until several months ago. He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes. Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:, He is allergic to Penicillin.,MISCELLANEOUS/EATING HISTORY:, He has been going to support groups for seven months with Lynn Holmberg in Greenwich and he is from Eastchester, New York and he feels that we are the appropriate program. He had a poor experience with the Greenwich program. Eating history, he is not an emotional eater. Does not like sweets. He likes big portions and carbohydrates. He likes chicken and not steak. He currently weighs 312 pounds. Ideal body weight would be 170 pounds. He is 142 pounds overweight. If ,he lost 60% of his excess body weight that would be 84 pounds and he should weigh about 228.,REVIEW OF SYSTEMS: ,Negative for head, neck, heart, lungs, GI, GU, orthopedic, and skin. Specifically denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, high cholesterol, pulmonary embolism, high blood pressure, CVA, venous insufficiency, thrombophlebitis, asthma, shortness of breath, COPD, emphysema, sleep apnea, diabetes, leg and foot swelling, osteoarthritis, rheumatoid arthritis, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, hemorrhoids, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer. Denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION:, He is alert and oriented x 3. Cranial nerves II-XII are intact. Afebrile. Vital Signs are stable.","bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost"


In [3]:
# NLP with Spacy
import spacy
import en_ner_bionlp13cg_md
nlp = en_ner_bionlp13cg_md.load()
def medical_entities( text):
    entities = []
    doc = nlp(text)
    for ent in doc.ents:
        entities.append(ent.text)
    return set(entities)#' ,'.join(entities)


df_small['transcription_c'] = [','.join(map(str, l)) for l in df_small['transcription_c']]
df_small['transcription_f'] = df_small['transcription_c'].apply(medical_entities)
df_small.head()

NameError: name 'df_small' is not defined

In [5]:
df = df.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 50 times
df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
df = df[['transcription', 'medical_specialty']]
df_small = df.head(100)
user_input = [' hello I am a patent, and I am sick, nasal issues, also headach and my mum and dad huhu hehe']

#create new df 
df_user = pd.DataFrame({'transcription':user_input})

Unnamed: 0,transcription,medical_specialty
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary


In [6]:
import spacy
import en_ner_bionlp13cg_md
# remove rows with missing values

def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 50 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
    print("Number of rows after removing medical specialties with less than 50 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    # remove punctuation and lowercase and lemmatizer
    stop = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    data["transcription"] = data["transcription"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 
def list_to_string_f(data):
    data["transcription"] = [','.join(map(str, l)) for l in data['transcription']]
    return data

    #str1 = " " 
    #return (str1.join(s))

def nlp_model(data):
    #data["transcription_text"] = [','.join(map(str, l)) for l in data['transcription']]

# NLP with Spacy

    def medical_entities( text):
        nlp = en_ner_bionlp13cg_md.load()
        entities = []
        doc = nlp(text)
        for ent in doc.ents:
            entities.append(ent.text)
        return set(entities)
    data['transcription_final'] = data['transcription'].apply(medical_entities)
    return data
    
    #return set(entities)#' ,'.join(entities)


#df_small['transcription_c'] = [','.join(map(str, l)) for l in df_small['transcription_c']]
#df_small['transcription_f'] = df_small['transcription_c'].apply(medical_entities)
#df_small.head()
df_rpl = remove_punct_lower(df_small)
lem_df = lemmatize_words(df_rpl)
ls = list_to_string_f(lem_df)
nl =nlp_model(ls)
nl.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["transcription"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["transcription"] = data["transcription"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,transcription,medical_specialty,transcription_final
3,"2d,mmode,1,left,atrial,enlargement,left,atrial...",Cardiovascular / Pulmonary,"{left, ventricular, pulmonary, valve, mitral}"


In [7]:
nl.head()

Unnamed: 0,transcription,medical_specialty,transcription_final
3,"2d,mmode,1,left,atrial,enlargement,left,atrial...",Cardiovascular / Pulmonary,"{left, ventricular, pulmonary, valve, mitral}"
4,"1,left,ventricular,cavity,size,wall,thickness,...",Cardiovascular / Pulmonary,"{ventricular, left, ventricle, lipomatous, val..."
7,"2d,echocardiogrammultiple,view,heart,great,ves...",Cardiovascular / Pulmonary,"{heart, venous, aorta, inflow, coronary, left,..."
9,"description1,normal,cardiac,chamber,size2,norm...",Cardiovascular / Pulmonary,"{left, ventricular, cardiac, valve, mitral}"
11,"2d,study1,mild,aortic,stenosis,widely,calcifie...",Cardiovascular / Pulmonary,"{heart, left, ventricular, ventricle, mitral}"


In [11]:
nl = nl.rename(columns={'transcription': 'original_transcription', 'transcription_final': 'transcription'})
nl.head()

Unnamed: 0,original_transcription,medical_specialty,transcription
3,"2d,mmode,1,left,atrial,enlargement,left,atrial,diameter,47,cm2,normal,size,right,left,ventricle3,normal,lv,systolic,function,left,ventricular,ejection,fraction,514,normal,lv,diastolic,function5,pericardial,effusion6,normal,morphology,aortic,valve,mitral,valve,tricuspid,valve,pulmonary,valve7,pa,systolic,pressure,36,mmhgdoppler,1,mild,mitral,tricuspid,regurgitation2,trace,aortic,pulmonary,regurgitation",Cardiovascular / Pulmonary,"{left, mitral, ventricular, pulmonary, valve}"
4,"1,left,ventricular,cavity,size,wall,thickness,appear,normal,wall,motion,left,ventricular,systolic,function,appears,hyperdynamic,estimated,ejection,fraction,70,75,nearcavity,obliteration,seen,also,appears,increased,left,ventricular,outflow,tract,gradient,mid,cavity,level,consistent,hyperdynamic,left,ventricular,systolic,function,abnormal,left,ventricular,relaxation,pattern,seen,well,elevated,left,atrial,pressure,seen,doppler,examination2,left,atrium,appears,mildly,dilated3,right,atrium,right,ventricle,appear,normal4,aortic,root,appears,normal5,aortic,valve,appears,calcified,mild,aortic,valve,stenosis,calculated,aortic,valve,area,13,cm,square,maximum,instantaneous,gradient,34,mean,gradient,19,mm6,mitral,annular,calcification,extending,leaflet,supportive,structure,thickening,mitral,valve,leaflet,mild,mitral,regurgitation7,tricuspid,valve,appears,normal,trace,tricuspid,regurgitation,moderate,pulmonary,artery,hypertension,estimated,pulmonary,artery,systolic,pressure,49,mmhg,estimated,right,atrial,pressure,10,mmhg8,pulmonary,valve,appears,normal,trace,pulmonary,insufficiency9,pericardial,effusion,intracardiac,mass,seen10,color,doppler,suggestive,patent,foramen,ovale,lipomatous,hypertrophy,interatrial,septum11,study,somewhat,technically,limited,hence,subtle,abnormality,could,missed,study",Cardiovascular / Pulmonary,"{leaflet, left, mitral, lipomatous, ventricular, pulmonary, interatrial, root, ventricle, atrium, valve, artery, wall}"
7,"2d,echocardiogrammultiple,view,heart,great,vessel,reveal,normal,intracardiac,great,vessel,relationship,cardiac,function,normal,significant,chamber,enlargement,hypertrophy,pericardial,effusion,vegetation,seen,doppler,interrogation,including,color,flow,imaging,reveals,systemic,venous,return,right,atrium,normal,tricuspid,inflow,pulmonary,outflow,normal,valve,pulmonary,venous,return,left,atrium,interatrial,septum,intact,mitral,inflow,ascending,aorta,flow,normal,aortic,valve,trileaflet,coronary,artery,appear,normal,origin,aortic,arch,leftsided,patent,normal,descending,aorta,pulsatility",Cardiovascular / Pulmonary,"{inflow, left, mitral, arch, pulmonary, interatrial, septum, venous, heart, vessel, atrium, valve, coronary, aorta, artery, cardiac}"
9,"description1,normal,cardiac,chamber,size2,normal,left,ventricular,size3,normal,lv,systolic,function,ejection,fraction,estimated,around,604,aortic,valve,seen,good,motion5,mitral,valve,seen,good,motion6,tricuspid,valve,seen,good,motion7,pericardial,effusion,intracardiac,massesdoppler1,trace,mitral,regurgitation2,trace,tricuspid,regurgitationimpression1,normal,lv,systolic,function2,ejection,fraction,estimated,around,60",Cardiovascular / Pulmonary,"{left, mitral, ventricular, valve, cardiac}"
11,"2d,study1,mild,aortic,stenosis,widely,calcified,minimally,restricted2,mild,left,ventricular,hypertrophy,normal,systolic,function3,moderate,biatrial,enlargement4,normal,right,ventricle5,normal,appearance,tricuspid,mitral,valves6,normal,left,ventricle,left,ventricular,systolic,functiondoppler1,1,2,aortic,regurgitation,easily,seen,aortic,stenosis2,mild,tricuspid,regurgitation,mild,increase,right,heart,pressure,3035,mmhg,maximumsummary1,normal,left,ventricle2,moderate,biatrial,enlargement3,mild,tricuspid,regurgitation,mild,increase,right,heart,pressure",Cardiovascular / Pulmonary,"{left, mitral, ventricular, heart, ventricle}"


In [12]:
nl.to_csv('../data/processed/mtsamples_nlp.csv', index = False)

# pipeline

In [45]:



#clean_data = FunctionTransformer(clean_df, validate=False)
remove_punctation = FunctionTransformer(remove_punct_lower, validate=False)
lemmatize_thewords = FunctionTransformer(lemmatize_words, validate=False)
liststring = FunctionTransformer(list_to_string_f, validate=False)
nlp_model_final_function = FunctionTransformer(nlp_model, validate=False)


nlp = en_ner_bionlp13cg_md.load()
pl = Pipeline(memory=None,
    steps=[
       # ('cleandata', clean_data),
        ('removepunct', remove_punctation),
        ('lemmatize', lemmatize_thewords )#,
       ('list_to_stringf', list_to_string_f),
       ('modelf', nlp_model_final_function)
    ], verbose=False)


x = pl.fit_transform(df_small)
#pd.set_option('max_colwidth', None)  # Remove any limitation on length 
                                     # of text displayed in a cell
#pd.set_option('max_rows', 300)  # Display up to 300 rows in a dataset
x.head()

['transcription', 'medicalspecialty']


TypeError: 'NoneType' object is not subscriptable

#NLP with YAKE
https://towardsdatascience.com/keyword-extraction-methods-the-overview-35557350f8bb
https://medium.com/@galeopsi/getting-started-nlp-9955b2cdba8c


In [5]:
y = pl.transform(df_small)
x

Number of rows after removing medical specialties with less than 50 samples: 142


# Yake


The duplication_threshold variable is limit the duplication of words in different keywords. You can set the deduplication_threshold value to 0.1 to avoid the repetition of words in keywords. If you set the deduplication_threshold value to 0.9, then repetition of words is allowed in keywords

In [10]:
import spacy
import en_ner_bionlp13cg_md
# remove rows with missing values
def clean_df(data):
    df = data.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 50 times
    df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 50]
    print("Number of rows after removing medical specialties with less than 50 samples:", len(df.index))
    # remove unnecessary columns, only keep transcriptions and medical_specialty columns
    return df[['transcription', 'medical_specialty']]


# remove punctuation and lowercase and lemmatizer

def remove_punct_lower(data):
    '''remove punctuation and lowercase'''
    data["transcription"] = data["transcription"].apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
    return data

def lemmatize_words(data):
    '''lemmatize words, remove stopwords'''
    # remove punctuation and lowercase and lemmatizer
    stop = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    data["transcription"] = data["transcription"].apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
    return data
 
def list_to_string_f(data):
    data["transcription"] = [' '.join(map(str, l)) for l in data['transcription']]
    return data

# NLP with Spacy

def medical_entities( text):
    nlp = en_ner_bionlp13cg_md.load()
    entities = []
    doc = nlp(text)
    for ent in doc.ents:
        entities.append(ent.text)
    return set(entities)

def nlp_model(data):
    data['transcription_final'] = data['transcription'].apply(medical_entities)
    return data
    
    #return set(entities)#' ,'.join(entities)

df_small = df.head(200)

clean_data = FunctionTransformer(clean_df, validate=False)
remove_punctation = FunctionTransformer(remove_punct_lower, validate=False)
lemmatize_thewords = FunctionTransformer(lemmatize_words, validate=False)
list_to_string_words = FunctionTransformer(list_to_string_f, validate=False)
nlp_model_final_function = FunctionTransformer(nlp_model, validate=False)


pl = Pipeline(memory=None,
    steps=[
        ('cleandata', clean_data),
        ('removepunct', remove_punctation),
        ('lemmatize', lemmatize_thewords ),
       ('list_to_stringf', list_to_string_words)#,
      #  ('modelf', nlp_model_final_function)
    ], verbose=False)


x = pl.fit_transform(df_small)
x.head()

Number of rows after removing medical specialties with less than 50 samples: 192


Unnamed: 0,transcription,medical_specialty
18,procedure elective male sterilization via bila...,Urology
20,indication prostate cancertechnique 35 hour fo...,Urology
22,description patient placed supine position pre...,Urology
23,preoperative diagnosis voluntary sterilitypost...,Urology
25,diagnosis desire vasectomyname operation vasec...,Urology


In [11]:
import yake
def yake_function(text):
    entities = []
    kw_extractor = yake.KeywordExtractor()
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    numOfKeywords = 20
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    for kw in keywords:
        entities.append(kw[0])
        #print(kw)    
    return set(entities)

def yake_model(data):
    data['transcription_final'] = data['transcription'].apply(yake_function)
    return data

#y = FunctionTransformer(yake_model, validate=False)
y = yake_model(x)
y.head()

Unnamed: 0,transcription,medical_specialty,transcription_final
18,procedure elective male sterilization via bila...,Urology,"{segment removed free, removed free end, sutur..."
20,indication prostate cancertechnique 35 hour fo...,Urology,"{normal limit kidney, indication prostate canc..."
22,description patient placed supine position pre...,Urology,"{anesthetized local anesthesia, chromic catgut..."
23,preoperative diagnosis voluntary sterilitypost...,Urology,"{fulgurated meticulous hemostasis, deferens sk..."
25,diagnosis desire vasectomyname operation vasec...,Urology,"{vasectomyanesthesia generalhistory patient, c..."


In [25]:
from rake_nltk import Rake
def rake_nltk_function(text):
    rake_nltk_var = Rake()
    entities = []
    rake_nltk_var.extract_keywords_from_text(text)
    keywords_scored = rake_nltk_var.get_word_degrees()
    #keyword_extracted = rake_nltk_var.get_ranked_phrases()
    for key in keywords_scored:
        entities.append(key)
    return set(entities)

def rake_model(data):
    data['transcription_final'] = data['transcription'].apply(rake_nltk_function)
    return data

t = rake_model(x)
pd.set_option('max_colwidth', None)  # Remove any limitation on length 
                                     # of text displayed in a cell
pd.set_option('max_rows', 300)  # Display up to 300 rows in a dataset
t.head()

OptionError: Pattern matched multiple keys

# pipeline with custom input

In [29]:
user_input = [' hello I am a patent, and I am sick, nasal issues, also headach and my mum and dad huhu hehe']

#create new df 
df_user = pd.DataFrame({'transcription':user_input})
df_user

Unnamed: 0,transcription
0,"hello I am a patent, and I am sick, nasal issues, also headach and my mum and dad huhu hehe"


In [21]:
df = df.dropna().drop_duplicates() 
    # drop rows where medical specialty appears less than 50 times
df = df[df.groupby("medical_specialty")["medical_specialty"].transform('size') > 100]
df_small = df.head(100)

def remove_punct_lower(series_input):
    '''remove punctuation and lowercase'''
    print(series_input.apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation))))
    return series_input.apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))


In [28]:
def remove_punct_lower(series_input):
    '''remove punctuation and lowercase'''
    print(list(map(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)),series_input )))
    #return series_input.apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))
inp = [' hello I am a patent, and I am sick, nasal issues, also headach and my mum and dad huhu hehe']
remove_punct_lower(inp)


#l = [int(x) for x in l]

#l = list(map(int,l))

[' hello i am a patent and i am sick nasal issues also headach and my mum and dad huhu hehe']


In [23]:
import spacy
import en_ner_bionlp13cg_md


# remove punctuation and lowercase and lemmatizer

def remove_punct_lower(series_input):
    '''remove punctuation and lowercase'''
    print(series_input.apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation))))
    return series_input.apply(lambda x: x.lower().translate(str.maketrans('','', string.punctuation)))

def lemmatize_words(series_input):
    '''lemmatize words, remove stopwords'''
    # remove punctuation and lowercase and lemmatizer
    stop = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    return series_input.apply(lambda x: [lemmatizer.lemmatize(x) for x in word_tokenize(x) if x not in (stop)])
 
def list_to_string_f(series_input):
    return [' '.join(map(str, l)) for l in series_input]


def medical_entities( series_input):
    nlp = en_ner_bionlp13cg_md.load()
    entities = []
    doc = nlp(series_input)
    for ent in doc.ents:
        entities.append(ent.series_input)
    return set(entities)

def nlp_model(data):
    data['transcription_final'] = data['transcription'].apply(medical_entities)
    return data
    
    #return set(entities)#' ,'.join(entities)

#df_small = df.head(200)

#clean_data = FunctionTransformer(clean_df, validate=False)
remove_punctation = ColumnTransformer(remove_punct_lower)
lemmatize_thewords = ColumnTransformer(lemmatize_words)#, validate=False)
list_to_string_words = ColumnTransformer(list_to_string_f)#, validate=False)
nlp_model_final_function = ColumnTransformer(medical_entities)#, validate=False)


pl = Pipeline(memory=None,
    steps=[
        #(
            #'cleandata', clean_data),
        ('removepunct', remove_punctation),
        ('lemmatize', lemmatize_thewords ),
       ('list_to_stringf', list_to_string_words),
       ('modelf', nlp_model_final_function)
    ], verbose=False)

inp = [[' hello I am a patent, and I am sick, nasal issues, also headach and my mum and dad huhu hehe']]
#inp = inp.reshape(-1, 1)
x = pl.fit_transform(df.transcription)
x

TypeError: zip() argument after * must be an iterable, not function