**Natural Language Entity Extraction from Medical Reports**

In [1]:
!pip install rake-nltk



In [4]:
from rake_nltk import Rake
import numpy as np
import pandas as pd

In [9]:
# Load the patient notes into a DataFrame.
# NOTE(review): the absolute /content/ path looks like a Colab upload location — adjust when running elsewhere.
notes = pd.read_csv("/content/patient_notes.csv")

In [10]:
# Preview the first rows to see which columns were loaded.
notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0.0,"17-year-old male, has come to the student heal..."
1,1,0.0,17 yo male with recurrent palpitations for the...
2,2,0.0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0.0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0.0,17yo male with no pmh here for evaluation of p...


In [11]:
# Keep only the free-text note column. Rebinding the result (instead of inplace=True)
# together with errors="ignore" lets this cell be re-run after the columns are gone.
notes = notes.drop(columns=["pn_num", "case_num"], errors="ignore")

In [12]:
# Inspect the raw note text column.
notes["pn_history"]

0       17-year-old male, has come to the student heal...
1       17 yo male with recurrent palpitations for the...
2       Dillon Cleveland is a 17 y.o. male patient wit...
3       a 17 yo m c/o palpitation started 3 mos ago; \...
4       17yo male with no pmh here for evaluation of p...
                              ...                        
7550    Mr. Hamilton is  a 35 yr old presenting w/ con...
7551    35yo M presents with complaints of stomach pro...
7552    Mr Hamilton 35yo M \r\npresenting with 2mo Hx ...
7553    HPI: 35y m C/O sudden knawing type , 5/10 seve...
7554                                                  NaN
Name: pn_history, Length: 7555, dtype: object

In [13]:
def extract_keywords(pn_history, min_score=5):
    """Return the RAKE key phrases of one patient note that score above a threshold.

    Args:
        pn_history: Text of a single patient note.
        min_score: A phrase is kept only if its RAKE score is strictly greater
            than this value.

    Returns:
        List of phrase strings, in the order RAKE ranked them.
    """
    # These punctuation tokens are passed to Rake to be ignored when building phrases.
    r = Rake(punctuations = [')','(',',',':','),',').','.'])
    r.extract_keywords_from_text(pn_history)
    phrase_df = pd.DataFrame(r.get_ranked_phrases_with_scores(), columns = ['score','phrase'])
    # .loc returns a new frame, so the result must be assigned back for the
    # score filter to take effect.
    phrase_df = phrase_df.loc[phrase_df.score > min_score]
    return phrase_df['phrase'].tolist()

In [14]:
import nltk
# Fetch the NLTK stop-word corpus.
# NOTE(review): presumably the source of Rake's default stop words — confirm against the rake-nltk docs.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
import nltk
# Fetch the Punkt tokenizer models.
# NOTE(review): presumably required by Rake's sentence splitting — confirm against the rake-nltk docs.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
def _note_keywords(note):
    """Extract keywords from one note; non-string (missing) notes yield an empty list."""
    if not isinstance(note, str):
        return []
    return extract_keywords(str(note))


notes["keywords"] = notes["pn_history"].apply(_note_keywords)


In [18]:
# Spot-check ten random rows; no random_state is passed, so the selection differs between runs.
notes.sample(10)

Unnamed: 0,pn_history,keywords
6075,Chad Hamilton is a 35 year old man who uses mo...,"[intermittent 5 / 10 non radiating burning, pr..."
459,HPI: 17yo M p/w 'heart pounding'. 2-3 months ...,[associated dizziness / tremors / headache / n...
3121,Mrs. Montgomery is a 44 yo F with a 3 year hx ...,[implanted 10 years ago - pmh - htn - psh - no...
1879,"CC: ""heart pounding""\r\nHPI: 17 y/o M presents...","[1 - 2 times per month, past 3 - 4 months, "" h..."
4496,Pt is a 44yo female with history of hypertensi...,"[2 vaginal deliveries > 10years ago w /, iud p..."
37,17 year old M presents with CC of heart palpat...,"[patient experenced chest tightness asociated,..."
2643,20 y/o female c/o abdominal pain. Onset was 8-...,"[non - mucoid diarrhea every day, occasionally..."
3635,44 yo F comes to clinic complaining of irregul...,[also experienced night sweats beginning last ...
2916,CC:20Yo F co abdominal pain\r\nHPI:Abd painx10...,"[20yo f co abdominal pain hpi, 3 - 4 similar e..."
7498,Mr Hamilton is a 35 yo M presenting to the cli...,"[400 mg ibuprofen 1 - 2 times / wk, 5 - 1 ppd ..."


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configured to drop scikit-learn's built-in English stop-word list.
bow = CountVectorizer(stop_words='english')

In [20]:
# Display the frame, now including the keywords column.
notes

Unnamed: 0,pn_history,keywords
0,"17-year-old male, has come to the student heal...","[treatment - began 2 - 3 months ago, non - all..."
1,17 yo male with recurrent palpitations for the...,"[baskeball game two days ago light headedness,..."
2,Dillon Cleveland is a 17 y.o. male patient wit...,"[smoking ; 3 - 4 drinks, weekend per sitting ;..."
3,a 17 yo m c/o palpitation started 3 mos ago; \...,[nausea vomiting ; headache ; abdominal pain ;...
4,17yo male with no pmh here for evaluation of p...,"[endorse theses attacks occuring 1 - 2 times, ..."
...,...,...
7550,Mr. Hamilton is a 35 yr old presenting w/ con...,"[35 yr old presenting w / concerns, 15 pack ye..."
7551,35yo M presents with complaints of stomach pro...,"[1 / 2 - 1 packs per day, notes worsening burn..."
7552,Mr Hamilton 35yo M \r\npresenting with 2mo Hx ...,[construction worker smokes 1 / 2 - 1ppd since...
7553,"HPI: 35y m C/O sudden knawing type , 5/10 seve...","[dark stool since last 2 wks without, hasnt wo..."


In [21]:
# Remove the RAKE keywords column before the bag-of-words step. Rebinding the result
# (instead of inplace=True) with errors="ignore" keeps this cell safe to re-run.
notes = notes.drop(columns=["keywords"], errors="ignore")

In [22]:
# Display the frame to confirm the keywords column is gone.
notes

Unnamed: 0,pn_history
0,"17-year-old male, has come to the student heal..."
1,17 yo male with recurrent palpitations for the...
2,Dillon Cleveland is a 17 y.o. male patient wit...
3,a 17 yo m c/o palpitation started 3 mos ago; \...
4,17yo male with no pmh here for evaluation of p...
...,...
7550,Mr. Hamilton is a 35 yr old presenting w/ con...
7551,35yo M presents with complaints of stomach pro...
7552,Mr Hamilton 35yo M \r\npresenting with 2mo Hx ...
7553,"HPI: 35y m C/O sudden knawing type , 5/10 seve..."


In [23]:
# Plain Python list holding every entry of the note column.
# NOTE(review): the column preview earlier shows the last row (7554) as NaN, so this list
# contains a non-string entry that would need handling before vectorizing all of it.
text_data = notes["pn_history"].tolist()
print(len(text_data))

7555


In [24]:
# Learn the vocabulary and vectorize the first 50 notes in one pass; fit_transform
# avoids tokenizing the same documents twice and keeps the slice in a single place.
bow_features = bow.fit_transform(text_data[:50])
# Dense count matrix with one row per note and one column per vocabulary term.
bow_feature_array = bow_features.toarray()

In [26]:
# Look the vocabulary up once: it does not change inside the loop, and
# get_feature_names_out() was previously being called for every matching term.
feature_names = bow.get_feature_names_out()

# zip() stops at the shorter input, so only the notes that were vectorized are printed.
for sentence, feature in zip(text_data, bow_feature_array):
    print("Sentence: ")
    print(sentence)
    # Keep the vocabulary terms that occur at least once in this note
    lst = [
        feature_names[index]
        for index, element in enumerate(feature.tolist())
        if element > 0
    ]
    print("Keywords: ")
    print(lst)
    print("____________________________________________")


Sentence: 
17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment
-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav
-associated with dispnea on exersion and rest,stressed out about school
-reports fe feels like his heart is jumping out of his chest
-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam
-pmh:non,meds :aderol (from a friend),nkda
-fh:father had MI recently,mother has thyroid dz
-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school
-sh:no std
Keywords: 
['17', 'aderol', 'aggrav', 'ago', 'allev', 'associated', 'basketball', 'beers', 'began', 'chest', 'chills', 'cleveland', 'clinic', 'come', 'complaining', 'consent', 'days', 'denies', 'dispnea', 'dyaphoresis', 'dz', 'edeam', 'examination', 'exersion', 'father', 'fe', 'feels', 'fever