In [8]:
# Lookup table used to expand frequently used English contractions.
# Keys are the contracted forms, values are the expanded replacements.
# The first-person forms appear in both "I..." and "i..." spellings.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='frequency'):
    """Vectorize documents into a unigram document-term matrix.

    Parameters
    ----------
    documents : iterable of str
        The texts to vectorize; passed straight to ``fit_transform``.
    feature_type : str
        One of ``'binary'`` (term presence), ``'frequency'`` (term counts)
        or ``'tfidf'``. Case and surrounding whitespace are ignored.

    Returns
    -------
    tuple
        ``(vectorizer, feature_matrix)``: the fitted vectorizer and the
        matrix it produced, cast to float.

    Raises
    ------
    ValueError
        If ``feature_type`` is not one of the supported values.
    """
    feature_type = feature_type.lower().strip()

    # Validate before constructing anything so a bad argument fails fast with
    # a specific exception type (ValueError is still caught by `except Exception`).
    if feature_type not in ('binary', 'frequency', 'tfidf'):
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1,
                                     ngram_range=(1, 1))
    else:
        # 'binary' and 'frequency' differ only in the `binary` flag.
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'), min_df=1,
                                     ngram_range=(1, 1))

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix


from scipy.sparse.linalg import svds
    
def low_rank_svd(matrix, singular_count=2):
    """Compute a truncated SVD of ``matrix`` with ``scipy.sparse.linalg.svds``.

    ``singular_count`` is forwarded as ``k``, the number of singular
    values/vectors to compute. Returns the ``(u, s, vt)`` triple from ``svds``.
    """
    left_vectors, singular_values, right_vectors_t = svds(matrix, k=singular_count)
    return left_vectors, singular_values, right_vectors_t




In [10]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
import unicodedata

# English stop-word list taken from NLTK's stopwords corpus
stopword_list = nltk.corpus.stopwords.words('english')
# single WordNetLemmatizer instance shared at module level
wnl = WordNetLemmatizer()
# module-level HTMLParser instance (can be passed to unescape_html as its parser argument)
html_parser = HTMLParser()

# split the report text into word tokens
def tokenize_text(text):
    """Return the NLTK word tokens of ``text``, each stripped of surrounding whitespace."""
    return [word.strip() for word in nltk.word_tokenize(text)]

# Match the short forms (contractions) used in the report and replace them with the full words
def expand_contractions(text, contraction_mapping):
    """Expand every contraction in ``text`` using ``contraction_mapping``.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contracted form (e.g. ``"can't"``) to its expansion.

    Returns
    -------
    str
        ``text`` with contractions expanded (matched case-insensitively) and
        every remaining apostrophe removed.

    Notes
    -----
    * Keys are tried longest first so ``"can't've"`` is not consumed as
      ``"can't"`` followed by a stray ``'ve``.
    * Only the *case* of the first matched character is carried over to the
      expansion; the character itself is not spliced in, so ``"ain't"``
      becomes ``"is not"`` rather than ``"as not"``.
    """
    # Ignore empty keys: an empty alternative would match at every position.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)

    expanded_text = text
    if ordered_keys:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE|re.DOTALL)
        # Last-resort lookup for mappings whose keys are not lower-case.
        lowercase_lookup = {key.lower(): value
                            for key, value in contraction_mapping.items()}

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower())
                                    or lowercase_lookup.get(match.lower()))
            if not expanded_contraction:
                # Nothing usable to substitute: leave the matched text alone.
                return match
            first_char = match[0]
            if first_char.isupper():
                return expanded_contraction[0].upper() + expanded_contraction[1:]
            if first_char.islower():
                return expanded_contraction[0].lower() + expanded_contraction[1:]
            # Match starts with a non-letter (e.g. "'cause"): use the expansion as is.
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, unknown contractions).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
    

from pattern.en import tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` and return ``(lower-cased word, WordNet POS or None)`` pairs.

    Tags produced by ``tag`` are mapped by their first letter:
    J -> ``wn.ADJ``, V -> ``wn.VERB``, N -> ``wn.NOUN``, R -> ``wn.ADV``;
    any other tag maps to ``None``.
    """
    def penn_to_wn_tags(pos_tag):
        prefix_table = (
            ('J', wn.ADJ),
            ('V', wn.VERB),
            ('N', wn.NOUN),
            ('R', wn.ADV),
        )
        for prefix, wordnet_tag in prefix_table:
            if pos_tag.startswith(prefix):
                return wordnet_tag
        return None

    annotated_tokens = []
    for word, pos_tag in tag(text):
        annotated_tokens.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return annotated_tokens
    
# lemmatize text based on POS tags
def lemmatize_text(text):
    """Return ``text`` as a space-joined string of lemmatized, lower-cased tokens.

    Tokens whose POS tag maps to a WordNet category are lemmatized with that
    category; tokens without one are kept as returned by ``pos_tag_text``.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)
    

# to eliminate special characters (punctuation) from the report
def remove_special_characters(text):
    """Replace every ``string.punctuation`` character in ``text`` with a space.

    The text is tokenized first. Within each token punctuation becomes a
    space, so a token such as ``"kinin-related"`` yields two words, while a
    token made only of punctuation yields nothing.

    Returns the remaining words joined by single spaces.
    """
    tokens = tokenize_text(text)
    # character class covering all of string.punctuation: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = []
    for token in tokens:
        # split() trims the token and collapses the gaps left by the replaced
        # punctuation; a punctuation-only token contributes no words at all.
        filtered_tokens.extend(pattern.sub(' ', token).split())
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
    
# to eliminate stop words which do not provide any useful info
def remove_stopwords(text):
    """Return ``text`` re-joined with single spaces after dropping tokens found in ``stopword_list``.

    The comparison is an exact, case-sensitive membership test.
    """
    kept_tokens = []
    for token in tokenize_text(text):
        if token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# to convert HTML character references (e.g. &amp;, &lt;) back into plain characters
def unescape_html(parser, text):
    """Return ``text`` with HTML character references unescaped.

    Parameters
    ----------
    parser : object or None
        Legacy parser object. Its ``unescape`` method is used when it has one;
        otherwise the standard-library ``html.unescape`` is used.
        (``HTMLParser.unescape`` no longer exists on Python 3.9+.)
    text : str
        Text that may contain character references.
    """
    import html

    legacy_unescape = getattr(parser, 'unescape', None)
    if callable(legacy_unescape):
        return legacy_unescape(text)
    return html.unescape(text)

#normalization of text
def normalize_corpus(corpus, lemmatize=True,tokenize=False):
    """Normalize every text in ``corpus`` and return the results as a list.

    Steps applied to each text, in order:
      1. unescape HTML character references,
      2. expand contractions using ``CONTRACTION_MAP``,
      3. lemmatize (``lemmatize=True``) or just lower-case the text,
      4. replace punctuation,
      5. remove stop words.

    Parameters
    ----------
    corpus : iterable of str
    lemmatize : bool
        Lemmatize with ``lemmatize_text`` instead of only lower-casing.
    tokenize : bool
        When true each result is a list of tokens instead of a string.
    """
    # html.unescape replaces HTMLParser.unescape, which no longer exists on
    # Python 3.9+.
    import html

    normalized_corpus = []
    for text in corpus:
        text = html.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
        normalized_corpus.append(text)

    return normalized_corpus



# split a raw document into a list of sentences
def parse_document(document):
    """Split ``document`` into stripped sentences.

    Newlines are replaced by spaces, the text is stripped, and the result is
    split with ``nltk.sent_tokenize``.

    Parameters
    ----------
    document : str

    Returns
    -------
    list of str
        The sentences, each stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If ``document`` is not a ``str``.
    """
    # Check the type before touching the value so a wrong input produces the
    # intended ValueError instead of a TypeError from re.sub. On Python 3
    # every str is already unicode, so no separate unicode branch is needed.
    if not isinstance(document, str):
        raise ValueError('Document is not string or unicode!')
    document = re.sub('\n', ' ', document)
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

In [11]:
import numpy as np
import docx2txt
import glob
from gensim.summarization import summarize, keywords

def text_summarization_gensim(text, summary_ratio=0.5):
    """Print the summary that gensim's ``summarize`` produces for ``text``, one item per line.

    ``summary_ratio`` is forwarded as the ``ratio`` argument; ``split=True``
    is passed so the summary is iterated item by item. Nothing is returned.
    """
    for summary_sentence in summarize(text, split=True, ratio=summary_ratio):
        print (summary_sentence)
    
    
def lsa_text_summarizer(documents, num_sentences=2,num_topics=2, feature_type='frequency',sv_threshold=0.5,
                        original_sentences=None):
    """Print the most salient sentences according to an SVD of the term-document matrix.

    Parameters
    ----------
    documents : list of str
        Sentences to score (normally the normalized sentences).
    num_sentences : int
        How many sentences to print.
    num_topics : int
        Number of singular values/vectors requested from ``low_rank_svd``.
    feature_type : str
        Forwarded to ``build_feature_matrix``.
    sv_threshold : float
        Singular values below ``max(s) * sv_threshold`` are zeroed before scoring.
    original_sentences : sequence of str, optional
        Sentences to print, index-aligned with ``documents``. When omitted,
        a module-level ``sentences`` variable is used if it exists (the
        previous behaviour), otherwise ``documents`` itself.

    The selected sentences are printed in document order; nothing is returned.
    """
    if original_sentences is None:
        # Backward-compatible fallback: older callers relied on a global
        # named `sentences` being defined before this function was called.
        original_sentences = globals().get('sentences', documents)

    _, dt_matrix = build_feature_matrix(documents, feature_type)

    td_matrix = dt_matrix.transpose()
    # keep only the strictly positive entries
    td_matrix = td_matrix.multiply(td_matrix > 0)

    _, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0

    # per-sentence score: sqrt(sum_k s_k^2 * vt[k, j]^2)
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    # take the highest-scoring indices, then restore document order
    top_sentence_indices = np.sort(salience_scores.argsort()[-num_sentences:])

    for index in top_sentence_indices:
        print (original_sentences[index])
    
    
    

import networkx

def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency',
                             original_sentences=None):
    """Print the top-ranked sentences according to PageRank over a sentence-similarity graph.

    Parameters
    ----------
    documents : list of str
        Sentences to rank (normally the normalized sentences).
    num_sentences : int
        How many sentences to print; capped at the number of sentences available.
    feature_type : str
        Forwarded to ``build_feature_matrix``. Earlier versions ignored this
        argument and always used ``'tfidf'``; pass ``'tfidf'`` explicitly to
        keep that behaviour.
    original_sentences : sequence of str, optional
        Sentences to print, index-aligned with ``documents``. When omitted,
        a module-level ``sentences`` variable is used if it exists (the
        previous behaviour), otherwise ``documents`` itself.

    The selected sentences are printed in document order; nothing is returned.
    """
    if original_sentences is None:
        # Backward-compatible fallback: older callers relied on a global
        # named `sentences` being defined before this function was called.
        original_sentences = globals().get('sentences', documents)

    # Use the arguments that were passed in, not notebook globals.
    _, dt_matrix = build_feature_matrix(documents,
                                        feature_type=feature_type)
    similarity_matrix = (dt_matrix * dt_matrix.T)

    # networkx 3.x removed from_scipy_sparse_matrix in favour of
    # from_scipy_sparse_array; use whichever this installation provides.
    graph_from_sparse = (getattr(networkx, 'from_scipy_sparse_array', None)
                         or networkx.from_scipy_sparse_matrix)
    similarity_graph = graph_from_sparse(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)

    ranked_sentences = sorted(((score, index)
                                for index, score
                                in scores.items()),
                              reverse=True)

    # Slicing never raises when num_sentences exceeds the sentence count.
    top_sentence_indices = sorted(index for _, index
                                  in ranked_sentences[:num_sentences])

    for index in top_sentence_indices:
        print (original_sentences[index])
    


In [12]:
# Paths of the .docx reports to summarize. glob returns matches in arbitrary
# order, so sort them to keep the "documentN" numbering stable between runs.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
l = sorted(glob.glob(r"C:\Users\jaswanth\Desktop\Mini Project\*.docx"))

In [13]:
# Summarize every report in `l`; `x` is the 1-based document number shown in the headers.
for x, i in enumerate(l, start=1):
    my_text = docx2txt.process(i)
    # `sentences` has to keep this name: the summarizer prints its result
    # from the module-level `sentences` variable.
    sentences = parse_document(my_text)
    norm_sentences = normalize_corpus(sentences,lemmatize=False)
    print ("Total Sentences:", len(norm_sentences))

    # LSA summarization is currently disabled; to enable it, uncomment:
    # print ("\n---------lsa summarization for document" + str(x) + "--------")
    # lsa_text_summarizer(norm_sentences, num_sentences=1,
    #                     num_topics=2, feature_type='frequency',
    #                     sv_threshold=0.5)

    print ("---------text-rank summarization for document" + str(x) + "--------")
    textrank_text_summarizer(norm_sentences, num_sentences=1,
                             feature_type='tfidf')



Total Sentences: 58
---------text-rank summarization for document1--------
The patient also does not have any history of autoimmune disease or any reaction similar to this in the past   It is more likely that the etiology is kinin-related where angioedema results from generation of bradykinin and complement-derived mediators that increase vascular permeability since there is no urticaria or pruritis.
Total Sentences: 141
---------text-rank summarization for document2--------
Problem List    Crohn's disease flare (abdominal pain, nausea, vomiting, diarrhea)  Adenocarcinoma of terminal ileum, s/p resection 1998  hx of small bowel obstruction secondary to Crohn’s Disease  DM  HTN  hx of DVT and PE, 2001  PUD  GERD  COPD  Posttraumatic stress disorder  Bipolar disorder  hx of multiple suicide attempts  insomnia  chronic abdominal, back, and left knee pain    osteoarthritis of knee joints  use of cane for walking  nicotine dependence and abuse  hx of narcotic seeking behavior  poor dentitio




Total Sentences: 58
---------text-rank summarization for document4--------
Med W H&P    	new onset of fever, HTN, rigidity and altered mental status    HPI: Mr. -- is an 82 yo gentleman with a history of Alzheimer's dementia, pseudogout, hearing loss and possible PMR who was admitted to Med A and then Psych on 11/16 for aggressive behavior and altered mental status displayed at Carolina Meadows.




Total Sentences: 193
---------text-rank summarization for document5--------
Musculoskeletal pain    Musculoskeletal chest pain must be differentiated from potentially life-threatening causes of chest pain such as MI, PE, or aortic dissection.
Total Sentences: 122
---------text-rank summarization for document6--------




Timothy P. Moran  Patient H&P #14    	Bilateral knee pain    HPI:    The patient is a 24 yo African-American man with h/o sickle cell disease who presented to the ED with a 2 day h/o bilateral knee pain.
Total Sentences: 140
---------text-rank summarization for document7--------




Problem List  	LUNG MASS  	DYSPNEA ON EXERTION  	CHEST PAIN/ HEARTBURN/ TIGHTNESS  	COUGHING/VOMITING  	DECREASED PO INTAKE/WEIGHT LOSS  	SMOKING Hx/NICOTINE ADDICTION    	EtOH INTAKE  	LEUKOCYTOSIS  	FAMILY Hx + for DM  	NO PRIMARY CARE PROVIDER/REGULAR HEALTH CARE      Assessment and Recommendation    Patient is a 51 year old gentleman with no significant past medical history presenting with 3 weeks of dyspnea on light exertion and a 10 lb weight loss in 8 days.
Total Sentences: 118
---------text-rank summarization for document8--------




September 16, 2007      	Chest pain, SOB    HPI    	is a 47 yo African-American woman with a history of uncontrolled HTN, recently diagnosed CHF and dilated cardiomyopathy, and polysubstance abuse who came to the ED at 8:00am this morning c/o chest pain and SOB that started approximately 12 hours prior to presentation.




Total Sentences: 155
---------text-rank summarization for document9--------
Problem List:    	Dysarthria, right sided weakness, AMS, possible incontinence  	CT of head with right side infarct of indeterminate age  	Low K at 3.2    	HTN, uncontrolled  	Hyperlipidemia  	DM type 2  	Cocaine, marijuana, and tobacco abuse  	Residual right sided weakness requiring the use of a cane  	Financial situation limiting ability to attain medications, med noncompliance    Assessment:    This is a 61 yo gentleman with h/o CVA, HTN, hyperlipidemia, seizure d/o, cocaine abuse, and medication noncompliance who presents with increasing right sided weakness and dysarthria of unknown duration less than one day, concerning for TIA versus CVA.
Total Sentences: 28
---------text-rank summarization for document10--------




MEDICAL HISTORY  	Adult Illnesses:    	Polycythemia Vera – diagnosed incidentally three years ago.
Total Sentences: 124
---------text-rank summarization for document11--------
When the patient is looking up, the right eye does not move up as well as the left.
