# Get Prerequisites

In [1]:
! pip install wikipedia
! pip install tabulate

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=509ecfa047136bee21e95e4c1ca08b5e869ba4ca3e73e43a9cf768635f91dda7
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [2]:
import nltk
import spacy
import wikipedia
nltk.download('punkt')
nltk.download('stopwords')
from tabulate import tabulate

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Extractive Summarization Part

In [3]:

def find_similarity(sentence1:list, sentence2:list, stopwords=None):
  # Removing all stop words
  filtered_sentence1 = [word.lower() for word in sentence1 if word not in stopwords]
  filtered_sentence2 = [word.lower() for word in sentence2 if word not in stopwords]


  # get unique words
  unique_words = list()
  unique_words.extend(filtered_sentence1)
  unique_words.extend(filtered_sentence2)

  unique_words = list(set(unique_words)) # use set to ensure only one case of each word

  # Get frequency information
  frequency_1 = dict()
  frequency_2 = dict()

  for word in filtered_sentence1:
    frequency_1[word]  = frequency_1.get(word, 0) +1;

  for word in filtered_sentence2:
    frequency_2[word]  = frequency_2.get(word, 0) +1;

  # Create vectors
  size_unique_words = len(unique_words)
  vector_sentence1 = [0 for i in unique_words]
  vector_sentence2 = [0 for i in unique_words]

  # Assigning frequencies to vectors
  for word,frequency in frequency_1.items():
    vector_sentence1[unique_words.index(word)] = frequency

  for word,frequency in frequency_2.items():
    vector_sentence2[unique_words.index(word)] =frequency
  # Calculate similarity between vectors
  return 1-cosine_distance(vector_sentence1, vector_sentence2)

In [4]:
def construct_similarity_matrix(all_sentences, stopwords):
  # Initialize matrix
  matrix = [

            [0 for i in range(len(all_sentences))]
            for j in range(len(all_sentences))
  ]
  # Iterate over all sentences
  for i in range(len(all_sentences)):
    for j in range(len(all_sentences)):
      # Calculate similarity score for each sentence pair
      if(i != j):
        matrix[i][j] = find_similarity(all_sentences[i], all_sentences[j], stopwords)
      else:
        # Skip same sentence pairs.
        continue
  return matrix

In [5]:
def text_rank(similarity_matrix):
    # Initialize scores
    sentence_count = len(similarity_matrix)
    scores = [ 1 for i in range(sentence_count)]
    # Set damping factors
    damping_factor = 0.85
    epsilon = 1e-5

    for x in range(100):
        # Initialize prior scores
        prior_scores = scores.copy()
        for i in range(sentence_count):
            # Calculate new score
            new_score = (1 - damping_factor)

            sum_of_products = 0
            for j in range(sentence_count):
                product = similarity_matrix[j][i] * prior_scores[j]
                sum_of_products += product


            new_score += (damping_factor * sum_of_products)

            # Assign new score
            scores[i] = new_score
        # Check if the change a minute change.
        if sum(abs(scores[i] - prior_scores[i]) for i in range(sentence_count)) < epsilon:
            break

    return scores

In [6]:
def extract_summary(text, top_n=5):

    stop_words = set(stopwords.words('english'))

    # Get the sentences from the original text
    sentences = sent_tokenize(text)

    # Get the similarity matrix from all sentences
    sentence_similarity_matrix = construct_similarity_matrix(sentences, stop_words)

    # Get the scores for each sentence
    scores = text_rank(sentence_similarity_matrix)

    # Put the sentences in decsending order based on the score
    ranked_sentences = []
    for i, s in enumerate(sentences):
        ranked_sentences.append((scores[i], s))

    ranked_sentences = sorted(ranked_sentences, reverse=True)

    # Get the summary sentences and make a combine string.
    summary = [sentence for score, sentence in ranked_sentences[:top_n]]
    return " ".join(summary)

# Load NER Model

In [7]:
model_path = "ner_model"
nlp = spacy.load(model_path)

# Code for getting Wikipedia Paragraphs

In [8]:
from bs4 import BeautifulSoup

def get_wikipedia_text(term,label):
  try:
    # print("Search Results : ", wikipedia.search(term))
    page = wikipedia.page(wikipedia.search(term)[0], auto_suggest=False)
    paragraphs = page.content.split("\n")
    return " ".join(paragraphs[0:5])
  except Exception as e:
    #  print("AHHH...")
    # Ensure that text of most relvance is retrieved
    page = wikipedia.page(wikipedia.search(term + " " + label)[0], auto_suggest=True)
    paragraphs = page.content.split("\n")
    return " ".join(paragraphs[0:5])

# GUI Component and NLP Implementation

In [9]:
headings = ["Term ", "Summary"]
term_summary = list()
# !pip install tqdm
from tqdm import tqdm

from ipywidgets import Text, Button, VBox

text = Text(description='Enter Text')
button = Button(description="Execute")

def on_click(b):
  doc = nlp(text.value)
  print("\n\nNER Model In Action : ")
  colors = {
    "SIGN_SYMPTOM": "#FF8080",
    "DISEASE_DISORDER": "#FFCF96",
    "MEDICATION": "#F6FDC3",
    "DIAGNOSTIC_PROCEDURE": "#CDFAD5",
    "BIOLOGICAL_STRUCTURE": "#2C4E80",
  }

  spacy.displacy.render(doc, style="ent", jupyter=True, options = {"colors": colors})
  uniques_ents = set(doc.ents)


  for ent in tqdm(uniques_ents):
  # for ent in (uniques_ents):

    wiki_text = get_wikipedia_text(ent.text, ent.label_)
    term_summary.append([ent.text, extract_summary(wiki_text, 2)])
    # print(ent.label_)

  table = tabulate(term_summary, headers=headings, tablefmt="grid")
  print("\nEnd Result : ")
  print(table)

button.on_click(on_click)

VBox([text, button])
# Patient John Doe, a 45-year-old male, presented with a persistent cough, fever, and shortness of breath. The doctor ordered a chest X-ray and blood tests to diagnose the underlying condition. The X-ray revealed signs of pneumonia, and the blood work showed elevated white blood cell count, indicating an infection. The patient was prescribed a course of Azithromycin, an antibiotic, to treat the bacterial pneumonia. Additionally, the doctor recommended taking Ibuprofen to alleviate the fever and body aches associated with the illness. Mr. Doe has a history of hypertension and is currently on Lisinopril to manage his high blood pressure. He also has Type 2 diabetes mellitus and takes Metformin regularly to control his blood sugar levels. During the follow-up visit, the physician noted that the patient's symptoms had improved, and the pneumonia was resolving. However, the doctor advised Mr. Doe to complete the entire course of antibiotics as prescribed to prevent a relapse. Furthermore, the doctor recommended a pulmonary function test to evaluate the patient's lung capacity and rule out any underlying chronic respiratory conditions, such as asthma or chronic obstructive pulmonary disease (COPD). In addition to the medical conditions, the patient reported experiencing occasional heartburn and gastric discomfort. The doctor suggested taking an over-the-counter antacid like Omeprazole to manage the symptoms of acid reflux. Overall, with proper treatment and medication management, the patient's condition is expected to improve, and the risk of complications should be minimized.

VBox(children=(Text(value='', description='Enter Text'), Button(description='Execute', style=ButtonStyle())))



NER Model In Action : 




  lis = BeautifulSoup(html).find_all('li')
100%|██████████| 31/31 [00:28<00:00,  1.08it/s]


End Result : 
+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Term                     | Summary                                                                                                                                                                                                                                                                                                                           


