# Code for Extractive Summarization


In [None]:
import nltk
import os, csv
nltk.download('punkt')
nltk.download('stopwords')
# from tabulate import tabulate

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance


from tqdm import tqdm
# CHANGE THIS
base_path = "TestSummarizer"
folder_path = "TestSummarizer/business_summary"

In [2]:

def find_similarity(sentence1:list, sentence2:list, stopwords=None):

  # Removing all stop words
  filtered_sentence1 = [word.lower() for word in sentence1 if word not in stopwords]
  filtered_sentence2 = [word.lower() for word in sentence2 if word not in stopwords]


  # get unique words
  unique_words = list()
  unique_words.extend(filtered_sentence1)
  unique_words.extend(filtered_sentence2)

  unique_words = list(set(unique_words)) # use set to ensure only one case of each word

  # Get frequency information
  frequency_1 = dict()
  frequency_2 = dict()

  for word in filtered_sentence1:
    frequency_1[word]  = frequency_1.get(word, 0) +1;

  for word in filtered_sentence2:
    frequency_2[word]  = frequency_2.get(word, 0) +1;

  # Create vectors
  size_unique_words = len(unique_words)
  vector_sentence1 = [0 for i in unique_words]
  vector_sentence2 = [0 for i in unique_words]

  # Assigning frequencies to vectors
  for word,frequency in frequency_1.items():
    vector_sentence1[unique_words.index(word)] = frequency

  for word,frequency in frequency_2.items():
    vector_sentence2[unique_words.index(word)] =frequency
  # Calculate similarity between vectors
  return 1-cosine_distance(vector_sentence1, vector_sentence2)

In [3]:
def construct_similarity_matrix(all_sentences, stopwords):
  # Initialize matrix
  matrix = [

            [0 for i in range(len(all_sentences))]
            for j in range(len(all_sentences))
  ]
  # Iterate over all sentences
  for i in range(len(all_sentences)):
    for j in range(len(all_sentences)):
      # Calculate similarity score for each sentence pair
      if(i != j):
        matrix[i][j] = find_similarity(all_sentences[i], all_sentences[j], stopwords)
      else:
        # Skip same sentence pairs.
        continue
  return matrix

In [4]:
def text_rank(similarity_matrix):
    # Initialize scores
    sentence_count = len(similarity_matrix)
    scores = [ 1 for i in range(sentence_count)]
    # Set damping factors
    damping_factor = 0.85
    epsilon = 1e-5

    for x in range(100):
        # Initialize prior scores
        prior_scores = scores.copy()
        for i in range(sentence_count):
            # Calculate new score
            new_score = (1 - damping_factor)

            sum_of_products = 0
            for j in range(sentence_count):
                product = similarity_matrix[j][i] * prior_scores[j]
                sum_of_products += product


            new_score += (damping_factor * sum_of_products)

            # Assign new score
            scores[i] = new_score
        # Check if the change a minute change.
        if sum(abs(scores[i] - prior_scores[i]) for i in range(sentence_count)) < epsilon:
            break

    return scores

In [5]:
def extract_summary(text, top_n=5):

    stop_words = set(stopwords.words('english'))

    # Get the sentences from the original text
    sentences = sent_tokenize(text)

    # Get the similarity matrix from all sentences
    sentence_similarity_matrix = construct_similarity_matrix(sentences, stop_words)

    # Get the scores for each sentence
    scores = text_rank(sentence_similarity_matrix)

    # Put the sentences in decsending order based on the score
    ranked_sentences = []
    for i, s in enumerate(sentences):
        ranked_sentences.append((scores[i], s))

    ranked_sentences = sorted(ranked_sentences, reverse=True)

    # Get the summary sentences and make a combine string.
    summary = [sentence for score, sentence in ranked_sentences[:top_n]]
    return " ".join(summary)

# Confimation Test summarizer works

In [6]:
text = "Patient John Doe, a 45-year-old male, presented with a persistent cough, fever, and shortness of breath. The doctor ordered a chest X-ray and blood tests to diagnose the underlying condition. The X-ray revealed signs of pneumonia, and the blood work showed elevated white blood cell count, indicating an infection. The patient was prescribed a course of Azithromycin, an antibiotic, to treat the bacterial pneumonia. Additionally, the doctor recommended taking Ibuprofen to alleviate the fever and body aches associated with the illness. Mr. Doe has a history of hypertension and is currently on Lisinopril to manage his high blood pressure. He also has Type 2 diabetes mellitus and takes Metformin regularly to control his blood sugar levels. During the follow-up visit, the physician noted that the patient's symptoms had improved, and the pneumonia was resolving. However, the doctor advised Mr. Doe to complete the entire course of antibiotics as prescribed to prevent a relapse. Furthermore, the doctor recommended a pulmonary function test to evaluate the patient's lung capacity and rule out any underlying chronic respiratory conditions, such as asthma or chronic obstructive pulmonary disease (COPD). In addition to the medical conditions, the patient reported experiencing occasional heartburn and gastric discomfort. The doctor suggested taking an over-the-counter antacid like Omeprazole to manage the symptoms of acid reflux. Overall, with proper treatment and medication management, the patient's condition is expected to improve, and the risk of complications should be minimized."
text += text;
print(len(word_tokenize(text)))
summary = extract_summary(text, top_n=2)
print(summary)

544


NameError: name 'numpy' is not defined

# Methods for testing

In [None]:
def get_summary_and_original_text(base_path, filename):
  # get orginal text
  original_path = base_path+"//business_text/"+filename
  summary_path = base_path+"/business_summary/"+filename

  original_fhand = open(original_path, 'r')
  summary_fhand = open(summary_path, 'r')

  original_text = "";
  summary_text = "";

  for original_line in original_fhand.readlines():
    original_text += original_line.strip()

  for summary_line in summary_fhand.readlines():
    summary_text += summary_line.strip()

  original_fhand.close()
  summary_fhand.close()

  return (original_text, summary_text)

In [None]:
def make_summary(original_text, max_word_count):
  sentence_count = 1
  test_summary = extract_summary(original_text, top_n=sentence_count)

  while(len(word_tokenize(test_summary)) < max_word_count):
    sentence_count += 1
    test_summary = extract_summary(original_text, top_n=sentence_count)

  return test_summary

In [None]:
def get_summary_similarity( original_text, summary_text):
  created_summary = word_tokenize(make_summary(original_text, len(word_tokenize(summary_text))))
  base_summary_words = word_tokenize(summary_text)
  stop_words = set(stopwords.words('english'))
  return  find_similarity(created_summary, base_summary_words,stop_words)



In [None]:
# Get a list of all files in the folder
files_in_folder = os.listdir(folder_path)

# Print the names of all files in the folder
count = 0
sum_score = 0
all_files =[["file_name", "score"]]

for file_name in tqdm(files_in_folder):
    original_text, summary_text = get_summary_and_original_text(base_path, file_name)
    score = get_summary_similarity(original_text, summary_text)
    all_files.append([file_name, score])

    count += 1
    sum_score += score



print("\n", (sum_score/count))


100%|██████████| 510/510 [01:55<00:00,  4.40it/s]


 0.7351392667558622





In [None]:
csv_file = "summary_scores.csv"
csv_fhand = open(csv_file, "w", newline="")
writer = csv.writer(csv_fhand)
writer.writerows(all_files)
csv_fhand.close()
