<a href="https://colab.research.google.com/github/Shivii-Jain/Text-Summarizer/blob/main/Text_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install nltk
!pip install spacy

In [2]:
# import libraries
import sys
import math
import bs4 as bs
import urllib.request
import re
import PyPDF2
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
import os
from PyPDF2 import PdfReader

In [3]:
# Load Spacy model
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()

Define functions for Reading Input Text

In [4]:
# Function to Read .txt File and return its Text
def file_text(filepath):
    with open(filepath) as f:
        text = f.read().replace("\n", '')
        return text

In [5]:
# Function to Read PDF File and return its Text
def pdfReader(pdf_path):
    pdfReader = PdfReader(pdf_path)
    count = len(pdfReader.pages)
    print("\nTotal Pages in pdf = ", count)

    c = 'Y'
    start_page = 0
    end_page = count - 1
    c = input("Do you want to read entire pdf ?[Y]/N  :  ")
    if c == 'N' or c == 'n':
        start_page = int(input("Enter start page number (Indexing starts from 0) :  "))
        end_page = int(input(f"Enter end page number (Less than {count}) : "))

        if start_page < 0 or start_page >= count:
            print("\nInvalid Start page given")
            sys.exit()

        if end_page < 0 or end_page >= count:
            print("\nInvalid End page given")
            sys.exit()

    text = ""
    for i in range(start_page, end_page + 1):
        page = pdfReader.pages[i]
        text += page.extract_text()

    return text

In [6]:
# Function to Read Wikipedia page URL and return its Text
def wiki_text(url):
    scrap_data = urllib.request.urlopen(url)
    article = scrap_data.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')

    paragraphs = parsed_article.find_all('p')
    article_text = ""

    for p in paragraphs:
        article_text += p.text

    # Removing all unwanted characters
    article_text = re.sub(r'\[[0-9]*\]','', article_text)
    return article_text

Getting Text

In [7]:
import math
import spacy
from nltk.stem import WordNetLemmatizer
from google.colab import files

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
def summarize_text(input_text_type, input_data=None):
    # Step 1: Get text input
    if input_text_type == 1:  # Typed input
        text = input_data
    elif input_text_type == 2:  # File input
        print("Upload your file (.txt or .pdf)")
        uploaded = files.upload()
        file_path = list(uploaded.keys())[0]
        if file_path.endswith(".txt"):
            text = file_text(file_path)
        elif file_path.endswith(".pdf"):
            text = pdfReader(file_path)
        else:
            print("Unsupported file type!")
            return None, None
    elif input_text_type == 4:  # Wikipedia URL
        text = wiki_text(input_data)
    else:
        print("Invalid input type!")
        return None, None

    # Step 2: Preprocess text and split into sentences
    original_words = [w for w in text.split() if w.isalnum()]
    num_words_in_original_text = len(original_words)
    text = nlp(text)
    sentences = list(text.sents)
    total_sentences = len(sentences)

    # Helper functions for processing
    def frequency_matrix(sentences):
        freq_matrix = {}
        stopWords = nlp.Defaults.stop_words
        for sent in sentences:
            freq_table = {}
            words = [word.text.lower() for word in sent if word.text.isalnum()]
            for word in words:
                word = lemmatizer.lemmatize(word)
                if word not in stopWords:
                    freq_table[word] = freq_table.get(word, 0) + 1
            freq_matrix[sent[:15]] = freq_table
        return freq_matrix

    def tf_matrix(freq_matrix):
        tf_matrix = {}
        for sent, freq_table in freq_matrix.items():
            tf_table = {word: count / len(freq_table) for word, count in freq_table.items()}
            tf_matrix[sent] = tf_table
        return tf_matrix

    def sentences_per_words(freq_matrix):
        sent_per_words = {}
        for _, freq_table in freq_matrix.items():
            for word in freq_table.keys():
                sent_per_words[word] = sent_per_words.get(word, 0) + 1
        return sent_per_words

    def idf_matrix(freq_matrix, sent_per_words, total_sentences):
        idf_matrix = {}
        for sent, freq_table in freq_matrix.items():
            idf_table = {word: math.log10(total_sentences / float(sent_per_words[word])) for word in freq_table.keys()}
            idf_matrix[sent] = idf_table
        return idf_matrix

    def tf_idf_matrix(tf_matrix, idf_matrix):
        return {
            sent: {word: tf_value * idf_matrix[sent][word] for word, tf_value in freq_table.items()}
            for sent, freq_table in tf_matrix.items()
        }

    def score_sentences(tf_idf_matrix):
        return {
            sent: sum(scores.values()) / len(scores) if scores else 0
            for sent, scores in tf_idf_matrix.items()
        }

    def average_score(sentence_scores):
        return sum(sentence_scores.values()) / len(sentence_scores)

    def create_summary(sentences, sentence_scores, threshold):
        return " ".join(
            sentence.text for sentence in sentences
            if sentence[:15] in sentence_scores and sentence_scores[sentence[:15]] >= threshold
        )

    # Step 3: Generate summary
    freq_matrix_result = frequency_matrix(sentences)
    tf_matrix_result = tf_matrix(freq_matrix_result)
    sent_per_words_result = sentences_per_words(freq_matrix_result)
    idf_matrix_result = idf_matrix(freq_matrix_result, sent_per_words_result, total_sentences)
    tf_idf_matrix_result = tf_idf_matrix(tf_matrix_result, idf_matrix_result)
    sentence_scores_result = score_sentences(tf_idf_matrix_result)
    threshold = 1.3 * average_score(sentence_scores_result)
    summary = create_summary(sentences, sentence_scores_result, threshold)

    # Step 4: Display results
    summary_stats = {
        "original_word_count": num_words_in_original_text,
        "summary_word_count": len(summary.split()),
    }

    print("\n\n", "*" * 20, "Summary", "*" * 20, "\n")
    print(summary)
    print("\n\n", "Original Words:", summary_stats['original_word_count'],
          "| Summary Words:", summary_stats['summary_word_count'])

    return summary, summary_stats


In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

**Summarize from typed input**

summary, stats = summarize_text(1, "Your input text here")

**Summarize from a file**

summary, stats = summarize_text(2)

**Summarize from a Wikipedia URL**

summary, stats = summarize_text(4,"link")

In [10]:
summary,stats = summarize_text(1,"Topic sentences are similar to mini thesis statements. Like a thesis statement, a topic sentence has a specific main point. Whereas the thesis is the main point of the essay, the topic sentence is the main point of the paragraph. Like the thesis statement, a topic sentence has a unifying function. But a thesis statement or topic sentence alone doesn’t guarantee unity. An essay is unified if all the paragraphs relate to the thesis, whereas a paragraph is unified if all the sentences relate to the topic sentence. Note: Not all paragraphs need topic sentences. In particular, opening and closing paragraphs, which serve different functions from body paragraphs, generally don’t have topic sentences.")



 ******************** Summary ******************** 

An essay is unified if all the paragraphs relate to the thesis, whereas a paragraph is unified if all the sentences relate to the topic sentence.


 Original Words: 95 | Summary Words: 26


In [None]:
summary, stats = summarize_text(2)