<a href="https://colab.research.google.com/github/SaiSpandanatumu26/text-summarisation-using-NLP/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import torch

# Download the NLTK sentence-tokenizer data used by nltk.sent_tokenize.
# Newer NLTK releases load the "punkt_tab" resource instead of the pickled
# "punkt" models, so fetch both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

class HybridSummarizer:
    """Two-stage text summarizer: TF-IDF sentence extraction, then BART.

    The extractive stage keeps the highest-scoring sentences of the input so
    the abstractive stage (a Hugging Face ``summarization`` pipeline) receives
    a shorter text.
    """

    def __init__(self, model_name="facebook/bart-large-cnn", device=None):
        """Load the BART model and tokenizer and build the pipeline.

        Args:
            model_name: Hugging Face model id (or local path) of the BART
                checkpoint to load.
            device: Device passed to the pipeline (e.g. ``0`` for the first
                GPU, ``-1`` for CPU). When ``None``, the first CUDA device is
                used if one is available, otherwise the CPU.
        """
        if device is None:
            # Pass the device explicitly so that a detected GPU is actually
            # used instead of relying on the pipeline's default placement.
            device = 0 if torch.cuda.is_available() else -1

        # Load BART model for abstractive summarization
        self.model = BartForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.summarizer = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
            device=device,
        )

    def extractive_summary(self, text, num_sentences=5):
        """Reduce ``text`` to its ``num_sentences`` highest TF-IDF sentences.

        Each sentence is scored by the sum of its TF-IDF weights; the selected
        sentences are returned in their original order, joined by spaces.

        Args:
            text: The text to reduce.
            num_sentences: How many sentences to keep (must be >= 1).

        Returns:
            The reduced text, or ``text`` unchanged when it has no more than
            ``num_sentences`` sentences.

        Raises:
            ValueError: If ``num_sentences`` is smaller than 1.
        """
        # A zero/negative count would turn the `[-num_sentences:]` slice below
        # into "everything" or "all but the lowest N", so reject it up front.
        if num_sentences < 1:
            raise ValueError("num_sentences must be a positive integer")

        sentences = nltk.sent_tokenize(text)
        if len(sentences) <= num_sentences:
            return text  # If too few sentences, return original text

        # TF-IDF to rank sentence importance
        tfidf_vectorizer = TfidfVectorizer(stop_words="english")
        try:
            tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
        except ValueError:
            # scikit-learn raises ValueError when the vocabulary is empty
            # (e.g. the sentences contain only stop words or punctuation).
            # No ranking is possible, so keep the leading sentences.
            return " ".join(sentences[:num_sentences])

        # Score sentences based on TF-IDF and select the top ones
        sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)
        top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]

        # Sort to maintain the original order of the sentences
        top_sentence_indices = np.sort(top_sentence_indices)
        return " ".join(sentences[i] for i in top_sentence_indices)

    def abstractive_summary(self, text, max_length=150, min_length=50):
        """Summarize ``text`` with the BART summarization pipeline.

        The model handles up to 1024 tokens per input, so longer inputs are
        truncated by the tokenizer instead of failing inside the model.

        Args:
            text: The text to summarize.
            max_length: Maximum length of the generated summary, in tokens.
            min_length: Minimum length of the generated summary, in tokens.

        Returns:
            The generated summary string.
        """
        summary = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']

    def summarize(self, text, num_sentences=5, max_length=150, min_length=50):
        """Perform a hybrid summarization.

        1. Extractive phase: reduce the text using TF-IDF sentence ranking.
        2. Abstractive phase: pass the reduced text to BART.

        Args:
            text: The text to summarize.
            num_sentences: Number of sentences kept by the extractive phase.
            max_length: Maximum length of the final summary, in tokens.
            min_length: Minimum length of the final summary, in tokens.

        Returns:
            The final summary, or an empty string when ``text`` is empty or
            contains only whitespace.

        Raises:
            ValueError: If ``num_sentences`` is smaller than 1.
        """
        # Nothing to summarize: avoid asking the model to generate from
        # an empty input.
        if not text or not text.strip():
            return ""

        # Step 1: Extractive Summarization to reduce size
        print("Performing extractive summarization...")
        reduced_text = self.extractive_summary(text, num_sentences=num_sentences)

        # Step 2: Abstractive Summarization on reduced text
        print("Performing abstractive summarization...")
        final_summary = self.abstractive_summary(
            reduced_text, max_length=max_length, min_length=min_length
        )
        return final_summary

    def interactive_summary(self):
        """Prompt for text and parameters, then print the resulting summary."""
        print("\nEnter your text (press Enter twice to finish):")
        user_input = self._get_user_input()

        if not user_input.strip():
            # Skip the parameter prompts and the model call for empty input.
            print("\nNo text entered; nothing to summarize.")
            return

        print("\nEnter summary parameters (optional, press Enter to use defaults):")
        num_sentences, max_length, min_length = self._get_summary_params()

        print("\nGenerating summary...")
        summary = self.summarize(
            user_input,
            num_sentences=num_sentences,
            max_length=max_length,
            min_length=min_length,
        )
        print("\nSummary:")
        print(summary)

    def _get_user_input(self):
        """Read lines from standard input until an empty line or EOF.

        Returns:
            The entered lines joined with newlines (empty string if none).
        """
        lines = []
        while True:
            try:
                line = input()
            except EOFError:
                # End of input (e.g. piped stdin) finishes the text as well.
                break
            if not line:
                break
            lines.append(line)
        return "\n".join(lines)

    def _get_summary_params(self):
        """Ask the user for the summarization parameters.

        An empty answer selects the default for that parameter. If any answer
        is not an integer, or the values are unusable (non-positive counts, or
        a minimum length above the maximum length), all three defaults are
        used.

        Returns:
            A ``(num_sentences, max_length, min_length)`` tuple.
        """
        defaults = (5, 150, 50)
        fallback_message = (
            "Invalid input, using default values "
            "(Num Sentences: 5, Max Length: 150, Min Length: 50)"
        )
        try:
            num_sentences = int(input("Number of key sentences for extractive phase (default 5): ") or 5)
            max_length = int(input("Max Length for abstractive summary (default 150): ") or 150)
            min_length = int(input("Min Length for abstractive summary (default 50): ") or 50)
        except (ValueError, EOFError):
            print(fallback_message)
            return defaults

        # Integers that parse but make no sense would otherwise be passed on
        # to the extractive slice and the generation length limits.
        if num_sentences < 1 or max_length < 1 or not 0 <= min_length <= max_length:
            print(fallback_message)
            return defaults
        return num_sentences, max_length, min_length

# Report the available compute device (prints only; does not select a device)
def check_device():
    """
    Print which compute device is available: the name of the first CUDA
    GPU when one is detected, otherwise a notice that the CPU will be used.
    """
    if not torch.cuda.is_available():
        print("No GPU detected, using CPU. This may be slower.")
        return

    gpu_name = torch.cuda.get_device_name(0)
    print(f"Using GPU: {gpu_name}")

# Main function to run the summarizer
def main():
    """
    Run the interactive menu loop of the summarization program.

    Reports the available device, creates a single HybridSummarizer, then
    repeatedly shows the menu until the user chooses "2" or standard input
    ends (EOF).
    """
    check_device()

    print("\n*** Welcome to the Advanced Hybrid Text Summarizer ***\n")

    summarizer = HybridSummarizer()

    while True:
        print("\nChoose an option:")
        print("1. Summarize Text")
        print("2. Exit")
        try:
            # Strip so stray whitespace around the digit is not rejected.
            choice = input("Enter your choice: ").strip()
        except EOFError:
            # Closed stdin (piped input, ended session): leave the loop
            # cleanly instead of crashing with a traceback.
            print("\nExiting...")
            break

        if choice == "1":
            summarizer.interactive_summary()
        elif choice == "2":
            print("Exiting...")
            break
        else:
            print("Invalid choice. Please try again.")

# Entry point: start the interactive program only when this file is executed
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


No GPU detected, using CPU. This may be slower.

*** Welcome to the Advanced Hybrid Text Summarizer ***



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]




Choose an option:
1. Summarize Text
2. Exit
Enter your choice: 1

Enter your text (press Enter twice to finish):
erity opens in contemporary New York City, as protagonist Lowen Ashleigh witnesses a bloody accident where a passerby is hit by a truck. Lowen, a struggling writer, is grieving her mother's recent death and wondering where her foundering career is headed. She writes thrillers, but she’s suffering from intense writer’s block, which her state of uncertainty only seems to worsen. She’s so numbed to reality that she barely registers the accident, even as she’s soaked with the victim’s blood.   To her surprise, a stranger helps clean her up, and they share a moment of intense romantic chemistry. She soon discovers his name is Jeremy Crawford, and that he’s married to bestselling author Verity Crawford. This author—whom Lowen respects enormously for her meticulous research and well–crafted plots—has been left incapacitated by a car accident. Verity is an internationally renowned 