
# **Bart**

In [1]:
from transformers import pipeline, AutoTokenizer
import re

In [2]:
# Load summarizer model and tokenizer
model_name = "facebook/bart-large-cnn" # Use BART pre-trained for summarization
summarizer = pipeline("summarization", model=model_name, device=-1) # CPU-based summarization pipeline
tokenizer = AutoTokenizer.from_pretrained(model_name) # Tokenizer to manage input token limits

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [3]:
def preprocess_text(text):
    """
    Clean the input text by removing special characters and trimming extra spaces.
    """
    text = re.sub(r'[^\w\s.,!?]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

In [4]:
def chunk_text(text, max_tokens=1024):
    """
    Split the input text into chunks respecting the model's token limit.
    """
    sentences = text.split('. ') # Split text into sentences
    chunks = []
    current_chunk = []

    current_length = 0
    for sentence in sentences:
        tokenized_length = len(tokenizer.tokenize(sentence)) # Calculate token count for the sentence
        if current_length + tokenized_length <= max_tokens:  # Check if adding the sentence exceeds token limit
            current_chunk.append(sentence)
            current_length += tokenized_length
        else: # Create a new chunk if limit is reached
            chunks.append(". ".join(current_chunk))
            current_chunk = [sentence]
            current_length = tokenized_length

    if current_chunk:  # Add remaining sentences to the final chunk
        chunks.append(". ".join(current_chunk))

    return chunks

In [5]:
def summarize_text(text, max_length=150, min_length=30, do_sample=False):
    """
    Summarize the input text using pre-trained BART model.
    """
    text = preprocess_text(text)

    if not text.strip():
        raise ValueError("Input text is empty or invalid.")

    # Split text into tokenization-aware chunks
    text_chunks = chunk_text(text, max_tokens=1024)
    summaries = []

    for i, chunk in enumerate(text_chunks):
        print(f"Processing chunk {i + 1}/{len(text_chunks)}...")
        try:
            summary = summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            print(f"Error processing chunk {i + 1}: {e}")

    # Combine summaries into a single text
    return " ".join(summaries)

In [6]:
# Example usage
if __name__ == "__main__":
    # Load the transcribed text from a file
    with open("transcribed_text_nptel_video.txt", "r") as file:
        transcribed_text = file.read()

    try:
        # Summarize the text
        summarized_text = summarize_text(transcribed_text)

        # Save the summarized text to a file
        with open("summarized_text_bart.txt", "w") as file:
            file.write(summarized_text)

        print("Summarization completed.")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

Processing chunk 1/7...
Error processing chunk 1: index out of range in self
Processing chunk 2/7...
Error processing chunk 2: index out of range in self
Processing chunk 3/7...
Error processing chunk 3: index out of range in self
Processing chunk 4/7...
Error processing chunk 4: index out of range in self
Processing chunk 5/7...
Error processing chunk 5: index out of range in self
Processing chunk 6/7...
Error processing chunk 6: index out of range in self
Processing chunk 7/7...
Summarization completed.


## **LexRankSummarizer**

In [11]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [12]:
def extractive_summarization(file_path, method="lexrank", sentence_count=5):
    """
    Perform extractive summarization using Sumy.
    """
    try:
        # Read the input text
        with open(file_path, "r") as file:
            text = file.read()

        # Initialize Sumy parser and tokenizer
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        stemmer = Stemmer("english")

        # Select summarizer
        if method.lower() == "lexrank":
            summarizer = LexRankSummarizer(stemmer) # LexRank algorithm
        elif method.lower() == "lsa":
            summarizer = LsaSummarizer(stemmer) # Latent Semantic Analysis (LSA) algorithm
        else:
            raise ValueError(f"Invalid method: {method}")

        # Set stop words to improve summarization quality
        summarizer.stop_words = get_stop_words("english")

        # Generate the summary with the specified sentence count
        summary = summarizer(parser.document, sentence_count)

        # Combine sentences into a single string
        return " ".join(str(sentence) for sentence in summary)

    except Exception as e:
        return f"Error during summarization: {e}"

In [15]:
if __name__ == "__main__":
    input_file = "transcribed_text_nptel_video.txt"  # Input file
    output_file = "extractive_summary_lexrank.txt"  # Output file
    summary = extractive_summarization(input_file, method="lexrank", sentence_count=5)  # Adjust method and count
    print("Extractive Summary:\n")
    print(summary)

    # Save the summary to a file
    with open(output_file, "w") as file:
        file.write(summary)
    print(f"\nSummary saved to {output_file}")


Extractive Summary:

Suppose, I am talking about the language English, can you always say that wherever I have a dot it is the end of the sentence. So, let us see, I have here, I am here and my word is 4.3. So, my two classes here are end of the sentence and the end of the sentence and what are the observations we were having in general it might be an abbreviation the case of the word and that it is before the dot may be uppercase or lower case and one of these might indicate one class other might indicate other class. Not the sentence is what the words. So, now when I talk about this problem of tokenization in Sanskrit or in English this problem is also called word segmentation.

Summary saved to extractive_summary_lexrank.txt


In [None]:
!pip install sumy

In [None]:
import nltk
nltk.download('punkt_tab')