<a href="https://colab.research.google.com/github/SulemanShahani/Text-summarization-using-BERT-Latent-Semantic-Analysis-LSA-LexRank-and-T5-Transformers/blob/main/text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
from transformers import TFBartForConditionalGeneration, BartTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from string import punctuation
import warnings
import fitz  # PyMuPDF
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer as SumyTokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
import nltk

# Install PyMuPDF library
!pip install PyMuPDF

# Install sumy library
!pip install sumy

# Download nltk punkt tokenizer
nltk.download('punkt')

# Filter out specific UserWarning messages
warnings.filterwarnings("ignore", message="All PyTorch model weights were used when initializing TFBartForConditionalGeneration.*")

# Initialize BART tokenizer with Byte-Pair Encoding (BPE)
bart_tokenizer_bpe = BartTokenizer.from_pretrained('facebook/bart-large-cnn', tokenizer_type='BPE')

# Load pre-trained BART model
bart_model = TFBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Load pre-trained T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define function to read text from a PDF file
def read_pdf(file_path: str) -> str:
    """
    Read text from a PDF file.

    Args:
        file_path (str): The path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order. An empty
            string is returned for a document with no pages.
    """
    with fitz.open(file_path) as pdf_file:
        # Iterate the pages directly and join once, instead of indexing by
        # page number and growing the result with repeated `+=`.
        return "".join(page.get_text() for page in pdf_file)

# Define text preprocessing function
def preprocess_text(text):
    """
    Normalize text by lowercasing it and stripping ASCII punctuation.

    Args:
        text (str): The raw input text.

    Returns:
        str: The lowercased text with every character found in
            ``string.punctuation`` removed. Whitespace is left untouched.
    """
    # Deletion table: each punctuation character maps to "remove".
    strip_table = str.maketrans('', '', punctuation)
    lowered = text.lower()
    return lowered.translate(strip_table)

# Read text from a PDF file (You need to provide the file path)
file_path = '/content/eng.pdf'
text = read_pdf(file_path)

# Text preprocessing for BART.
# No task prefix is prepended: "summarize" is a T5-style instruction, and
# facebook/bart-large-cnn is a summarization checkpoint that takes the
# article text directly, so a prefix would only be read as part of the article.
processed_text_bart = preprocess_text(text)

# Tokenization using Byte-Pair Encoding (BPE) for BART.
# Input is truncated to 1024 tokens, so only the start of a long document
# is summarized.
input_ids_bart = bart_tokenizer_bpe.encode(
    processed_text_bart,
    return_tensors='tf',
    max_length=1024,
    truncation=True,
    add_special_tokens=True,
)

# Generate the summary with BART (at most 500 generated tokens)
summary_ids_bart = bart_model.generate(input_ids_bart, max_length=500)

# Decode the summary for BART
bart_summary = bart_tokenizer_bpe.decode(summary_ids_bart[0], skip_special_tokens=True)

# Print BART Summary
print("BART Summary:")
print(bart_summary)

# Text preprocessing for T5.
# T5 selects its task from a text prefix, and the summarization prefix is
# "summarize: " (with the colon). It is added AFTER preprocess_text so the
# punctuation stripping does not remove the colon.
processed_text_t5 = "summarize: " + preprocess_text(text)

# Encode the input text for T5 (truncated to at most 2000 tokens)
input_ids_t5 = t5_tokenizer.encode(
    processed_text_t5,
    return_tensors='pt',
    max_length=2000,
    truncation=True,
    add_special_tokens=True,
)

# Generate the summary with T5 (at most 500 generated tokens)
summary_ids_t5 = t5_model.generate(input_ids_t5, max_length=500)

# Decode the summary for T5
t5_summary = t5_tokenizer.decode(summary_ids_t5[0], skip_special_tokens=True)

# Post-process T5 summary to proper case: the input was lowercased, so
# re-capitalize the first letter of each '. '-separated sentence.
sentences = t5_summary.split('. ')
proper_case_summary = '. '.join(sentence.capitalize() for sentence in sentences)

# Print T5 Summary
print("T5 Summary (Proper Case):")
print(proper_case_summary)

# Extractive summarization with sumy, run on the raw (unpreprocessed) PDF
# text. The parsed document is built once and shared by both summarizers.
text_to_summarize = text
parser = PlaintextParser.from_string(
    text_to_summarize,
    SumyTokenizer('english'),
)

# LSA: select 25 sentences from the document
lsa_summarizer = LsaSummarizer()
lsa_summary = lsa_summarizer(parser.document, sentences_count=25)

# Print LSA Summary, one selected sentence per line
print("LSA Summary:")
for sentence_lsa in lsa_summary:
    print(sentence_lsa)

# LexRank: select 25 sentences from the same parsed document
lexrank_summarizer = LexRankSummarizer()
lexrank_summary = lexrank_summarizer(parser.document, 25)

# Print LexRank Summary, one selected sentence per line
print("LexRank Summary:")
for sentence_lexrank in lexrank_summary:
    print(sentence_lexrank)
