<a href="https://colab.research.google.com/github/Sonalisingh9/gyrus-project/blob/main/Gyrus_model_summerizer_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Recommended installation commands
# Each line below is a notebook shell escape ("!"), so this cell only runs
# inside Jupyter/Colab and is not valid in a plain Python script.
!pip install -U pip
!pip install torch
!pip install transformers
!pip install bert-extractive-summarizer
!pip install rouge-score
!pip install gradio
!pip install nltk



In [7]:
# Single-command install of the same six packages as the per-package cell above
# (without the pip self-upgrade).
!pip install transformers torch gradio rouge-score bert-extractive-summarizer nltk



In [12]:
import re
import torch
import gradio as gr
import unicodedata
import logging
import sys
import traceback

# Ensure these are installed
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from rouge_score import rouge_scorer
    from summarizer import Summarizer
    from transformers import AutoTokenizer, AutoModelWithLMHead
except ImportError as e:
    print(f"Import Error: {e}")
    print("Please install missing libraries using:")
    print("pip install nltk rouge-score bert-extractive-summarizer transformers")
    sys.exit(1)

# Download NLTK tokenizer data.
# Older NLTK releases read the sentence tokenizer from 'punkt'; recent releases
# load it from 'punkt_tab' instead. Fetch both so sent_tokenize/word_tokenize
# work on either, instead of silently dropping to the fallback paths below.
# A failed download is reported but is not fatal.
for _nltk_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.download(_nltk_resource, quiet=True)
    except Exception as e:
        print(f"NLTK Download Error: {e}")

# Configure logging
# basicConfig configures the root logger, so level=DEBUG applies to every
# library that logs through the standard logging module, not only this file.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the classes and functions below.
logger = logging.getLogger(__name__)

class TextPreprocessor:
    """Clean raw transcript text before it is tokenized and summarized."""

    # Literal model tokens that can leak into pasted text.
    _SPECIAL_TOKENS = ('<pad>', '</s>')

    # Only spans shaped like a tag ("<b>", "</p>", "<a href=...>") are removed.
    # Requiring a letter right after "<" (or "</") keeps prose such as
    # "a < b and c > d" intact; a bare "<[^>]+>" would delete " b and c ".
    _TAG_PATTERN = re.compile(r'</?[A-Za-z][^<>]*>')

    _WHITESPACE_PATTERN = re.compile(r'\s+')

    @classmethod
    def preprocess_text(cls, text):
        """Strip markup, fold to ASCII and collapse whitespace.

        Args:
            text: Raw input text.

        Returns:
            The cleaned string. If ``text`` is empty or not a ``str`` the
            problem is logged and ``text`` is returned unchanged.
        """
        # Guard clause instead of raising and immediately catching our own
        # exception; the logged message and the returned value are the same.
        if not text or not isinstance(text, str):
            logger.error("Preprocessing error: Invalid input text")
            return text

        # None of the steps below can raise for a str input.

        # Remove special tokens and markup
        for token in cls._SPECIAL_TOKENS:
            text = text.replace(token, '')
        text = cls._TAG_PATTERN.sub('', text)

        # Decompose accented characters (NFKD) and drop whatever is still
        # non-ASCII afterwards, e.g. "café" -> "cafe".
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

        # Collapse every whitespace run to a single space and trim the ends.
        return cls._WHITESPACE_PATTERN.sub(' ', text).strip()

class TextSummarizer:
    """Summarize a transcript two ways and score each summary with ROUGE.

    ``advanced_summarization`` returns the leading sentences of the cleaned
    text; ``t5_paragraph_summarization`` generates a summary with T5. Both
    return ``(summary, original_word_count, summary_word_count, rouge_scores)``.
    """

    def __init__(self):
        """Load the models, tokenizer and ROUGE scorer.

        Raises:
            Exception: Any failure during loading is logged and re-raised.
        """
        try:
            # Disable warnings (process-wide) for cleaner output
            import warnings
            warnings.filterwarnings('ignore')

            # AutoModelWithLMHead is deprecated in transformers; T5 is an
            # encoder-decoder model, so the seq2seq auto class is the
            # supported way to load it.
            from transformers import AutoModelForSeq2SeqLM

            # BERT Extractive Summarizer
            # NOTE(review): loaded here but not used by any method of this class.
            self.bert_model = Summarizer()

            # T5 Transformer Model
            self.t5_tokenizer = AutoTokenizer.from_pretrained('t5-base')
            self.t5_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base', return_dict=True)

            # ROUGE Scorer
            self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        except Exception as e:
            logger.error(f"Model Initialization Error: {e}")
            logger.error(traceback.format_exc())
            raise

    @staticmethod
    def _is_blank(text):
        """Return True when *text* is empty/None or a whitespace-only string."""
        return not text or (isinstance(text, str) and not text.strip())

    @staticmethod
    def _split_sentences(text):
        """Split *text* into sentences with NLTK, or with a regex if NLTK fails."""
        try:
            return sent_tokenize(text)
        except Exception as e:
            logger.error(f"Sentence Tokenization Error: {e}")
            # Split *after* sentence-ending punctuation so the terminators are
            # kept and no empty fragments are produced (a plain split('.')
            # drops the periods and leaves '' entries).
            return [part for part in re.split(r'(?<=[.!?])\s+', text) if part.strip()]

    @staticmethod
    def _word_counts(original_text, summary_text):
        """Return ``(original, summary)`` word counts.

        Uses NLTK's word tokenizer; if it fails for either text, both counts
        fall back to whitespace splitting so they stay comparable.
        """
        try:
            return len(word_tokenize(original_text)), len(word_tokenize(summary_text))
        except Exception:
            return len(original_text.split()), len(summary_text.split())

    def _build_result(self, preprocessed_text, final_summary):
        """Assemble the 4-tuple both public summarization methods return."""
        original_word_count, summary_word_count = self._word_counts(preprocessed_text, final_summary)
        rouge_scores = self.calculate_rouge_scores(preprocessed_text, final_summary)
        return final_summary, original_word_count, summary_word_count, rouge_scores

    def calculate_rouge_scores(self, original_text, summary):
        """
        Calculate ROUGE scores for the generated summary

        Args:
            original_text (str): Original input text (used as the reference)
            summary (str): Generated summary

        Returns:
            dict: Precision, recall and F1 for ROUGE-1, ROUGE-2 and ROUGE-L,
            keyed like ``'ROUGE-1 Precision'``. Empty if scoring fails.
        """
        try:
            rouge_scores = self.rouge_scorer.score(original_text, summary)

            # Flatten the per-metric score objects into one readable dict.
            formatted_scores = {}
            for metric, label in (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L')):
                metric_score = rouge_scores[metric]
                formatted_scores[f'{label} Precision'] = metric_score.precision
                formatted_scores[f'{label} Recall'] = metric_score.recall
                formatted_scores[f'{label} F1 Score'] = metric_score.fmeasure

            return formatted_scores
        except Exception as e:
            logger.error(f"ROUGE Score Calculation Error: {e}")
            logger.error(traceback.format_exc())
            return {}

    def advanced_summarization(self, lecture_transcript):
        """Summarize by keeping the leading sentences of the cleaned transcript.

        The summary is the first ``max(3, min(10, n // 4))`` sentences, where
        ``n`` is the number of sentences found.

        Args:
            lecture_transcript: Raw transcript text.

        Returns:
            tuple: ``(summary, original_word_count, summary_word_count,
            rouge_scores)``. ``("No text provided", 0, 0, {})`` when there is
            nothing to summarize; ``("Error: ...", 0, 0, {})`` on failure.
        """
        try:
            # Validate input (whitespace-only counts as empty)
            if self._is_blank(lecture_transcript):
                return "No text provided", 0, 0, {}

            # Preprocess the transcript
            preprocessed_text = TextPreprocessor.preprocess_text(lecture_transcript)
            # Input that was only markup/special tokens cleans down to nothing.
            if self._is_blank(preprocessed_text):
                return "No text provided", 0, 0, {}

            sentences = self._split_sentences(preprocessed_text)

            # Roughly a quarter of the text, but at least 3 and at most 10
            # sentences; slicing copes with texts shorter than that.
            summary_sentence_count = max(3, min(10, len(sentences) // 4))
            final_summary = ' '.join(sentences[:summary_sentence_count])

            return self._build_result(preprocessed_text, final_summary)
        except Exception as e:
            logger.error(f"Advanced Summarization Error: {e}")
            logger.error(traceback.format_exc())
            return f"Error: {str(e)}", 0, 0, {}

    def t5_paragraph_summarization(self, lecture_transcript):
        """Generate a summary of the transcript with the T5 model.

        Only the first 20 sentences, truncated to 512 tokens, are given to the
        model. ROUGE is still computed against the full cleaned transcript.

        Args:
            lecture_transcript: Raw transcript text.

        Returns:
            tuple: ``(summary, original_word_count, summary_word_count,
            rouge_scores)``. ``("No text provided", 0, 0, {})`` when there is
            nothing to summarize; ``("Error: ...", 0, 0, {})`` on failure.
        """
        try:
            # Validate input (whitespace-only counts as empty)
            if self._is_blank(lecture_transcript):
                return "No text provided", 0, 0, {}

            # Preprocess transcript
            preprocessed_text = TextPreprocessor.preprocess_text(lecture_transcript)
            # Do not ask the model to summarize an empty prompt.
            if self._is_blank(preprocessed_text):
                return "No text provided", 0, 0, {}

            sentences = self._split_sentences(preprocessed_text)

            # Limit input length for T5
            max_input_length = 512
            input_text = ' '.join(sentences[:20])  # Limit to first 20 sentences

            # T5 selects its task from the text prefix.
            inputs = self.t5_tokenizer.encode(
                "summarize: " + input_text,
                return_tensors='pt',
                max_length=max_input_length,
                truncation=True
            )

            # Generate summary with controlled length
            summary_ids = self.t5_model.generate(
                inputs,
                max_length=150,  # Limit summary length
                min_length=50,
                num_beams=4,
                early_stopping=True
            )

            # Decode summary
            final_summary = self.t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

            return self._build_result(preprocessed_text, final_summary)
        except Exception as e:
            logger.error(f"T5 Summarization Error: {e}")
            logger.error(traceback.format_exc())
            return f"Error: {str(e)}", 0, 0, {}

def create_gradio_interface():
    """Build the Gradio app that runs both summarizers on a pasted transcript.

    One TextSummarizer is created here and reused by every request.

    Returns:
        gr.Interface: Interface with one text input and eight outputs: summary,
        original word count, summary word count and ROUGE scores, first for the
        advanced summary and then for the T5 summary.
    """
    # Initialize summarizer
    summarizer = TextSummarizer()

    def summarize_text(transcript):
        """Run both summarizers and return their results as one flat list of 8 values."""
        try:
            # Perform both summarization methods
            adv_summary_result = summarizer.advanced_summarization(transcript)
            t5_summary_result = summarizer.t5_paragraph_summarization(transcript)

            # Combine results in the order of the output components below
            return [*adv_summary_result, *t5_summary_result]
        except Exception as e:
            logger.error(f"Interface Error: {e}")
            logger.error(traceback.format_exc())
            # The placeholders must match the output component types
            # (Textbox, Number, Number, JSON) x 2: a plain string cannot be
            # rendered by the Number components, so returning the message for
            # every slot would fail again while reporting the error.
            return [
                "Error in summarization", 0, 0, {},
                "Error in summarization", 0, 0, {},
            ]

    # Create Gradio interface with expanded outputs
    iface = gr.Interface(
        fn=summarize_text,
        title="Text Summarizer with ROUGE Scores",
        description="Compare different summarization techniques and their ROUGE scores",
        inputs=gr.Textbox(
            lines=10,
            placeholder="Paste the article here...",
            label="Input Transcript"
        ),
        outputs=[
            gr.Textbox(label="Advanced Summary"),
            gr.Number(label="Original Text Word Count"),
            gr.Number(label="Summary Word Count"),
            # NOTE(review): this label repeats the page title; it holds the
            # advanced summary's ROUGE scores — confirm the intended wording.
            gr.JSON(label="Text Summarizer with ROUGE Scores"),
            gr.Textbox(label="T5 Abstractive Summary"),
            gr.Number(label="T5 Original Word Count"),
            gr.Number(label="T5 Summary Word Count"),
            gr.JSON(label="T5 Summary ROUGE Scores")
        ]
    )

    return iface

def main():
    """Build the summarizer interface and start serving it."""
    create_gradio_interface().launch()

# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d9e4081b490d7f75bc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
