<a href="https://colab.research.google.com/github/Sidhtang/data-analysis-project-s/blob/main/research_paper_summarise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install PyPDF2
!pip install gradio

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting gradio
  Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)

In [None]:
!pip install transformers[torch]



In [2]:
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import PyPDF2
import torch
import gc
import re

class ComprehensiveSummarizer:
    """Chunk-wise abstractive summarizer built on ``facebook/bart-large-cnn``.

    Long documents are cleaned, split into sentence-aligned chunks, summarized
    chunk by chunk with a Hugging Face summarization pipeline, and the partial
    summaries are joined into one string.
    """

    def __init__(self):
        """Ensure NLTK tokenizer data is present and load the model, tokenizer and pipeline."""
        # Older NLTK releases read the 'punkt' resource for sentence splitting,
        # newer ones read 'punkt_tab'; make sure whichever is needed is present.
        for resource in ('punkt', 'punkt_tab'):
            try:
                nltk.data.find(f'tokenizers/{resource}')
            except LookupError:
                nltk.download(resource)

        # Using BART-large-CNN for better quality summaries
        model_name = "facebook/bart-large-cnn"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Run on the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()

        # Build the pipeline from the already-loaded model and tokenizer so the
        # weights are not loaded a second time.
        self.summarizer = pipeline(
            "summarization",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device
        )

    def _release_memory(self):
        """Empty the CUDA cache (when a GPU is present) and run the garbage collector."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    def clean_text(self, text):
        """Clean and preprocess raw document text.

        Steps, in order:
          1. drop bracketed numeric citations such as ``[12]``, ``[1, 2]`` or ``[3-5]``;
          2. drop lines that consist only of a 1-4 digit number (typical page numbers);
          3. drop characters outside letters, digits, whitespace and basic
             punctuation (``. , ; : ? ! % ( ) ' -``);
          4. collapse every whitespace run to a single space and strip the ends.

        Numbers inside sentences (years, percentages, measurements) are kept.

        Args:
            text: Raw text; line breaks are used to spot page-number lines.

        Returns:
            The cleaned, single-line text.
        """
        # Citations are removed first, while their brackets are still intact.
        text = re.sub(r'\[\s*\d+(?:\s*[,;\-]\s*\d+)*\s*\]', '', text)
        # Page numbers are only recognizable while line breaks still exist, so
        # this runs before whitespace is collapsed.
        text = re.sub(r'(?m)^[ \t]*\d{1,4}[ \t\r]*$', '', text)
        # Keep sentence terminators, hyphens and apostrophes so words are not
        # fused together and sentence splitting still works afterwards.
        text = re.sub(r"[^a-zA-Z0-9\s.,;:?!%()'\-]", '', text)
        # Collapse last so gaps left by the removals above disappear as well.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_text_from_pdf(self, file_obj):
        """Extract and clean the text of every page of a PDF.

        Args:
            file_obj: Whatever ``PyPDF2.PdfReader`` accepts (a path or a binary
                file-like object).

        Returns:
            The cleaned text of all pages.

        Raises:
            Exception: If the PDF cannot be read; the original error is chained
                as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(file_obj)
            # A page without extractable text may yield an empty/None result;
            # treat it as an empty string instead of failing the whole file.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
            return self.clean_text("\n".join(page_texts))

        except Exception as e:
            raise Exception(f"Error reading PDF file: {str(e)}") from e

    def chunk_text(self, text, max_chunk_size=500):  # Reduced chunk size
        """Split text into chunks of whole sentences.

        Sentences are accumulated until adding the next one would push the
        chunk past ``max_chunk_size`` word tokens; that sentence then starts a
        new chunk. A single sentence longer than the limit becomes a chunk of
        its own.

        Args:
            text: Text to split.
            max_chunk_size: Upper bound on word tokens per chunk.

        Returns:
            List of chunk strings in document order.
        """
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(word_tokenize(sentence))

            if current_length + sentence_length > max_chunk_size:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_length = sentence_length
            else:
                current_chunk.append(sentence)
                current_length += sentence_length

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def generate_comprehensive_summary(self, text, min_length=200, max_length=500):
        """Summarize ``text`` chunk by chunk and join the partial summaries.

        Args:
            text: Document text to summarize.
            min_length: Requested minimum length per chunk summary, passed to
                the pipeline; capped per chunk.
            max_length: Requested maximum length per chunk summary, passed to
                the pipeline; capped per chunk.

        Returns:
            The joined summary, or a human-readable message when the input is
            empty, nothing could be summarized, or an unexpected error occurred
            (this method reports problems as strings instead of raising).
        """
        try:
            if not text or not text.strip():
                return "Error: No text provided for summarization."

            # UI sliders may deliver floats; generation lengths must be integers.
            min_length = int(min_length)
            max_length = int(max_length)

            self._release_memory()

            summaries = []
            for chunk in self.chunk_text(text):
                word_count = len(chunk.split())
                if word_count < 50:  # Skip very small chunks
                    continue

                # Never ask for a summary longer than the chunk itself, and
                # never let the minimum exceed the maximum.
                chunk_max = min(max_length, word_count - 1)
                chunk_min = min(min_length, chunk_max)

                try:
                    result = self.summarizer(
                        chunk,
                        max_length=chunk_max,
                        min_length=chunk_min,
                        do_sample=False,
                        # Cut inputs that exceed the model's input limit (e.g.
                        # one very long "sentence" from PDF extraction) instead
                        # of letting the model fail on them.
                        truncation=True
                    )
                    if result:
                        summaries.append(result[0]['summary_text'])
                except Exception as chunk_error:
                    # Best effort: one bad chunk should not discard the rest.
                    print(f"Error processing chunk: {str(chunk_error)}")
                    continue

            final_summary = " ".join(summaries)

            self._release_memory()

            return final_summary if final_summary else "Could not generate summary. Please check the input text."

        except Exception as e:
            return f"Error in generate_summary: {str(e)}"

def create_interface():
    """Build the Gradio UI for the research-paper summarizer.

    Returns:
        A ``gr.Interface`` wired to the summarizer. If the summarizer cannot be
        initialized, a minimal interface that only reports the initialization
        error is returned instead.
    """
    try:
        summarizer = ComprehensiveSummarizer()
    except Exception as e:
        print(f"Error initializing summarizer: {str(e)}")
        # Build the message now: Python unbinds `e` when the except block ends,
        # so a callback that referenced `e` lazily would raise NameError.
        init_error = f"Error initializing model: {str(e)}"
        return gr.Interface(
            fn=lambda x: init_error,
            inputs="text",
            outputs="text",
            title="Error"
        )

    def read_text_upload(file_obj, file_path):
        """Return the content of an uploaded text file as ``str``."""
        # Depending on the Gradio version/settings the upload arrives either as
        # a file-like object or as a path string; support both.
        if hasattr(file_obj, "read"):
            raw = file_obj.read()
        else:
            with open(file_path, "rb") as handle:
                raw = handle.read()
        # Replace undecodable bytes rather than failing the whole request.
        return raw.decode('utf-8', errors='replace') if isinstance(raw, bytes) else raw

    def process_input(file_obj, text_input, min_length, max_length):
        """Summarize an uploaded file (preferred) or the pasted text.

        Returns the summary, or a user-facing message when the input is
        missing, too short, or processing fails.
        """
        try:
            if file_obj is not None:
                # Uploads expose their location via `.name`; a plain path
                # string is used as-is.
                file_path = str(getattr(file_obj, "name", file_obj))

                if file_path.lower().endswith('.pdf'):
                    text = summarizer.extract_text_from_pdf(file_path)
                else:
                    text = summarizer.clean_text(read_text_upload(file_obj, file_path))
            elif text_input and text_input.strip():
                text = text_input
            else:
                return "Please either upload a file or enter text to summarize."

            if len(text.strip()) < 100:
                return "Input text is too short. Please provide more content to summarize."

            return summarizer.generate_comprehensive_summary(text, min_length, max_length)

        except Exception as e:
            return f"Error processing input: {str(e)}"

    # Create Gradio interface
    iface = gr.Interface(
        fn=process_input,
        inputs=[
            gr.File(label="Upload Research Paper (PDF or TXT)"),
            gr.Textbox(
                label="Or Paste Research Paper Text",
                lines=10,
                placeholder="Paste your research paper text here..."
            ),
            gr.Slider(
                minimum=100,
                maximum=300,
                step=50,
                value=150,
                label="Minimum summary length (words)"
            ),
            gr.Slider(
                minimum=300,
                maximum=600,
                step=50,
                value=400,
                label="Maximum summary length (words)"
            )
        ],
        outputs=gr.Textbox(label="Summary", lines=15),
        title="Research Paper Summarizer",
        description="Upload a research paper (PDF/TXT) or paste its text to generate a comprehensive summary.",
    )

    return iface

if __name__ == "__main__":
    # Start from an empty CUDA cache when a GPU is present, before the model is loaded.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Build the UI (this loads the model) and serve it; share=True additionally
    # requests a public share link.
    iface = create_interface()
    iface.launch(share=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://361812e393c640ee8f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
