<a href="https://colab.research.google.com/github/ShyamRajana05/Infosys_Text_Summarizer/blob/main/Abs_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Set up environment and install requirements
!pip install transformers PyPDF2 spacy torch streamlit pyngrok

# Import necessary libraries
# Standard library
import multiprocessing
import os
import time
from getpass import getpass

# Third-party
import PyPDF2
import spacy
import streamlit as st
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from pyngrok import ngrok
from transformers import pipeline


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.3-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.meta

In [None]:
# English SpaCy pipeline; hybrid_summarize() uses it to split text into sentences
nlp = spacy.load(name="en_core_web_sm")

# Step 3: Initialize the summarization pipeline using a pre-trained model
def init_pipeline():
    """Create the Hugging Face summarization pipeline.

    Returns:
        The pipeline object for the "sshleifer/distilbart-cnn-12-6" checkpoint,
        or, if construction raises ``RuntimeError``, the error message as a
        ``str`` so the caller can report it.
    """
    model_name = "sshleifer/distilbart-cnn-12-6"
    try:
        # The checkpoint is named explicitly rather than relying on the task default
        return pipeline("summarization", model=model_name)
    except RuntimeError as err:
        return str(err)

summarizer = init_pipeline()

# init_pipeline() hands back the error text (a str) when construction failed
pipeline_failed = isinstance(summarizer, str)
if not pipeline_failed:
    print('Pipeline Initialized Successfully')
else:
    print('Error Initializing Pipeline:', summarizer)

# Step 4: Function to summarize text using a hybrid approach with SpaCy
def hybrid_summarize(text):
    """Summarize ``text`` with an extractive pass followed by an abstractive pass.

    SpaCy splits the text into sentences, the five longest sentences (by token
    count) are joined into an intermediate extract, and that extract is
    rewritten by the module-level ``summarizer`` pipeline using beam search.

    Args:
        text: Raw text to summarize.

    Returns:
        The generated summary string, or "No content to summarize." when the
        text yields no sentences or nothing but whitespace.
    """
    doc = nlp(text)
    sentences = list(doc.sents)

    if not sentences:
        return "No content to summarize."

    # Extractive summarization: select the top 5 longest sentences
    extracted_sentences = sorted(sentences, key=len, reverse=True)[:5]
    sentence_texts = (s.text.strip() for s in extracted_sentences)
    extracted_summary = ' '.join(part for part in sentence_texts if part)

    # Whitespace-only input can still produce sentence spans; never hand the
    # model an empty string.
    if not extracted_summary:
        return "No content to summarize."

    # Adjust max and min lengths dynamically based on input size
    input_length = len(extracted_summary.split())
    max_length = min(512, input_length + 10)  # Allow larger summaries for longer texts
    # Aim for at least 20% of the input length, but stay strictly below
    # max_length: once max_length hits its 512 cap the 20% figure would
    # otherwise overtake it and make the length constraints unsatisfiable.
    min_length = min(int(0.2 * input_length), max_length - 1)

    # Abstractive summarization on extracted content using beam search for better quality.
    # truncation=True clips an over-long extract to the model's maximum input
    # size instead of failing (very long "sentences" are common in PDF text).
    summary = summarizer(extracted_summary,
                         max_length=max_length,
                         min_length=min_length,
                         do_sample=False,
                         num_beams=4,
                         truncation=True)

    return summary[0]['summary_text']

# Step 5: Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        file: The PDF source handed straight to ``PyPDF2.PdfReader``.

    Returns:
        One string with the extracted text of all pages in page order.
    """
    pdf_pages = PyPDF2.PdfReader(file).pages
    return "".join(page.extract_text() + '\n' for page in pdf_pages)

# Step 6: Write the Streamlit app code dynamically
# The string below is the complete source of app.py. Streamlit re-executes that
# script on every widget interaction, so the generated app loads its models
# through an st.cache_resource function instead of at plain module level.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write('''
import spacy
from transformers import pipeline
import PyPDF2
import streamlit as st


@st.cache_resource
def load_models():
    """Load the SpaCy model and the summarization pipeline once per server process.

    Streamlit re-runs this script on every user interaction; caching keeps the
    models from being reloaded each time.
    """
    spacy_model = spacy.load("en_core_web_sm")
    summarization_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return spacy_model, summarization_pipeline


nlp, summarizer = load_models()


# Function to summarize text using a hybrid approach with SpaCy
def hybrid_summarize(text):
    """Pick the five longest sentences, then rewrite them with the abstractive model."""
    doc = nlp(text)
    sentences = list(doc.sents)

    if not sentences:
        return "No content to summarize."

    # Extractive summarization: Select the top 5 longest sentences
    extracted_sentences = sorted(sentences, key=len, reverse=True)[:5]
    sentence_texts = (s.text.strip() for s in extracted_sentences)
    extracted_summary = ' '.join(part for part in sentence_texts if part)

    # Whitespace-only input can still produce sentence spans
    if not extracted_summary:
        return "No content to summarize."

    # Abstractive summarization on extracted content
    input_length = len(extracted_summary.split())
    max_length = min(512, input_length + 10)
    # About 20% of the input length, kept strictly below max_length
    min_length = min(int(0.2 * input_length), max_length - 1)
    summary = summarizer(extracted_summary,
                         max_length=max_length,
                         min_length=min_length,
                         do_sample=False,
                         num_beams=4,  # Improved summarization quality
                         truncation=True)  # Clip over-long input instead of failing

    return summary[0]['summary_text']


# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of the PDF, one page per line break."""
    reader = PyPDF2.PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + '\\n'
    return text


# Streamlit app setup
st.title("Abstract Text Summarizer")

# Choose input method (paste text or upload PDF)
option = st.radio("Choose an input method", ("Paste Text", "Upload PDF"))

# If user chooses to paste text
if option == "Paste Text":
    input_text = st.text_area("Enter the text you want to summarize:")
    if st.button("Summarize"):
        if input_text:
            with st.spinner('Generating summary...'):
                summary = hybrid_summarize(input_text)
            st.subheader("Summary")
            st.write(summary)
        else:
            st.warning("Please enter some text to summarize.")

# If user chooses to upload a PDF
elif option == "Upload PDF":
    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
    if uploaded_file is not None:
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.subheader("PDF Content")
        st.write(pdf_text)
        if st.button("Summarize PDF"):
            with st.spinner('Generating summary...'):
                summary = hybrid_summarize(pdf_text)
            st.subheader("Summary")
            st.write(summary)
''')

# Step 7: Set up ngrok authentication
# Never store the authtoken in the notebook. It is read from the NGROK_AUTHTOKEN
# environment variable, with a hidden interactive prompt as the fallback.
# NOTE(review): a token was previously committed in this cell; it should be
# revoked in the ngrok dashboard and replaced with a new one.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Step 8: Run the Streamlit app in a subprocess
def run_streamlit():
    """Run the generated app.py with Streamlit through the system shell.

    ``os.system`` only returns once the launched command has finished.
    """
    launch_command = 'streamlit run app.py'
    os.system(launch_command)

p = multiprocessing.Process(target=run_streamlit)
p.start()

try:
    # Step 9: Set up ngrok tunnel for external access
    time.sleep(5)  # Give Streamlit time to start
    try:
        # HTTP tunnel to port 8501, where the Streamlit app is expected to listen
        public_url = ngrok.connect(addr="8501", proto="http")
        print(f"Open this URL to view the app: {public_url}")
    except Exception as e:
        print(f"Error connecting to ngrok: {e}")

    # Step 10: Keep this cell alive for as long as the Streamlit process runs
    p.join()
except KeyboardInterrupt:
    print("Interrupted - shutting down the Streamlit app and the ngrok tunnel.")
finally:
    # Always release the tunnel so re-running the cell does not pile up ngrok
    # processes, then stop the launcher process if it is still alive.
    ngrok.kill()
    if p.is_alive():
        # Best effort: this ends the launcher process started above.
        # NOTE(review): confirm the Streamlit server it spawned exits as well.
        p.terminate()
        p.join(timeout=5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu


Pipeline Initialized Successfully
Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Open this URL to view the app: NgrokTunnel: "https://e6ae-35-197-14-235.ngrok-free.app" -> "http://localhost:8501"
