In [None]:
# Mount Google Drive at /content/drive. The app.py written further down loads
# its fine-tuned model checkpoints from /content/drive/MyDrive, so this cell
# has to run before the Streamlit app is started.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install streamlit
!pip install transformers
!pip install datasets nltk
!pip install datasets --upgrade

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
# NOTE(review): no visible cell imports `evaluate` — confirm it is still needed.
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
# NOTE(review): no visible cell imports `rouge_score`; presumably it backs a
# ROUGE metric used elsewhere — confirm it is still needed.
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6b14f9663e208dee9c0e8130163cd5204429380436c7a7309bb065e83361f91f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
%%writefile app.py
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration
import re

# Load your fine-tuned T5 model and tokenizer
@st.cache_resource
def load_model():
    """Load the fine-tuned summarization checkpoint and the t5-small tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is read from the Drive
        checkpoint directory; the tokenizer is the stock "t5-small" one.
    """
    checkpoint_dir = "/content/drive/MyDrive/summarization_model"  # Replace with your model path
    return (
        T5ForConditionalGeneration.from_pretrained(checkpoint_dir),
        T5Tokenizer.from_pretrained("t5-small"),
    )

@st.cache_resource
def load_model2():
    """Load the second fine-tuned checkpoint and the t5-small tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is read from the "model2"
        Drive directory; the tokenizer is the stock "t5-small" one.
    """
    checkpoint_dir = "/content/drive/MyDrive/model2"  # Replace with your model path
    return (
        T5ForConditionalGeneration.from_pretrained(checkpoint_dir),
        T5Tokenizer.from_pretrained("t5-small"),
    )

def preprocess_article(raw_text):
    """Clean an article and wrap it in the ``<summary>`` tags fed to the model.

    URLs are removed, every run of whitespace is collapsed to one space, and
    the result is trimmed before being wrapped.

    Args:
        raw_text: Article text as typed or uploaded by the user.

    Returns:
        The cleaned text in the form ``"<summary> {text} </summary>"``.
    """
    # Match real URLs only: a scheme followed by "://", or a "www." host.
    # The looser "http\S+" / "www\S+" patterns also deleted ordinary words
    # such as "https".
    without_urls = re.sub(r"https?://\S+|\bwww\.\S+", "", raw_text)
    # Trim *after* URL removal: trimming first left a stray space next to the
    # tags whenever the article started or ended with a URL.
    text = re.sub(r"\s+", " ", without_urls).strip()
    # Use semantic tags for context (optional but can help models focus)
    return f"<summary> {text} </summary>"

def chunk_text_with_overlap(text, tokenizer, chunk_size=400, overlap=50):
    """Tokenize ``text`` and split the token ids into overlapping windows.

    The text is encoded once with the "summarize: " prefix prepended, so the
    prefix tokens appear only in the first window.

    Args:
        text: Text to tokenize.
        tokenizer: Object whose ``encode(text, return_tensors="pt")`` returns
            a batch of token ids; row 0 of that batch is used.
        chunk_size: Maximum number of token ids per window.
        overlap: Number of token ids shared by two consecutive windows.

    Returns:
        The windows (slices of the encoded row) in order. The list is empty
        when encoding produced no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            in ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A stride of zero makes range() fail with an unrelated message, a negative
    # stride silently yields no windows, and a negative overlap skips tokens.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokens = tokenizer.encode("summarize: " + text, return_tensors="pt")[0]
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokens[start:start + chunk_size])
        # This window already reaches the end of the tokens; another window
        # would contain nothing but overlap.
        if start + chunk_size >= len(tokens):
            break
    return chunks

def summarize_long_text(text, model, tokenizer, chunk_size=400):
    """Summarize text of any length by summarizing overlapping token windows.

    The text is split with ``chunk_text_with_overlap``, each window is
    summarized on its own with beam search, and the partial summaries are
    joined with spaces.

    Args:
        text: Preprocessed article text.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used both to encode the windows and to decode the
            generated ids.
        chunk_size: Maximum number of token ids per window.

    Returns:
        The merged summary with ``<summary>`` tags removed and whitespace
        normalized.
    """
    summaries = []
    for chunk in chunk_text_with_overlap(text, tokenizer, chunk_size=chunk_size):
        # top_p / temperature are intentionally not passed: they only apply to
        # sampling (do_sample=True) and were ignored by this beam search.
        summary_ids = model.generate(
            chunk.unsqueeze(0),  # Add batch dimension
            max_length=200,  # Upper bound on each window's summary length
            min_length=100,  # Lower bound on each window's summary length
            num_beams=8,  # Beam search width
            length_penalty=1.2,  # Values above 1.0 favour longer beams
            early_stopping=True,  # Stop once num_beams finished candidates exist
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Strip the tags before normalizing whitespace so that removing a tag
    # cannot leave doubled or leading/trailing spaces behind.
    merged = " ".join(summaries).replace("<summary>", "").replace("</summary>", "")
    return re.sub(r"\s+", " ", merged).strip()
def preprocess_article_with_context(raw_text):
    """Clean raw article text and wrap it in ``<summary>`` context tags.

    URLs are dropped, whitespace runs are collapsed to single spaces, and the
    text is trimmed before the tags are added.

    Args:
        raw_text: Article text as typed or uploaded by the user.

    Returns:
        The cleaned text in the form ``"<summary> {text} </summary>"``.
    """
    # Only genuine URLs are removed (scheme + "://", or a "www." host); the
    # looser "http\S+" / "www\S+" patterns also swallowed plain words.
    without_urls = re.sub(r"https?://\S+|\bwww\.\S+", "", raw_text)
    # Trimming happens after URL removal so a leading or trailing URL does not
    # leave an extra space beside the tags.
    text = re.sub(r"\s+", " ", without_urls).strip()
    return f"<summary> {text} </summary>"  # Add semantic tags
def remove_repeated_sentences(text):
    """Drop sentences that repeat an earlier one, keeping first occurrences.

    Sentences are delimited by ". ". Two sentences count as the same when they
    match after surrounding whitespace and a trailing period are ignored; this
    is what lets a repeated *final* sentence (which still carries the text's
    closing ".") be recognised as a duplicate.

    Args:
        text: Text whose sentences are separated by ". ".

    Returns:
        The text with repeated sentences removed, in original order. If the
        input ended with a period, the result does too.
    """
    seen = set()
    kept = []
    for sentence in text.split(". "):
        key = sentence.strip().rstrip(".").rstrip()
        if key not in seen:
            seen.add(key)
            kept.append(sentence)

    result = ". ".join(kept)
    # Dropping a repeated final sentence also drops the closing period that
    # came with it; put it back so the text stays terminated.
    if text.endswith(".") and not result.endswith("."):
        result += "."
    return result
def clean_summary(summary):
    """Tidy a generated summary.

    Strips any run of commas, periods and whitespace from the start and from
    the end of the text, then collapses internal whitespace runs to a single
    space.

    Args:
        summary: Decoded summary text.

    Returns:
        The cleaned summary string.
    """
    # Leading artifacts first, then trailing ones
    for edge_pattern in (r"^[,.\s]+", r"[,.\s]+$"):
        summary = re.sub(edge_pattern, "", summary)
    # Normalize whitespace
    collapsed = re.sub(r"\s+", " ", summary)
    return collapsed.strip()
def summarize_model2(text, model, tokenizer):
    """Summarize ``text`` in a single pass with beam search.

    The text is preprocessed with ``preprocess_article_with_context``,
    tokenized (truncated to 1024 tokens), summarized, and the decoded output
    is post-processed with ``remove_repeated_sentences`` and ``clean_summary``.

    Args:
        text: Raw article text.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the input and decode the output.

    Returns:
        The cleaned summary string.
    """
    # Preprocess input text
    text = preprocess_article_with_context(text)

    # Tokenize input; anything beyond 1024 tokens is truncated
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

    # temperature / top_p are intentionally not passed: they only apply to
    # sampling (do_sample=True) and were ignored by this beam search.
    outputs = model.generate(
        inputs.input_ids,
        max_length=150,
        min_length=50,
        num_beams=6,
        no_repeat_ngram_size=3,  # Never repeat the same 3-gram in the output
        length_penalty=1.5,  # Values above 1.0 favour longer beams
        early_stopping=True,
    )

    # Decode and clean output
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clean_summary(remove_repeated_sentences(summary))




# Streamlit App
st.title("Text Summarizer")
st.subheader("Enter a article to find its summary.")

# Input: text pasted into the text area, or an uploaded .txt file, which takes
# precedence over the text area when present.
input_text = st.text_area("Enter the article to summarize here", height=300)
uploaded_file = st.file_uploader("Or upload a text file", type="txt")

if uploaded_file is not None:
    try:
        input_text = uploaded_file.read().decode("utf-8")  # Decode file content to string
    except UnicodeDecodeError:
        # A non-UTF-8 upload used to abort the whole script run with a raw
        # traceback. Report it and treat the input as empty instead, so the
        # unrelated text-area content is not summarized by mistake.
        st.error("The uploaded file could not be decoded as UTF-8 text.")
        input_text = ""

# Dropdown for selecting summary type
summary_type = st.selectbox("Select the type of summary:", ["Short summary", "Long summary"])

if not input_text.strip():
    st.warning("Please enter some text to summarize.")
elif summary_type == "Short summary":
    # Load only the model this branch needs; load_model is wrapped in
    # st.cache_resource, so it is not reloaded on later reruns.
    model, tokenizer = load_model()
    st.write("Short summary: ")
    try:
        # Preprocess and summarize
        processed_text = preprocess_article(input_text)
        summary = summarize_long_text(processed_text, model, tokenizer)
        # Remove tag fragments that can survive decoding
        summary = summary.replace("title>", "").replace("/summary>", "")
        summary = remove_repeated_sentences(summary)
        st.write(summary)
    except Exception as e:
        st.error(f"An error occurred: {e}")
elif summary_type == "Long summary":
    # Same lazy, cached loading for the second model
    model2, tokenizer2 = load_model2()
    st.write("Long summary: ")
    try:
        # Preprocess and summarize
        summary = summarize_model2(input_text, model2, tokenizer2)
        summary = summary.replace("/summary>", "")
        st.write(summary)
    except Exception as e:
        st.error(f"An error occurred: {e}")



Writing app.py


In [None]:
!ngrok authtoken 2pyyrJpCg7XgQ3r43cdYdRVb9xk_2SVnaawhzpbsqU1iXyPDW

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
from pyngrok import ngrok
# Start the Streamlit server for app.py as a background process. All of its
# output is redirected to /dev/null, so startup errors in app.py will not be
# visible in this cell.
!streamlit run app.py &>/dev/null&
# Open an ngrok tunnel to local port 8501.
# NOTE(review): presumably Streamlit's default port, since no port option is
# passed above — confirm. Nothing here waits for the server to finish starting.
public_url = ngrok.connect(addr='8501')
print(f"Streamlit app is live at {public_url}")

Streamlit app is live at NgrokTunnel: "https://cf68-34-125-56-189.ngrok-free.app" -> "http://localhost:8501"


In [None]:
# NOTE(review): pyngrok is already imported, and the `ngrok` command already
# invoked, by the cells above, so on a fresh runtime this install has to run
# before them.
!pip install streamlit pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Downloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.2


In [None]:
text = "This is the first sentence. This is the second sentence. This is the third sentence."

# Break the text apart on the ". " sentence delimiter
sentences = text.split(". ")

# The opening sentence goes under <first>; everything after it goes under <body>
first_sentence, *remaining_sentences = sentences
tagged_text = f"<first> {first_sentence}. <body> {'. '.join(remaining_sentences)} "

# Show the tagged result
print(tagged_text)

<first> This is the first sentence. <body> This is the second sentence. This is the third sentence. 
