<a href="https://colab.research.google.com/github/Rishav03Raj/Automated_text_summarization/blob/main/summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatic Text Summarization

**Import Libraries**

In [8]:
# Import necessary libraries
import re
import nltk
import heapq
from string import punctuation
from google.colab import files
import io

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
# Install the pypdf library to handle PDF files.# Install required libraries for PDF, AI, and article scraping
!pip install pypdf
!pip install -q -U google-generativeai
!pip install newspaper3k
!pip install pypdf



In [3]:
# Import the specialized libraries
from pypdf import PdfReader
import google.generativeai as genai
# Uninstall and reinstall newspaper3k to ensure correct dependency
!pip uninstall -y newspaper3k
!pip install newspaper3k
# Uninstall existing lxml and lxml_html_clean and install lxml with html_clean extra
!pip uninstall -y lxml lxml_html_clean
!pip install lxml[html_clean]
from newspaper import Article # This is the advanced scraper we will use

Found existing installation: newspaper3k 0.2.8
Uninstalling newspaper3k-0.2.8:
  Successfully uninstalled newspaper3k-0.2.8
Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Using cached newspaper3k-0.2.8-py3-none-any.whl (211 kB)
Installing collected packages: newspaper3k
Successfully installed newspaper3k-0.2.8
Found existing installation: lxml 6.0.1
Uninstalling lxml-6.0.1:
  Successfully uninstalled lxml-6.0.1
Found existing installation: lxml_html_clean 0.4.2
Uninstalling lxml_html_clean-0.4.2:
  Successfully uninstalled lxml_html_clean-0.4.2
Collecting lxml[html_clean]
  Using cached lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting lxml_html_clean (from lxml[html_clean])
  Using cached lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Using cached lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl (5.3 MB)
Using cached lxml_html_clean-0.4.2-py3-none-any.whl (14 kB

**2. Configure the Gemini API (for AI Summary Option)**

In [4]:
# IMPORTANT: Paste your Gemini API key here.
# Get your key from https://aistudio.google.com/

from google.colab import userdata

# It's recommended to use Colab Secrets for your API key.
# Click the key icon on the left sidebar to add a new secret named 'API_KEY'.
API_KEY = userdata.get('API_KEY')

genai.configure(api_key=API_KEY)

**Choose Summarization Method**

In [5]:
# Ask the user to choose between summarizing from a URL or uploading a file.
source_choice = input("Enter '1' to summarize from a URL, or '2' to upload a file: ")
article_text = ""

Enter '1' to summarize from a URL, or '2' to upload a file: 1


**Fetch and Parse the Article**

In [6]:
# --- Step 1: Get the Text from the Source ---

# If the user chose to summarize from a URL.
if source_choice == '1':
    article_url = input("Please enter the article URL: ")
    try:
        # Use the newspaper3k library to download and parse the article.
        article = Article(article_url)
        article.download()
        article.parse()
        # Extract the main text from the parsed article.
        article_text = article.text
    except Exception as e:
        print(f"Error scraping the URL: {e}")
        article_text = ""

# If the user chose to upload a file.
elif source_choice == '2':
    print("Please upload a .txt or .pdf file.")
    uploaded = files.upload()
    for file_name in uploaded.keys():
        if file_name.endswith('.txt'):
            article_text = uploaded[file_name].decode('utf-8')
        elif file_name.endswith('.pdf'):
            pdf_file = io.BytesIO(uploaded[file_name])
            reader = PdfReader(pdf_file)
            for page in reader.pages:
                 article_text += page.extract_text()

# Handle invalid input.
else:
    print("Invalid choice for source.")

# --- Step 2: Clean the Extracted Text ---
if article_text:
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    article_text = re.sub(r'\s+', ' ', article_text)

Please enter the article URL: https://www.geeksforgeeks.org/dsa/longest-palindromic-substring/


**5. Choose Summarization Type and Generate Summary**

In [9]:
if article_text:
    # Ask user for summary type
    summary_choice = input("Enter '1' for a Fast Summary, or '2' for an AI Summary: ")

    # --- Option 1: Fast (Frequency-Based) Summary ---
    if summary_choice == '1':
        print("\n--- GENERATING FAST SUMMARY ---")
        # 1. Preprocess for frequency analysis
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

        # 2. Tokenize and calculate word frequencies
        sentence_list = nltk.sent_tokenize(article_text)
        stopwords = nltk.corpus.stopwords.words('english')
        word_frequencies = {}
        for word in nltk.word_tokenize(formatted_article_text):
            if word.lower() not in stopwords and word.lower() not in punctuation:
                if word not in word_frequencies.keys():
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

        # 3. Calculate weighted frequencies
        if word_frequencies:
            maximum_frequency = max(word_frequencies.values())
            for word in word_frequencies.keys():
                word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

        # 4. Score sentences
        sentence_scores = {}
        for sent in sentence_list:
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies.keys():
                    if len(sent.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word]
                        else:
                            sentence_scores[sent] += word_frequencies[word]

        # 5. Get top sentences
        if sentence_scores:
            summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
            summary = ' '.join(summary_sentences)
            print("\n--- YOUR FAST SUMMARY ---\n")
            print(summary)
        else:
            print("Could not generate a fast summary. The text may be too short or lack sufficient English content.")


    # --- Option 2: AI (Gemini) Summary ---
    elif summary_choice == '2':
        print("\n--- GENERATING AI SUMMARY (this may take a moment) ---")
        try:
            # Use a valid model name. You can replace 'gemini-1.5-flash-latest' with another available model from the list above.
            model = genai.GenerativeModel('gemini-1.5-flash-latest')

            prompt = "Analyze the following text from a page. First, identify the page's purpose (e.g., news article, product page). Then, provide a concise summary of its key information. Text:\n\n" + article_text
            response = model.generate_content(prompt)
            print("\n--- YOUR AI SUMMARY ---\n")
            print(response.text)
        except Exception as e:
            print(f"An error occurred with the AI model: {e}")
            print("Please ensure your API key is correct and valid and that you are using an available model name.")


    # Handle invalid input
    else:
        print("Invalid choice for summary type.")
else:
    print("No text was extracted. Cannot generate a summary.")

Enter '1' for a Fast Summary, or '2' for an AI Summary: 1

--- GENERATING FAST SUMMARY ---

--- YOUR FAST SUMMARY ---

substring ( start , start + maxLen ); } public static void main ( String args ) { String s = "forgeeksskeegfor" ; System . substring ( start , start + maxLen ); } // Driver Code const s = "forgeeksskeegfor" ; console . Substring ( start , maxLen ); } public static void Main ( string args ) { string s = "forgeeksskeegfor" ; Console . Substring ( start , maxLen ); } static void Main ( string args ) { string s = "forgeeksskeegfor" ; Console . If the current palindrome length (high - low + 1) is greater than the previous maximum, update the starting index and max length. If the substring from i to j is not a palindrome, then the substring from i-1 to j+1 will also not be a palindrome. C++ #include <iostream> #include <vector> using namespace std ; string getLongestPal ( string s ) { int n = s .
