In [None]:
!pip install pdfplumber requests pytesseract pdf2image transformers nltk

import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from transformers import pipeline
import re
import requests
import nltk

# Download necessary NLTK data
# NOTE(review): nothing in the visible cells uses nltk after this download - confirm it is still needed.
nltk.download('punkt')

# Initialize the summarizer pipeline.
# The model and revision are pinned to the ones transformers reported falling back to when
# none was supplied, so the summaries stay the same if the library's default changes.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

# Function to download and save the PDF
def download_pdf(url, filename, timeout=30):
    """Download `url` and save the response body to `filename`.

    Args:
        url: Address of the PDF to fetch.
        filename: Local path the bytes are written to (opened in binary write mode).
        timeout: Seconds to wait on the server before requests gives up. Without a
            timeout, requests.get may block indefinitely on an unresponsive host.

    Returns:
        True when the server answered with status 200 and the file was written,
        False otherwise (nothing is written in that case).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to download the PDF.")
        return False

    with open(filename, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded successfully and saved as '{filename}'")
    return True

# Function to perform OCR on an image and extract text
def ocr_image(image):
    """Return the text pytesseract recognises in `image`."""
    recognised_text = pytesseract.image_to_string(image)
    return recognised_text

# Function to extract text from PDF using pdfplumber and OCR where necessary
def extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, one page per line block.

    Each page is first read with pdfplumber's extract_text(). When that yields
    nothing (None or an empty string), the page is rendered with
    convert_from_path and passed through ocr_image instead. Every page that ends
    up with text contributes that text followed by a newline; pages with no text
    contribute nothing.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        # convert_from_path is called with 1-based page numbers, so count from 1.
        for page_number, page in enumerate(document.pages, start=1):
            content = page.extract_text()

            if not content:
                rendered = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
                # If several images come back, the OCR result of the last one is kept.
                for picture in rendered:
                    content = ocr_image(picture)

            if content:
                chunks.append(content + "\n")

    return ''.join(chunks)

# Function to find all likely chapter titles in the PDF
def find_chapter_titles(full_text):
    """Return the lines of `full_text` that look like chapter or section headings.

    A line counts as a heading when, after trimming surrounding whitespace, it
    is at least two characters long, starts with an uppercase ASCII letter and
    otherwise contains only letters, digits, spaces/tabs and the punctuation
    ``: , . ( ) - &``.

    The text is examined one line at a time. The previous single regular
    expression allowed newlines inside a title (its character class contained
    ``\\s``), so one "title" could span many lines, and it consumed the newline
    after each match, so a heading directly following another was missed.

    Args:
        full_text: The extracted document text.

    Returns:
        A list of heading strings in document order (duplicates are kept).
    """
    # The optional Chapter/Section keyword of the old pattern added nothing:
    # those words are already covered by the character class.
    heading = re.compile(r'[A-Z][A-Za-z0-9 \t:,.()\-&]+')

    titles = []
    # splitlines() also breaks on form feeds, which OCR output can contain.
    for raw_line in full_text.splitlines():
        candidate = raw_line.strip()
        if heading.fullmatch(candidate):
            titles.append(candidate)
    return titles

# Function to summarize text using NLP
def generate_summary(text):
    """Summarize `text` with the module-level `summarizer` pipeline.

    Args:
        text: The text to condense.

    Returns:
        A fixed notice when `text` has fewer than 50 characters; otherwise the
        'summary_text' of the first result returned by the pipeline.
    """
    if len(text) < 50:
        return "Content is too short to summarize."
    # truncation=True lets the pipeline cut inputs that are longer than the model
    # accepts, instead of failing when a chapter is long.
    summary = summarizer(text, max_length=150, min_length=30, do_sample=False, truncation=True)
    return summary[0]['summary_text']

# Function to get chapter text and summarize it
def generate_chapter_summary(chapter_title, full_text):
    """Find `chapter_title` in `full_text` and summarize the text that follows it.

    The title is matched literally (it is regex-escaped) at its first occurrence.
    The chapter body runs from the end of the title up to the first position where
    the lookahead matches, or to the end of the text. The body is stripped and
    handed to generate_summary.

    Returns:
        The result of generate_summary for the body, or an apology string when
        the title does not occur in the text.
    """
    # NOTE(review): the lookahead's character class includes lowercase letters and
    # whitespace, so it can stop at ordinary body lines, not only at headings - confirm
    # this boundary is what is intended.
    found = re.search(
        rf"{re.escape(chapter_title)}(.*?)(?=\n[A-Za-z0-9\s:,.()\-&]+(?:Chapter|CHAPTER|Section|SECTION)?\n|$)",
        full_text,
        re.DOTALL,
    )
    if found is None:
        return "Sorry, I couldn't find the chapter you're looking for."

    body = found[1].strip()
    return generate_summary(body)

# Main function to start bot and process chapters
def start_bot(pdf_path, url):
    """Download a PDF, list its detected chapter titles and answer summary requests.

    Args:
        pdf_path: Local path the PDF is saved to and then read from.
        url: Address the PDF is downloaded from.

    The function prints the detected chapter titles and then reads chapter names
    from input() in a loop, printing a summary for each, until the user types
    'bye' (any letter case). It returns None.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    download_pdf(url, pdf_path)

    # download_pdf only prints a message when the download fails, so make sure
    # there is a file to read before trying to open it.
    if not os.path.isfile(pdf_path):
        print(f"Bot: '{pdf_path}' is not available, so there is nothing to summarize.")
        return

    # Extract text from PDF with OCR assistance
    pdf_text = extract_pdf_text(pdf_path)

    # Detect and display chapter titles
    chapter_titles = find_chapter_titles(pdf_text)
    print("Available chapters:", chapter_titles)

    # Interactive bot loop
    print("Hello! I am the Retrieval Learning Bot. Type the name of a chapter to get a summary. Type 'bye' to end.")

    while True:
        user_input = input("You: ").strip()

        if user_input.lower() == 'bye':
            print("Bot: Goodbye!")
            break

        # An empty title would match at the very start of the document and
        # produce a meaningless summary, so ask again instead.
        if not user_input:
            print("Bot: Please type the name of a chapter, or 'bye' to end.")
            continue

        # Generate summary for the requested chapter
        summary = generate_chapter_summary(user_input, pdf_text)
        print("Bot:", summary)

# Specify the PDF URL and path
pdf_url = 'https://www.iitp.ac.in/~ai-nlp-ml/resources/talks/mlppt.pdf'
pdf_path = 'mlppt.pdf'

# Start the bot
start_bot(pdf_path, pdf_url)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


PDF downloaded successfully and saved as 'mlppt.pdf'
Available chapters: ['MACHINE LEARNING', 'APPLICATIONS', 'SUPERVISED LEARNING', 'REGRESSION PROBLEM', 'CLASSIFICATION PROBLEM', 'UNSUPERVISED LEARENING', 'Unsupervised Learning.\nSupervised Learning\n\nx\nx', 'Q\n\nXy\n\nX2\n\nUnsupervised Learning\n\n6', 'O\n\nXy\n\x0c\nLINEAR REGRESSION', 'LINEAR REGRESSION', 'MODEL REPRESENTATION\nMODEL REPRESENTATION\nMODEL REPRESENTATION', 'MODEL REPRESENTATION\nGiven a training set ,\nHypothesis :\nwhere\nHow to choose\nMODEL REPRESENTATION\nCOST FUNCTION\nIdea : Choose , so that is as close to y\nfor our training examples (x , y) .\nCOST FUNCTION\nSo going with the idea ,\nIf hypothesis :\nWe want to minimise the squared error function\ni.e. i.e the difference\nbetween the predicted value ( ) and the\nactual label (y) should be as minimum as\npossible.\nCOST FUNCTION\nTherefore, the cost function becomes\nwhere summation( ) represents sum over my\ntraining set from i to m .\nand where ( ) repr

Your max_length is set to 150, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)


Bot:  It is the science of getting computer to learn without being programmed to learn . It is also the science to get computers to learn more quickly and more easily .
You: APPICATIONS
Bot: Sorry, I couldn't find the chapter you're looking for.
You: APPLICATIONS


Your max_length is set to 150, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


Bot:  • Search Engines like Google, Bing etc. • Facebook photo tagging application. • Self Customizing Programs and many more. • Search engines such as Google and Facebook .
You: Bye
Bot: Goodbye!


In [None]:
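# Install Poppler in Google Colab (system package that pdf2image's convert_from_path relies on).
# NOTE(review): this cell comes after the cell that calls convert_from_path; on a fresh
# runtime it needs to run before that cell.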
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 0s (396 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Install the Tesseract OCR system package (the engine pytesseract calls for OCR).
# NOTE(review): this cell comes after the cell that uses pytesseract; on a fresh
# runtime it needs to run before that cell.
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,747 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123653 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-