In [15]:
import pdfplumber
from urllib.parse import urlparse

def is_valid_url(string):
    """Return True when ``string`` parses as a URL with a scheme and a host.

    Args:
        string: Candidate text, e.g. one whitespace-delimited token of a page.

    Returns:
        bool: True if ``urlparse`` yields both a non-empty scheme and a
        non-empty network location; False otherwise, including for input
        that ``urlparse`` rejects with ``ValueError``.
    """
    try:
        parsed_url = urlparse(string)
    except ValueError:
        # urlparse raises on malformed authorities such as an unbalanced "["
        # (e.g. "http://[broken"). Such a token is simply not a URL; it must
        # not abort the scan of the rest of the document.
        return False
    return bool(parsed_url.scheme and parsed_url.netloc)

def extract_links_from_text(text, page_num):
    """Collect the distinct URLs found in the text of a single page.

    The text is split on whitespace and every token accepted by
    ``is_valid_url`` is reported once, in order of first appearance.
    De-duplication is local to this call.

    Args:
        text: Text of the page. ``None`` or an empty string yields no links.
        page_num: Zero-based page index; reported one-based in the result.

    Returns:
        list[dict]: One ``{"url": <str>, "page": <int>}`` entry per distinct
        URL, where ``page`` is ``page_num + 1``.
    """
    if not text:
        # Nothing to scan; also guards against a caller passing None.
        return []

    links = []
    unique_urls = set()
    # split() with no arguments already discards surrounding whitespace, so
    # each token can be used as the URL as-is.
    for word in text.split():
        if word in unique_urls or not is_valid_url(word):
            continue
        unique_urls.add(word)
        links.append({"url": word, "page": page_num + 1})
    return links

def extract_links(pdf_file_path):
    """Extract the URLs written in the text of every page of a PDF.

    Each page's text is handed to ``extract_links_from_text`` together with
    the page's zero-based index, and the per-page results are concatenated
    in page order.

    Args:
        pdf_file_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        list[dict]: The concatenated per-page link entries.
    """
    links = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # NOTE(review): extract_text() appears to return None rather than
            # "" for text-less pages in some pdfplumber releases - confirm
            # against the installed version. Normalising here keeps the
            # helper from being handed a non-string either way.
            text = page.extract_text() or ""
            links.extend(extract_links_from_text(text, page_num))
    return links


# PDF scanned by this cell.
input_pdf_file = "e1.pdf"
extracted_links = extract_links(input_pdf_file)

# One record per link: a "URL:" line, a "Page:" line, then a blank line.
for link_info in extracted_links:
    print(f"URL: {link_info['url']}", f"Page: {link_info['page']}", "", sep="\n")


URL: https://unt.instructure.com/login/ldap
Page: 1

URL: https://joinhandshake.com/
Page: 1

URL: https://example.com/path?param=value#fragment
Page: 1



In [17]:
import fitz

def extract_footer(pdf_file_path, min_font_size=10, max_font_size=14, footer_margin=100):
    """Return the text of every span that starts in the bottom band of a page.

    For each page, the ``"dict"`` text layout is walked block -> line -> span
    and a span is kept when the second coordinate of its bounding box
    (``bbox[1]``) is at or beyond ``page.rect.height - footer_margin``.

    Args:
        pdf_file_path: Path of the PDF, passed to ``fitz.open``.
        min_font_size: Accepted for backward compatibility; not used.
        max_font_size: Accepted for backward compatibility; not used.
        footer_margin: Height of the band at the bottom of the page that is
            treated as the footer, in the same units as ``page.rect``.

    Returns:
        list[str]: Whitespace-stripped span texts, in page order and, within
        a page, in the order the layout dict lists them.

    NOTE(review): the font-size bounds were never applied by this function.
    Spans are selected by position only, so body text that reaches into the
    bottom band is returned as well. Decide whether the size filter should be
    enforced before relying on the result as "footers only".
    """
    footers = []

    pdf_document = fitz.open(pdf_file_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Spans starting at or below this coordinate count as footer text.
            footer_top = page.rect.height - footer_margin

            page_text = page.get_text("dict")
            for block in page_text.get("blocks", []):
                # .get() tolerates blocks that carry no "lines" entry.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        if span["bbox"][1] >= footer_top:
                            footers.append(span["text"].strip())
    finally:
        # Release the file handle even if a page fails to load or parse.
        pdf_document.close()

    return footers

# PDF whose bottom-of-page text is listed by this cell.
input_pdf_file = "pdf1.pdf"
extracted_footers = extract_footer(input_pdf_file)

# Each collected span goes on its own output line.
for footer in extracted_footers:
    print(footer, sep="")


Chapter 2 Software Processes
1
Updated 06/06/2021
Chapter 2 Software Processes
2
Updated 06/06/2021
some particular perspective.
Chapter 2 Software Processes
3
Updated 06/06/2021
Chapter 2 Software Processes
4
Updated 06/06/2021
Updated 06/06/2021
Chapter 2 Software Processes
5
Graphical representation of the various stages of a typical SDLC
Chapter 2 Software Processes
6
Updated 06/06/2021
Chapter 2 Software Processes
7
Updated 06/06/2021
Updated 06/06/2021
Chapter 2 Software Processes
8
Chapter 2 Software Processes
9
Updated 06/06/2021
process begins only if the previous phase is complete.
Updated 06/06/2021
Chapter 2 Software Processes
10
Chapter 2 Software Processes
11
Updated 06/06/2021
process that incorporates elements from all of these
models.
Chapter 2 Software Processes
12
Updated 06/06/2021
Updated 06/06/2021
Chapter 2 Software Processes
13
Updated 06/06/2021
Chapter 2 Software Processes
14
• Process and results are well documented.
Updated 06/06/2021
Chapter 2 Software Proc

In [19]:
import pdfplumber
from urllib.parse import urlparse

class UniqueURLs:
    """Registry of URLs that have already been reported.

    A thin wrapper around a ``set`` so that one instance can be shared across
    several calls and de-duplicate across all of them.
    """

    def __init__(self):
        # The URLs recorded so far.
        self.urls = set()

    def add(self, url):
        """Record ``url``; recording the same value again changes nothing."""
        self.urls.add(url)

    def contains(self, url):
        """Return True if ``url`` is among the recorded URLs."""
        return url in self.urls

    def __contains__(self, url):
        """Allow ``url in registry`` as an equivalent of :meth:`contains`."""
        return self.contains(url)

def is_valid_url(string):
    """Return True when ``string`` parses as a URL with a scheme and a host.

    Args:
        string: Candidate text, e.g. one whitespace-delimited token of a page.

    Returns:
        bool: True if ``urlparse`` yields both a non-empty scheme and a
        non-empty network location; False otherwise, including for input
        that ``urlparse`` rejects with ``ValueError``.
    """
    try:
        parsed_url = urlparse(string)
    except ValueError:
        # urlparse raises on malformed authorities such as an unbalanced "["
        # (e.g. "http://[broken"). Such a token is simply not a URL; it must
        # not abort the scan of the rest of the document.
        return False
    return bool(parsed_url.scheme and parsed_url.netloc)

def extract_links_from_text(text, page_num, unique_urls):
    """Collect the URLs in one page's text that have not been reported yet.

    The text is split on whitespace; every token accepted by
    ``is_valid_url`` and not yet known to ``unique_urls`` is reported and
    then recorded in ``unique_urls``.

    Args:
        text: Text of the page. ``None`` or an empty string yields no links.
        page_num: Zero-based page index; reported one-based in the result.
        unique_urls: Registry exposing ``contains(url)`` and ``add(url)``.
            It is updated in place, so passing the same instance for every
            page de-duplicates across pages.

    Returns:
        list[dict]: One ``{"url": <str>, "page": <int>}`` entry per newly
        seen URL, in order of appearance, where ``page`` is ``page_num + 1``.
    """
    if not text:
        # Nothing to scan; also guards against a caller passing None.
        return []

    links = []
    # split() with no arguments already discards surrounding whitespace, so
    # each token can be used as the URL as-is.
    for word in text.split():
        if not is_valid_url(word):
            continue
        if unique_urls.contains(word):
            continue
        links.append({"url": word, "page": page_num + 1})
        unique_urls.add(word)
    return links

def extract_links(pdf_file_path):
    """Extract the distinct URLs written in the text of a PDF.

    A single ``UniqueURLs`` registry is shared across all pages, so each URL
    is reported once for the whole document, attributed to the first page
    on which it is found.

    Args:
        pdf_file_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        list[dict]: The link entries produced by ``extract_links_from_text``
        for each page, concatenated in page order.
    """
    links = []
    unique_urls = UniqueURLs()
    with pdfplumber.open(pdf_file_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            # NOTE(review): extract_text() appears to return None rather than
            # "" for text-less pages in some pdfplumber releases - confirm
            # against the installed version. Normalising here keeps the
            # helper from being handed a non-string either way.
            text = page.extract_text() or ""
            links.extend(extract_links_from_text(text, page_num, unique_urls))
    return links

# PDF scanned by this cell.
input_pdf_file = "e1.pdf"
extracted_links = extract_links(input_pdf_file)

# One record per link: a "URL:" line, a "Page:" line, then a blank line.
for link_info in extracted_links:
    print(f"URL: {link_info['url']}", f"Page: {link_info['page']}", "", sep="\n")


URL: https://unt.instructure.com/login/ldap
Page: 1

URL: https://joinhandshake.com/
Page: 1

URL: https://example.com/path?param=value#fragment
Page: 1



In [None]:
import fitz
from PIL import Image
import io
import imagehash
from IPython.display import display
import os
import pytesseract
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2

def display_non_duplicate_images_from_pdf(pdf_path):
    """Display each visually distinct image embedded in a PDF, once.

    Pages are visited in order. Every image referenced by a page is decoded
    with Pillow and fingerprinted with ``imagehash.phash``; an image is shown
    with ``display`` only if its hash has not been seen earlier in the
    document.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        None. The images are rendered as a side effect.
    """
    pdf_document = fitz.open(pdf_path)
    seen_xrefs = set()
    unique_image_hashes = set()

    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)

            for image_info in page.get_images(full=True):
                xref = image_info[0]
                # The same xref yields the same bytes and therefore the same
                # hash, so a repeat would be rejected below anyway; skipping
                # it here just avoids decoding and hashing it again.
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                base_image_data = pdf_document.extract_image(xref)
                image = Image.open(io.BytesIO(base_image_data["image"]))
                image_hash = imagehash.phash(image)
                if image_hash in unique_image_hashes:
                    continue
                unique_image_hashes.add(image_hash)
                display(image)
    finally:
        # Release the file handle even if an image fails to decode.
        pdf_document.close()

# PDF whose embedded images are rendered below, duplicates suppressed.
pdf_path = "pdf1.pdf"
display_non_duplicate_images_from_pdf(pdf_path)
