In [15]:
# %pip install PyPDF2
# %pip install pdfplumber
# %pip install PyMuPDF
# %pip install tika
# %pip install pdf2image
# %pip install pytesseract
# %pip install tiktoken
# %pip install unstructured
# Below are the dependent files of unstructured
# %pip install pi_heif
# %pip install unstructured_inference
# %pip install unstructured_pytesseract


### PyPDF2 is a pure Python library that can extract text from PDF files. It is easy to use and works well for many simple PDF documents.


In [16]:
import PyPDF2

def extract_text_pypdf2(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

extracted_text_pypdf2 = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pypdf2

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### pdfplumber is another powerful library that offers more advanced features for extracting text, including handling complicated layouts and tables.

In [17]:
import pdfplumber

def extract_text_pdfplumber(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

extracted_text_pdfplumber = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pdfplumber

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### PyMuPDF, also known as fitz, provides a fast and efficient way to extract text and images from PDF files. It supports many features, including text extraction from specific areas.

In [18]:
import fitz  # PyMuPDF

def extract_text_pymupdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text() + "\n"
    return text

extracted_text_pymupdf = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pymupdf

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### Apache Tika is a content analysis toolkit that can extract text from various document types, including PDFs. 

In [19]:
from tika import parser

def extract_text_tika(pdf_path):
    parsed = parser.from_file(pdf_path)
    return parsed['content']

extracted_text_tika = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_tika

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### If your PDF contains scanned images of text, you might need to use Optical Character Recognition (OCR) to extract the text. pytesseract can be used in conjunction with pdf2image to convert PDF pages to images first.

In [20]:
from pdf2image import convert_from_path
import pytesseract

def extract_text_ocr(pdf_path):
    text = ""
    images = convert_from_path(pdf_path)
    for image in images:
        text += pytesseract.image_to_string(image) + "\n"
    return text

extracted_text_ocr = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_ocr

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### The unstructured package is a powerful tool for processing and extracting useful information from various document types, including PDFs. It simplifies the task of parsing documents by providing a straightforward interface to extract structured data from unstructured content. The partition_pdf function specifically helps to break down a PDF into its constituent elements, such as text, images, and tables, allowing for more focused analysis and manipulation of the data.

In [21]:
from unstructured.partition.pdf import partition_pdf
# Returns a List[Element] present in the pages of the parsed pdf document
def extract_text_unstructured(pdf_path):
    elements = partition_pdf(pdf_path)
    text_parts = [element.text for element in elements]
    complete_text = "\n".join(text_parts)
    return complete_text

extracted_text_unstructured = extract_text_unstructured("./data/world-war-one.pdf")
extracted_text_unstructured

'Orishti World War I |\n¢ World War I (WW J), also known as the Great War, lasted from 28 July 1914 to 11 November 1918.\n¢ WW Iwas fought between the Allied Powers and the Central Powers. © The main members of the Allied Powers were France, Russia, and Britain. The United States also fought on the side of the Allies after 1917.\n© The main members of the Central Powers were Germany, Austria-Hungary, the Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several different events that took place in the years building up to 1914.\n¢ The new international expansionist policy of Germany: In 1890 the new emperor of Germany, Wilhelm II, began an international policy that sought to turn his country into a world power. Germany was seen as a threat by the other powers and destabilized the international situation. ¢ Mutual Defense Alliances: Countries throughout Europe made mutual defence agreements. These treaties mea

### Calcuate the tokens of the extracted text

In [22]:
import tiktoken

def calculate_token_length(text):
    encoding = tiktoken.get_encoding("cl100k_base")
    encoded_text = encoding.encode(text)
    length_of_tokens = len(encoded_text)
    return length_of_tokens

extracted_text_list = [extracted_text_pypdf2, extracted_text_pdfplumber, extracted_text_pymupdf, extracted_text_tika, extracted_text_ocr, extracted_text_unstructured]
print(f"Token length for pypdf2 is {calculate_token_length(extracted_text_pypdf2)}")
print(f"Token length for pdfplumber is {calculate_token_length(extracted_text_pdfplumber)}")
print(f"Token length for pymupdf is {calculate_token_length(extracted_text_pymupdf)}")
print(f"Token length for apache tika is {calculate_token_length(extracted_text_tika)}")
print(f"Token length for ocr is {calculate_token_length(extracted_text_ocr)}")
print(f"Token length for unstructured is {calculate_token_length(extracted_text_unstructured)}")

Token length for pypdf2 is 2143
Token length for pdfplumber is 2143
Token length for pymupdf is 2143
Token length for apache tika is 2143
Token length for ocr is 2143
Token length for unstructured is 2062
