In [1]:
#%pip install PyPDF2
#%pip install pdfplumber
#%pip install PyMuPDF
#%pip install tika
#%pip install pdf2image
#%pip install pytesseract
#%pip install tiktoken


### PyPDF2 is a pure Python library that can extract text from PDF files. It is easy to use and works well for many simple PDF documents.


In [2]:
import PyPDF2

def extract_text_pypdf2(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

extracted_text_pypdf2 = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pypdf2

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### pdfplumber is another powerful library that offers more advanced features for extracting text, including handling complicated layouts and tables.

In [3]:
import pdfplumber

def extract_text_pdfplumber(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

extracted_text_pdfplumber = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pdfplumber

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### PyMuPDF, also known as fitz, provides a fast and efficient way to extract text and images from PDF files. It supports many features, including text extraction from specific areas.

In [4]:
import fitz  # PyMuPDF

def extract_text_pymupdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text() + "\n"
    return text

extracted_text_pymupdf = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_pymupdf

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### Apache Tika is a content analysis toolkit that can extract text from various document types, including PDFs. 

In [5]:
from tika import parser

def extract_text_tika(pdf_path):
    parsed = parser.from_file(pdf_path)
    return parsed['content']

extracted_text_tika = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_tika

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### If your PDF contains scanned images of text, you might need to use Optical Character Recognition (OCR) to extract the text. pytesseract can be used in conjunction with pdf2image to convert PDF pages to images first.

In [6]:
from pdf2image import convert_from_path
import pytesseract

def extract_text_ocr(pdf_path):
    text = ""
    images = convert_from_path(pdf_path)
    for image in images:
        text += pytesseract.image_to_string(image) + "\n"
    return text

extracted_text_ocr = extract_text_pypdf2("./data/world-war-one.pdf")
extracted_text_ocr

'World War I\ndrishtiias.com\n/printpdf/world-war-i-1\nWorld War I (WW I), also known as the Great War, lasted from 28 July 1914 to 11\nNovember 1918.\nWW I was fought between the Allied Powers and the Central Powers.\nThe main members of the \nAllied Powers\n were France, Russia, and Britain.\nThe United States also fought on the side of the Allies after 1917.\nThe main members of the \nCentral Powers\n were Germany, Austria-Hungary,\nthe Ottoman Empire, and Bulgaria.\nCauses of the War\nThere was no single event that led to World War I. The war happened because of several\ndifferent events that took place in the years building up to 1914.\nThe new international expansionist policy of Germany:\n In 1890 the new\nemperor of Germany, Wilhelm II, began an international policy that sought to turn\nhis country into a world power. Germany was seen as a threat by the other powers\nand destabilized the international situation.\nMutual Defense Alliances:\n Countries throughout Europe made mutu

### Calcuate the tokens of the extracted text

In [7]:
import tiktoken

def calculate_token_length(text):
    encoding = tiktoken.get_encoding("cl100k_base")
    encoded_text = encoding.encode(text)
    length_of_tokens = len(encoded_text)
    return length_of_tokens

extracted_text_list = [extracted_text_pypdf2, extracted_text_pdfplumber, extracted_text_pymupdf, extracted_text_tika, extracted_text_ocr]
print(f"Token length for pypdf2 is {calculate_token_length(extracted_text_pypdf2)}")
print(f"Token length for pdfplumber is {calculate_token_length(extracted_text_pdfplumber)}")
print(f"Token length for pymupdf is {calculate_token_length(extracted_text_pymupdf)}")
print(f"Token length for apache tika is {calculate_token_length(extracted_text_tika)}")
print(f"Token length for ocr is {calculate_token_length(extracted_text_ocr)}")
    

Token length for pypdf2 is 2143
Token length for pdfplumber is 2143
Token length for pymupdf is 2143
Token length for apache tika is 2143
Token length for ocr is 2143
