# Problem 1-3 Information Extraction from PDF Documents 

In [1]:
from pathlib import Path

## 1. Different methods for string extraction from PDF

### Try two different methods

In [2]:
from PyPDF2 import PdfReader
def conversion_with_pypdf2(pdf_file):
    """Extract the text of all pages of pdf_file with PyPDF2.

    The text of each page is followed by a newline; the pages are concatenated in document order.

    Parameters:
    pdf_file : Path
        the PDF file to read (handed unchanged to PyPDF2's PdfReader)

    Returns:
    str
        the concatenated page texts
    """
    # approach taken from the first answer of
    # https://stackoverflow.com/questions/23821204/read-pdf-in-python-and-convert-to-text-in-pdf
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() + "\n" for page in reader.pages)

import pypdfium2 as pdfium
def conversion_with_pypdfium2(pdf_file):
    """Extract the text of all pages of pdf_file with PyPDFium2.

    The text of each page is followed by a newline; the pages are concatenated in document order.
    The text page, the page and the document are closed even if the extraction fails,
    so that no PDFium handles stay open after an exception.

    Parameters:
    pdf_file : Path
        the PDF file to read

    Returns:
    str
        the concatenated page texts
    """
    # approach taken from the first answer of
    # https://stackoverflow.com/questions/23821204/read-pdf-in-python-and-convert-to-text-in-pdf
    pdf = pdfium.PdfDocument(pdf_file.as_posix())
    page_texts = []
    try:
        for i in range(len(pdf)):
            page = pdf.get_page(i)
            try:
                textpage = page.get_textpage()
                try:
                    page_texts.append(textpage.get_text() + "\n")
                finally:
                    # close in reverse order of creation: text page, then page, then document
                    textpage.close()
            finally:
                page.close()
    finally:
        pdf.close()
    return "".join(page_texts)

def read_pdf_file_write_txt(dest_dir, conversion_method):
    """Extract the text from the PDF files in the subdirectories of "./PDF_to_process".

    The function expects the PDF files in (direct) subdirectories of "./PDF_to_process",
    each subdirectory for one group (e.g. "flyers").
    It writes the extracted text as text files in corresponding subdirectories of dest_dir.
    It creates dest_dir (including missing parent directories) and its subdirectories if necessary.

    Entries of "./PDF_to_process" that are not directories and entries of the group directories
    that are not files with the suffix ".pdf" (case-insensitive) are skipped,
    so stray files such as ".DS_Store" do not abort the run.
    Directories and files are processed in sorted order.

    Parameters:
    dest_dir : Path
        path where the results shall be stored
    conversion_method : Callable[[Path], str]
        function that accepts a path to a pdf file and returns a string with the extracted text

    Raises:
    AssertionError
        if "./PDF_to_process" is not an existing directory
    """
    pdf_files_dir = Path("./PDF_to_process")
    assert pdf_files_dir.is_dir(), "the directory ./PDF_to_process does not exist"
    dest_dir.mkdir(parents=True, exist_ok=True)
    for pdf_subdir in sorted(pdf_files_dir.iterdir()):
        if not pdf_subdir.is_dir():
            continue  # a stray file next to the group directories
        dest_subdir = dest_dir / pdf_subdir.name
        dest_subdir.mkdir(exist_ok=True)
        for pdf_file in sorted(pdf_subdir.iterdir()):
            if not pdf_file.is_file() or pdf_file.suffix.lower() != ".pdf":
                continue  # only PDF files can be handed to the conversion method
            text = conversion_method(pdf_file)
            txt_file = dest_subdir / (pdf_file.stem + ".txt")
            txt_file.write_text(text)

In [3]:
# Directories that receive the text files produced by the two extraction methods.
pypdfium2_result_path = Path("./txt_created_with_pypdfium2")
pypdf2_result_path = Path("./txt_created_with_pypdf2")

In [4]:
# Convert the PDF files with PyPDF2 and write the results below pypdf2_result_path.
read_pdf_file_write_txt(pypdf2_result_path, conversion_with_pypdf2)

In [5]:
# Convert the PDF files with PyPDFium2 and write the results below pypdfium2_result_path.
read_pdf_file_write_txt(pypdfium2_result_path, conversion_with_pypdfium2)

### Quantitative analysis with <code>SequenceMatcher.ratio()</code>

In [6]:
def load_txt(path):
    """Return the content of the text files in the subdirectories of the given path.

    It expects that there are subdirectories of path containing text files.
    It returns a dictionary mapping those subdirectory names to dictionaries
    that map the file names (without suffixes) to strings with the content of the files,
    so the returned nested dictionary represents the directory structure.

    Entries of path that are not directories and entries of the subdirectories
    that are not files with the suffix ".txt" are ignored,
    so stray files such as ".DS_Store" neither raise an error nor show up in the result.
    Directories and files are visited in sorted order, which gives the dictionaries a stable key order.

    Parameters:
    path : Path
        path where the content should be loaded from

    Returns:
    dict[str, dict[str, str]]
        a nested dictionary containing strings with the file contents
    """
    return {
        subdir.name: {
            txt_file.stem: txt_file.read_text()
            for txt_file in sorted(subdir.iterdir())
            if txt_file.is_file() and txt_file.suffix == ".txt"
        }
        for subdir in sorted(path.iterdir())
        if subdir.is_dir()
    }

In [7]:
from difflib import SequenceMatcher
def compare_txt(dict_one, dict_two):
    """Compare the strings in the given nested dictionaries pairwise.

    Both dictionaries are expected to be nested with one level and to have identical keys,
    with strings as the innermost values.
    Every string of dict_one is compared with the string stored under the same keys in dict_two
    using difflib.SequenceMatcher.ratio().

    Parameters:
    dict_one, dict_two : dict[str, dict[str, str]]
        nested dictionaries with identical structure containing the texts to be compared

    Returns:
    dict[str, dict[str, float]]
        a nested dictionary with the same structure as dict_one holding the floats returned by
        difflib.SequenceMatcher.ratio()

    Raises:
    KeyError
        if dict_two lacks a group or file name that is present in dict_one
    """
    ratios = {}
    for group, files in dict_one.items():
        group_ratios = {}
        for filename, content in files.items():
            matcher = SequenceMatcher(a=content, b=dict_two[group][filename])
            group_ratios[filename] = matcher.ratio()
        ratios[group] = group_ratios
    return ratios

In [8]:
# Load the text files written by both methods and compare them file by file;
# the nested dictionary of similarity ratios is shown as the cell output.
pypdf2_results = load_txt(pypdf2_result_path)
pypdfium_results = load_txt(pypdfium2_result_path)
print("Similarity measure of the created files:")
compare_txt(pypdf2_results, pypdfium_results)

Similarity measure of the created files:


{'flyers': {'bundeswehr': 0.8889301466259725,
  'wegweiser_senioren': 0.9595438968062616,
  'bahnstadt': 0.9255838880301983},
 'scans': {'double_ocr': 0.9810289122690746, 'single_ocr': 0.9478320446314965},
 'iban': {'liste1': 0.9995294117647059}}

### Qualitative (and further quantitative) analysis

#### Flyers

SequenceMatcher finds the least similarity in the strings extracted from the Bundeswehr flyer.
The Bundeswehr flyer is actually a large table, containing lots of organizations, their addresses, phone numbers, and so on.
The cells of the table contain line breaks because their content wouldn't fit into the cells otherwise,
but often they don't have any semantic meaning.
However, both PyPDFium2 and PyPDF2 reproduce these line breaks in the extracted text.
Even worse, PyPDF2 does not introduce any line breaks between the cells: apart from the line breaks within the cells, it only introduces line breaks between rows of the table.
This leads to severe problems:
For example, the last digit of the postal code in the address of the first tank division ("1. Panzerdivision") is 3. This digit is the only content of the last line in the corresponding address cell, and the e-mail address given in the following cell is "1.PzDivPressestelle@Bundeswehr.org".
This leads to a highly misleading line "31.PzDivPressestelle@" in the created text.
(The second part of the e-mail address is given in the next line due to a line break in the PDF,
but that is the same with PyPDFium2.)
Consequently, the text file produced by PyPDFium2 has more lines (20'264 vs. 18'256).

The opposite is true for the Bahnstadt flyer (2703 vs. 3029).
The reason is probably that PyPDFium2 ignores line breaks after a hyphen:
it replaces the hyphen with a character that is represented as an orange dot in Jupyter Notebook.

According to SequenceMatcher, the Senioren-Wegweiser strings have the largest similarity.
However, the tendency of PyPDFium2 to rather use newlines, already seen in the Bundeswehr flyer, has an even stronger effect here: 3627 vs. 91 lines.
The flyer contains subsections, each with the postal and e-mail addresses and phone numbers of an institution for elderly people.
The phone numbers and the parts of the postal addresses are given in one line each in the PDF document.
PyPDF2 puts them into one line, yet with spaces between them so that they don't get mixed up as is the case with the Bundeswehr flyer.
However, the approach of PyPDFium2 to keep the linebreaks from the PDF
so that each phone number, e-mail address and so on stays in a single line is probably way better for further analysis like automated address extraction.

#### Scans

SequenceMatcher finds a higher similarity for the double_ocr scans than for all flyers.
Looking through them manually, one does not find any relevant differences, either.
The strings extracted from the single_ocr PDFs show one big difference:
This time, PyPDF2 uses way more line breaks (47 vs. 154),
but in neither case are numbers mixed up, contrary to the situation with the Bundeswehr flyer.

#### IBAN

Both methods create very similar strings.
The PDF is a table, like the Bundeswehr flyer.
In contrast to that flyer, even PyPDFium2 does not insert newlines between table cells here.
The order of table, footnotes and footer is inverted (relative to the PDF) in both extracted strings,
yet there is one important difference between the two methods:
PyPDF2 merges footnote number one with the footer, which ends with 2021, leading to the date "1. Januar 20211" (1st of January, 20211 ...).
That's the same problem as with the Bundeswehr flyer.

### Decision for one method

The files created with PyPDFium2 will be used for the further tasks
since PyPDF2 mixes up numbers in tables or footers (particularly in the Bundeswehr flyer).

However, there is a problem with PyPDFium2:
Hyphens that were introduced not for semantic reasons but for line breaks are represented as "\x02" in the resulting string.
Therefore, these characters must be removed before the string can be used.

In [9]:
# The text files created with PyPDFium2 are the input for the extraction tasks below.
txt_files_path = pypdfium2_result_path

## 3. Extract phone numbers, e-mail addresses and IBANs

### (i) Phone numbers

In [10]:
import re
# PyPDFium2 represents hyphens that were only introduced for line breaks as "\x02"
# (see the last paragraph under "Decision for one method"), so this character is
# removed from every loaded text before the texts are searched.
substitute_prog = re.compile("\\x02")
txt_strings = {group:{filename:substitute_prog.sub("", content) for filename, content in files.items()}
              for group, files in load_txt(txt_files_path).items()}

ir_result_path = Path("./IR_results") # the extracted phone numbers, URLs and so on will be stored here
ir_result_path.mkdir(exist_ok=True)

def extract_and_write_results(txt_strings, result_filename, extraction_method, result_dir=None):
    """Extract phone numbers, URLs etc. from the given strings and write them to disk.

    The function calls extraction_method with each string in txt_strings as argument,
    collects the results in a set (so they will be unique)
    and writes them in sorted order (one instance on a line) to the file result_filename in result_dir.
    Sorting makes the content of the file identical from run to run;
    the iteration order of a set of strings is not.

    Parameters:
    txt_strings : Iterable[str]
        iterable over the strings to process
    result_filename : str
        name of the file (will be overwritten if existing) in result_dir
        where the results will be stored
    extraction_method : Callable[[str], Iterable[str]]
        function that extracts the phone numbers, URLs etc. from the string
    result_dir : Path, optional
        directory for the result file; it is created if missing.
        Defaults to the module-level ir_result_path.
    """
    if result_dir is None:
        result_dir = ir_result_path
    result_dir.mkdir(parents=True, exist_ok=True)
    unique_results = set()
    for raw_string in txt_strings:
        unique_results.update(extraction_method(raw_string))
    (result_dir / result_filename).write_text("\n".join(sorted(unique_results)))

In [11]:
# A phone number candidate starts with one of the listed prefixes, continues with any run of
# digits and the separators "-", "(", ")", "/" and " ", may go on after a single line break
# with further digit groups, and ends with a digit.
# The run of digits and separators is written as one character class. The nested form
# "(?:[0-9]*[\-()/ ]*)+" accepts exactly the same strings, but it backtracks exponentially
# when a prefix is followed by a long run of separators without a digit (e.g. "(-----").
phone_numbers_prog = re.compile(r"(?:\+ ?|\(|0 6|0 7|0800-?|116 ?|062|0 ?17|0 ?18|0 ?71)[0-9\-()/ ]*\n?(?:-? ?[0-9]+[\- ]?)*[0-9]")
# Everything except digits and "+" is stripped from a candidate.
not_digits_prog = re.compile("[^0-9+]")

def extract_phone_numbers(raw_string):
    """Extract phone numbers from the raw_string.

    The returned phone numbers contain only digits and "+" (at the beginning like +496224172139).
    Candidates with five or fewer characters are dropped unless they are one of
    "112", "110" or "19222".

    Parameters:
    raw_string : str
        where the phone numbers shall be extracted from

    Returns:
        a lazy iterable over the found phone numbers, in order of appearance
    """
    candidates = (not_digits_prog.sub("", raw_result)
                  for raw_result in phone_numbers_prog.findall(raw_string))
    return (number for number in candidates
            if len(number) > 5 or number in ("112", "110", "19222"))

# Search all flyer texts for phone numbers and store the unique hits in "phone_numbers.txt".
extract_and_write_results(txt_strings["flyers"].values(), "phone_numbers.txt", extract_phone_numbers)

### (ii) E-mail-addresses and URLs

In [12]:
# Local part and domain may contain line breaks because addresses are wrapped in the extracted text.
# A match ends with one delimiter character ("/", whitespace, "," or "+") that is not part of the address.
e_mail_address_prog = re.compile(r"(?:(?!\d\n)[\w\n.-])+@(?:(?!www)[\w\n.-])+\.(?:de|com|org|[a-z\n]+[a-z])(?:/|\s|,|\+)")

def extract_e_mail_addresses(raw_string):
    """Extract e-mail addresses from the raw string.

    Every match of e_mail_address_prog is cleaned up before it is returned:
    the trailing delimiter character is cut off, the fragments "Heidelberg\\n" and "E-Mail\\n"
    are removed (presumably text from the preceding line that got caught by the pattern),
    and finally all remaining line breaks are removed.

    Parameters:
    raw_string : str
        where the e-mail addresses shall be extracted from

    Returns:
        a lazy iterable over the found e-mail addresses, in order of appearance
    """
    def tidy(raw_match):
        address = raw_match[:-1]  # the last character is the delimiter, not part of the address
        for fragment in ("Heidelberg\n", "E-Mail\n", "\n"):
            address = address.replace(fragment, "")
        return address

    return (tidy(raw_match) for raw_match in e_mail_address_prog.findall(raw_string))

# Search all flyer texts for e-mail addresses and store the unique hits in "e_mail_addresses.txt".
extract_and_write_results(txt_strings["flyers"].values(), "e_mail_addresses.txt", extract_e_mail_addresses)

In [13]:
# A match starts with "http://" (optionally followed by "https://") or with "www",
# may contain line breaks, and ends with one delimiter character ("/", whitespace, "," or "+").
url_prog = re.compile(r"(?:http://(?:https://)?|www)[\w\n.-]+\.(?:de|com|org|[a-z\n]+[a-z])(?:/|\s|,|\+)")

def extract_urls(raw_string):
    """Extract URLs from the raw string.

    The function accepts addresses like www.example.com although they are not valid URLs
    according to RFC 3986. It turns them into well-formed URLs, so in the example case
    the returned list would contain the entry http://www.example.com.
    A match that starts with "http://https://" yields two entries,
    one with the scheme "http://" and one with the scheme "https://".
    The trailing delimiter character and all line breaks are removed from each match.

    Parameters:
    raw_string : str
        where the URLs shall be extracted from

    Returns:
    list[str]
        the found URLs, in order of appearance
    """
    double_scheme = "http://https://"
    urls = []
    for raw_match in url_prog.findall(raw_string):
        candidate = raw_match[:-1].replace("\n", "")
        if candidate.startswith(double_scheme):
            remainder = candidate[len(double_scheme):]
            urls.extend(["http://" + remainder, "https://" + remainder])
        elif candidate.startswith("http"):
            urls.append(candidate)
        else:
            urls.append("http://" + candidate)
    return urls

# Search all flyer texts for URLs and store the unique hits in "URLs.txt".
extract_and_write_results(txt_strings["flyers"].values(), "URLs.txt", extract_urls)

### (iii) IBANs

In [14]:
# Two capital letters and two digits, a space, any number of space-terminated groups of
# four capital letters or digits, and a final group of one to four digits.
iban_prog = re.compile("[A-Z]{2}[0-9]{2} (?:[A-Z0-9]{4} )*[0-9]{1,4}")
# The bound method iban_prog.findall already has the signature expected of an extraction method.
extract_and_write_results(txt_strings["iban"].values(), "IBAN.txt", iban_prog.findall)

## 4. Use the method from 3. (i) on a new text

In [15]:
# Apply the phone number extraction to the text of the single_ocr scan.
list(extract_phone_numbers(txt_strings["scans"]["single_ocr"]))

['01791173263',
 '062219831331',
 '01788957264',
 '01712819736',
 '017632137783',
 '01751699490',
 '018122008708',
 '01707781800',
 '01726239609',
 '01775987162',
 '017623232774',
 '01773434363',
 '01711734982']

In [16]:
# Apply the phone number extraction to the text of the double_ocr scan.
list(extract_phone_numbers(txt_strings["scans"]["double_ocr"]))

['062211383613',
 '0622143',
 '062211383620',
 '06221140714',
 '062215850930',
 '0622197370',
 '0622150259595',
 '0622160430',
 '06221604360',
 '06202859430']

Given that the scans were made from phone books, it's rather disappointing that only so few phone numbers were found.
On the other hand, this is not a surprise since the regexes are adapted to the flyers.