In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [3]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import pandas as pd
import json
import re
import os
import openai
from openai import OpenAI
import time
import ast

Extract pages and page numbers

In [5]:
def extract_pages_text(pdf_path):
    """ Input: PDF files
        Output: list of pages as text(strings)
    """
    pages = []
    for page_layout in extract_pages(pdf_path):
        lines = []
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                lines.append(element.get_text())
        page_text = '\n'.join(lines)
        pages.append(page_text)
    return pages

Filter out noise: Cover-page,resumes,declaration, acknowledgements, table of contents,empty lines and References that may have names and locations that will hinder the location extraction. Left with main body

In [7]:
def is_toc_page(text):
    if "table of contents" in text or "contents" in text:
        return True
    if re.search(r'\.{5,}', text) and re.search(r'\d{1,3}\s*$', text, re.MULTILINE):
        return True
    if sum(1 for l in text.split('\n') if re.match(r'.*\d{1,3}\s*$', l)) > 5:
        return True
    return False

def is_ack_page(text):
    return "acknowledgement" in text or "acknowledgments" in text

def is_declaration_page(text):
    return "declaration" in text

def is_main_section_start(text):
    return bool(re.search(
        r'\b(?:1\.|chapter\s*1)[:\s-]*introduction\b|\bintroduction\b',
        text, re.IGNORECASE
    ))

def remove_empty_lines(text):
    return "\n".join(line for line in text.splitlines() if line.strip())

In [9]:
def remove_references_sections(page_texts):
    """
    Removes 'References' sections from each page and drops any blank pages.
    Input: list of page_texts (strings)
    Output: list of cleaned page_texts (strings)
    """
    cleaned_pages = []
    skip_mode = False
    for page in page_texts:
        lines = page.splitlines()
        cleaned_lines = []
        for line in lines:
            # Detect references header
            if not skip_mode and re.match(r'^\s*(\d+\.?)?\s*references\b', line, re.I):
                skip_mode = True
                continue
            # Exit skip mode if a new section/chapter starts
            if skip_mode and (
                re.match(r'^\s*(chapter|paper|section|abstract|introduction)\b', line, re.I) or
                re.match(r'^\s*(\d+\.?)?\s*(abstract|introduction|chapter|paper|section)\b', line, re.I)
            ):
                skip_mode = False
            if not skip_mode:
                cleaned_lines.append(line)
        # Remove empty lines
        non_empty = [l for l in cleaned_lines if l.strip()]
        # If after cleaning, page is not blank, keep it
        if non_empty:
            cleaned_pages.append('\n'.join(non_empty))
    return cleaned_pages

PDF to text with page mapping that will help with lookup later. Notably only the main body is converted to text. Coverpage is set to page 1 like what is viewed when pdf is read on a reading application e.g. Adobe Acrobat

In [12]:
def pdf_to_text_with_page_mapping(pdf_path):
    pages = extract_pages_text(pdf_path)
    body_pages = pages[1:]  # Removes cover page
    filtered_pages = []
    kept_pages = []
    skip_mode = None
    original_page_numbers = list(range(2, len(pages)+1))

    for idx, pg in enumerate(body_pages):
        pg_lower = pg.lower()
        if skip_mode == 'toc':
            if is_main_section_start(pg_lower):
                skip_mode = None
            elif is_toc_page(pg_lower):
                continue
            else:
                skip_mode = None
        elif skip_mode == 'ack':
            if is_main_section_start(pg_lower):
                skip_mode = None
            elif is_ack_page(pg_lower):
                continue
            else:
                skip_mode = None
        elif skip_mode == 'dec':
            if is_main_section_start(pg_lower):
                skip_mode = None
            elif is_declaration_page(pg_lower):
                continue
            else:
                skip_mode = None
        if skip_mode is None:
            if is_toc_page(pg_lower):
                skip_mode = 'toc'
                continue
            elif is_ack_page(pg_lower):
                skip_mode = 'ack'
                continue
            elif is_declaration_page(pg_lower):
                skip_mode = 'dec'
                continue
        filtered_pages.append(pg)
        kept_pages.append(original_page_numbers[idx])

    main_body_text = "\n\n".join(remove_empty_lines(pg) for pg in filtered_pages)

    return main_body_text, kept_pages, filtered_pages

Next to be done on multiple PDF academic theses. The output is a folder of text files to be used as input in location extraction. A page count report can be generated to check that the extraction has actually extracted main body and not removed relevant parts that are important especially since these are unstructured pdfs that may have editing that may cause issues with extraction pipeline.

In [15]:
def process_pdf_folder(folder_path, output_txt_folder=None, csv_report_path=None):
    """
    Processes all PDFs in the given folder:
    - Extracts text (and keeps track of page numbers).
    - Removes all content from the References section onwards.
    - Writes cleaned text to .txt files (with page markers).
    - Prints and optionally saves a count per file as CSV.
    """
    if output_txt_folder:
        os.makedirs(output_txt_folder, exist_ok=True)
    report = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            print(f"Processing: {filename}")
            try:
                main_text, kept_page_nums, kept_page_texts = pdf_to_text_with_page_mapping(pdf_path)
                kept_page_texts_norefs = remove_references_sections(kept_page_texts)
                kept_page_nums_norefs = kept_page_nums[:len(kept_page_texts_norefs)]

                if output_txt_folder:
                    txt_filename = os.path.splitext(filename)[0] + ".txt"
                    txt_path = os.path.join(output_txt_folder, txt_filename)
                    with open(txt_path, "w", encoding="utf-8") as f:
                        for page_num, page_text in zip(kept_page_nums_norefs, kept_page_texts_norefs):
                            f.write(f"\n--- Page {page_num} ---\n")
                            f.write(remove_empty_lines(page_text).strip() + "\n")
                report.append({"filename": filename, "kept_pages": len(kept_page_nums_norefs)})
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
    print("\n=== Page Count Report ===")
    for row in report:
        print(f"{row['filename']}: {row['kept_pages']} pages kept")
    if csv_report_path:
        pd.DataFrame(report).to_csv(csv_report_path, index=False)
    return report


In [19]:
if __name__ == "__main__":
    folder = "DATA_FOR_MODELS/data_pdf"
    out_folder = "DATA_FOR_MODELS/Data_txt"
    report_csv = "DATA_FOR_MODELS/pdf_page_report.csv"
    process_pdf_folder(folder, out_folder, report_csv)

Processing: 2007_Tshibubudze_THE MARKOYE FAULT_2007.pdf
Processing: 2008_MATABANE_FE3.pdf
Processing: 2009_Bontle Nkuna_0605886P_Honours Report.pdf
Processing: 2010_Matsheka_Irvin Final Thesis.pdf
Processing: 2010_Mohale_GIS interpretation of NE Burkina Faso.pdf
Processing: 2011_Peters_East Markoye_2011.pdf
Processing: 2011_Woolfe_The stratigraphy and metamorphic facies of the KEMB.pdf
Processing: 2012_Simoko_Petrology, geochemistry and structure of the Pissila batholith and the Saaba Zone gneiss.pdf
Processing: 2013_FUNYUFUNYU.pdf
Processing: 2013_Peters.pdf
Processing: 2013_Ramabulana_Sadiola Hill petrology.pdf
Processing: 2014_MSc_YOSSI.pdf
Processing: 2015_LeBrun_Siguiri.pdf
Processing: 2015_Masurel_phd.pdf

=== Page Count Report ===
2007_Tshibubudze_THE MARKOYE FAULT_2007.pdf: 49 pages kept
2008_MATABANE_FE3.pdf: 39 pages kept
2009_Bontle Nkuna_0605886P_Honours Report.pdf: 45 pages kept
2010_Matsheka_Irvin Final Thesis.pdf: 28 pages kept
2010_Mohale_GIS interpretation of NE Burkin