In [1]:
import fitz  # PyMuPDF
import re
import os
import csv
import unicodedata

In [2]:
# Your folder paths
input_folder = "pdfs"
output_folder = "csvs"

In [3]:
# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

In [4]:
# Step 2: Remove non-ASCII characters
def remove_non_ascii(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.replace('"', '')  # remove standard quotes
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [5]:
# Step 3: Smart newlines before capitalized words
def insert_newlines_before_caps(text):
    # Start new line before capitalized words (but not if preceded by space, caps, slash, hyphen)
    return re.sub(r'(?<![\sA-Z\-/()])([A-Z][a-z])', r'\n\1', text)

In [6]:
# Step 4: General newlines before abbreviations like NGO, U.S., AI, etc.
def insert_newlines_before_abbreviations(text):
    # All-caps abbreviations like NGOs, HTML, UN
    text = re.sub(r'(?<![\sA-Z])([A-Z]{2,}(s?)\.?)', r'\n\1', text)

    # Dotted abbreviations like U.S., U.S.A., E.U.
    text = re.sub(r'(?<![\sA-Z])((?:[A-Z]\.){2,}[A-Z]?\.?)', r'\n\1', text)

    return text

In [7]:
ACRONYMS = ["FBI", "CIA", "EPA", "NOAA", "NASA", "DOE"]

In [8]:
def insert_newlines_between_known_acronyms(text, acronyms):
    # Sort to match longer acronyms first
    acronyms = sorted(acronyms, key=len, reverse=True)

    for a1 in acronyms:
        # First, try to split a1 + a2
        for a2 in acronyms:
            if a1 != a2:
                combined = a1 + a2
                if combined in text:
                    text = text.replace(combined, f"{a1}\n{a2}\n")

        # If no match found, still try to insert newline after a1 if it's glued to something
        pattern = re.compile(rf'({a1})(?=[A-Z])')  # a1 followed by a capital letter
        text = pattern.sub(r'\1\n', text)

    return text

In [9]:
def insert_newlines_at_sentence_starts(text):
    # Match a period (or ! or ?) followed by space and a capital letter
    # Use a positive lookbehind to ensure sentence-ending punctuation
    return re.sub(r'(\s)([.!?]) ([A-Z])', r'\1\2\n\3', text)


In [10]:
def split_sentences_on_long_lines(text, threshold=100):
    lines = text.splitlines()
    result = []

    for line in lines:
        if len(line.strip()) > threshold:
            # Apply sentence splitting only on long lines
            split_line = re.sub(r'([.!?]) ([A-Z])', r'\1\n\2', line)
            result.extend(split_line.splitlines())
        else:
            result.append(line)

    return "\n".join(result)

In [11]:
def restore_exceptions(text):
    # Rejoin "Ameri\nCorps" if it was split
    text = text.replace("Bio\nTech", "BioTech")
    text = text.replace("Eco\nTourism", "EcoTourism")
    text = text.replace("Peace\nCorps", "PeaceCorps")
    text = text.replace("Peace Corps", "PeaceCorps")
    return text.replace("Ameri\nCorps", "AmeriCorps")

In [12]:
def trim_blank_and_outer_spaces(text):
    return "\n".join(
        line.strip() for line in text.splitlines() if line.strip() != "" 
    )

In [13]:
def apply_trim_rules(text, rules):
    lines = text.splitlines()
    result = []

    i = 0
    while i < len(lines):
        line = lines[i]

        # Check for remove_after marker
        if any(rule["type"] == "remove_after" and rule["marker"] in line for rule in rules):
            break

        # Skip lines that match any rule
        skip = False
        for rule in rules:
            if rule["type"] == "remove_if_contains" and rule["text"] in line:
                skip = True
                break
            elif rule["type"] == "remove_if_equals" and line.strip() == rule["text"]:
                skip = True
                break
            elif rule["type"] == "remove_if_regex_matches":
                if re.search(rule["pattern"], line):
                    skip = True
                    break
            elif rule["type"] == "remove_if_length_exceeds":
                if len(line.strip()) > rule["limit"]:
                    skip = True
                    break

        if not skip:
            result.append(line)

        i += 1

    return "\n".join(result)

In [14]:
# Define your trimming rules
trim_rules = [
    {"type": "remove_after", "marker": "Marketable Skills"},
    {"type": "remove_after", "marker": "Academic and Career Advising Center Services"},
    {"type": "remove_if_contains", "text": "Career Guide:"},
    {"type": "remove_if_contains", "text": "Possible Employers"},
    {"type": "remove_if_contains", "text": "The following list"},
    {"type": "remove_if_equals", "text": "Related Job Titles"},
    {"type": "remove_if_regex_matches", "pattern": r"Private.*:"},
    {"type": "remove_if_contains", "text": "State, County & Local Agencies:"},
    {"type": "remove_if_contains", "text": "Federal/National Programs"},
    {"type": "remove_if_contains", "text": "Federal Agencies:"}, 
    {"type": "remove_if_length_exceeds", "limit": 100}
]

In [15]:
def process_all_pdfs(input_folder, output_folder, trim_rules):
    for filename in os.listdir(input_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(input_folder, filename)
            base_name = os.path.splitext(filename)[0]
            output_path = os.path.join(output_folder, base_name + ".csv")

            # Full pipeline
            raw_text = extract_text_from_pdf(pdf_path)
            ascii_text = remove_non_ascii(raw_text)
            with_sentence_newlines = insert_newlines_at_sentence_starts(ascii_text)
            with_caps_on_newlines = insert_newlines_before_caps(with_sentence_newlines)
            with_abbrs_on_newlines = insert_newlines_before_abbreviations(with_caps_on_newlines)
            with_acronyms_split = insert_newlines_between_known_acronyms(with_abbrs_on_newlines, ACRONYMS)
            with_exceptions_restored = restore_exceptions(with_acronyms_split)
            with_sentence_splits = split_sentences_on_long_lines(with_exceptions_restored)
            fully_trimmed = trim_blank_and_outer_spaces(with_sentence_splits)
            final_text = apply_trim_rules(fully_trimmed, trim_rules)

            # Write each cleaned line as a row in a CSV
            with open(output_path, "w", newline='', encoding="utf-8") as f:
                writer = csv.writer(f)
                for line in final_text.splitlines():
                    writer.writerow([line])

            print(f"Saved CSV to: {output_path}")

In [16]:
# Call the loop
process_all_pdfs(input_folder, output_folder, trim_rules)

Saved CSV to: csvs/Business%20Admin_0.csv
Saved CSV to: csvs/World%20Languages%20%26%20Cultures.csv
Saved CSV to: csvs/Eco%20Resto.csv
Saved CSV to: csvs/Environmental%20Science_0.csv
Saved CSV to: csvs/Oceanography_0.csv
Saved CSV to: csvs/Psychology%202013.csv
Saved CSV to: csvs/Physics%20%26%20Astronomy_0.csv
Saved CSV to: csvs/Religious%20Studies.csv
Saved CSV to: csvs/NR%20Recreation.csv
Saved CSV to: csvs/Forestry_0.csv
Saved CSV to: csvs/Communication_0.csv
Saved CSV to: csvs/Environmental%20Management%20and%20Protection_0.csv
Saved CSV to: csvs/Rangeland%20Resources_0.csv
Saved CSV to: csvs/Music.csv
Saved CSV to: csvs/Fisheries_0.csv
Saved CSV to: csvs/Education%20%26%20Interpretation.csv
Saved CSV to: csvs/Dance%20Studies.csv
Saved CSV to: csvs/Philosophy.csv
Saved CSV to: csvs/Environmental%20Studies.csv
Saved CSV to: csvs/History.csv
Saved CSV to: csvs/Geospatial%20Science.csv
Saved CSV to: csvs/Anthropology.csv
Saved CSV to: csvs/Theatre%20Arts.csv
Saved CSV to: csvs/Compu