In [5]:
import os
import fitz  
import editdistance
from collections import defaultdict

def pdf_to_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        text += page.get_text()
    return text

def save_text_file(text, txt_path):
    with open(txt_path, 'w') as f:
        f.write(text)

def normalize_text(text):
    text = text.replace('\n', ' ')
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text.lower().strip()

def bag_of_characters_error_rate(output_text, ref_text):
    output_freq = defaultdict(int)
    ref_freq = defaultdict(int)

    for char in output_text:
        output_freq[char] += 1

    for char in ref_text:
        ref_freq[char] += 1

    total_chars = sum(ref_freq.values())
    incorrect_chars = 0
    for char in set(output_text + ref_text):
        incorrect_chars += abs(output_freq[char] - ref_freq[char])

    cer = incorrect_chars / total_chars if total_chars > 0 else 0.0
    return cer * 100

def calculate_wer(output_text, ref_text):
    output_words = normalize_text(output_text).split()
    ref_words = normalize_text(ref_text).split()
    
    # Levenshtein distance (edit distance) for alignment
    distance = editdistance.eval(output_words, ref_words)

    # Compute WER
    wer = distance / len(ref_words) if len(ref_words) > 0 else 0.0
    return wer * 100

def main(pdf_directory, txt_directory, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    total_chars = 0
    total_incorrect_chars = 0
    total_ref_words = 0
    total_distance = 0
    results = []

    for filename in os.listdir(pdf_directory):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_directory, filename)
            extracted_text = pdf_to_text(pdf_path)

            # Save the extracted text to a .txt file
            txt_filename = os.path.splitext(filename)[0] + '.txt'
            txt_path = os.path.join(output_directory, txt_filename)
            save_text_file(extracted_text, txt_path)

            # Compare with the reference .txt file
            ref_txt_filename = os.path.splitext(filename)[0] + '.txt'
            ref_txt_path = os.path.join(txt_directory, ref_txt_filename)

            if os.path.exists(ref_txt_path):
                with open(ref_txt_path, 'r') as f:
                    reference_text = f.read()

                cer = bag_of_characters_error_rate(extracted_text, reference_text)
                wer = calculate_wer(extracted_text, reference_text)
                
                output_chars = len(extracted_text)
                ref_chars = len(reference_text)
                total_chars += ref_chars
                total_incorrect_chars += (ref_chars * cer / 100)  # approximate incorrect chars
                total_ref_words += len(reference_text.split())
                total_distance += editdistance.eval(normalize_text(extracted_text).split(), normalize_text(reference_text).split())
                
                results.append((filename, cer, wer))
                
                print(f'{filename}: CER = {cer:.2f}%, WER = {wer:.2f}%')
            else:
                print(f'Text file {ref_txt_filename} not found for {filename}')

    overall_cer = (total_incorrect_chars / total_chars * 100) if total_chars > 0 else 0.0
    overall_wer = (total_distance / total_ref_words * 100) if total_ref_words > 0 else 0.0

    # Save overall results to a summary file
    summary_path = os.path.join(output_directory, 'summary.txt')
    with open(summary_path, 'w') as f:
        for filename, cer, wer in results:
            f.write(f'{filename}: CER = {cer:.2f}%, WER = {wer:.2f}%\n')
        f.write(f'Overall CER = {overall_cer:.2f}%\n')
        f.write(f'Overall WER = {overall_wer:.2f}%\n')

    print(f'Overall CER = {overall_cer:.2f}%, Overall WER = {overall_wer:.2f}%')

if __name__ == '__main__':
    pdf_directory = '/Users/Laura/Desktop/ECE496-project-2/syllabus/tables/smallset'
    txt_directory = '/Users/Laura/Desktop/ECE496-project-2/syllabus/tables/text_doc'
    output_directory = '/Users/Laura/Desktop/ECE496-project-2/pymupdf_table'

    main(pdf_directory, txt_directory, output_directory)


APS112.pdf: CER = 1.96%, WER = 0.18%
APS111.pdf: CER = 2.03%, WER = 0.17%
CIVE-ENVE-4918.pdf: CER = 3.53%, WER = 0.16%
1-syllabus.pdf: CER = 1.05%, WER = 0.27%
Overall CER = 2.22%, Overall WER = 0.19%


In [6]:
import os
import fitz  
import editdistance
from collections import defaultdict

def pdf_to_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        text += page.get_text()
    return text

def save_text_file(text, txt_path):
    with open(txt_path, 'w') as f:
        f.write(text)

def normalize_text(text):
    text = text.replace('\n', ' ')
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text.lower().strip()

def bag_of_characters_error_rate(output_text, ref_text):
    output_freq = defaultdict(int)
    ref_freq = defaultdict(int)

    for char in output_text:
        output_freq[char] += 1

    for char in ref_text:
        ref_freq[char] += 1

    total_chars = sum(ref_freq.values())
    incorrect_chars = 0
    for char in set(output_text + ref_text):
        incorrect_chars += abs(output_freq[char] - ref_freq[char])

    cer = incorrect_chars / total_chars if total_chars > 0 else 0.0
    return cer * 100

def calculate_wer(output_text, ref_text):
    output_words = normalize_text(output_text).split()
    ref_words = normalize_text(ref_text).split()
    distance = editdistance.eval(output_words, ref_words)
    wer = distance / len(ref_words) if len(ref_words) > 0 else 0.0
    return wer * 100

def main(pdf_directory, txt_directory, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    total_chars = 0
    total_incorrect_chars = 0
    total_ref_words = 0
    total_distance = 0
    results = []

    for filename in os.listdir(pdf_directory):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_directory, filename)
            extracted_text = pdf_to_text(pdf_path)

            # Save the extracted text to a .txt file
            txt_filename = os.path.splitext(filename)[0] + '.txt'
            txt_path = os.path.join(output_directory, txt_filename)
            save_text_file(extracted_text, txt_path)

            # Compare with the reference .txt file
            ref_txt_filename = os.path.splitext(filename)[0] + '.txt'
            ref_txt_path = os.path.join(txt_directory, ref_txt_filename)

            if os.path.exists(ref_txt_path):
                with open(ref_txt_path, 'r') as f:
                    reference_text = f.read()

                cer = bag_of_characters_error_rate(extracted_text, reference_text)
                wer = calculate_wer(extracted_text, reference_text)
                
                output_chars = len(extracted_text)
                ref_chars = len(reference_text)
                total_chars += ref_chars
                total_incorrect_chars += (ref_chars * cer / 100)  # approximate incorrect chars
                total_ref_words += len(reference_text.split())
                total_distance += editdistance.eval(normalize_text(extracted_text).split(), normalize_text(reference_text).split())
                
                results.append((filename, cer, wer))
                
                print(f' {filename}: CER = {cer:.2f}%, WER = {wer:.2f}%')
            else:
                print(f'Text file {ref_txt_filename} not found for {filename}')

    overall_cer = (total_incorrect_chars / total_chars * 100) if total_chars > 0 else 0.0
    overall_wer = (total_distance / total_ref_words * 100) if total_ref_words > 0 else 0.0

    # Save overall results to a summary file
    summary_path = os.path.join(output_directory, 'summary.txt')
    with open(summary_path, 'w') as f:
        for filename, cer, wer in results:
            f.write(f'{filename}: CER = {cer:.2f}%, WER = {wer:.2f}%\n')
        f.write(f'Overall CER = {overall_cer:.2f}%\n')
        f.write(f'Overall WER = {overall_wer:.2f}%\n')

    print(f'Overall CER = {overall_cer:.2f}%, Overall WER = {overall_wer:.2f}%')

if __name__ == '__main__':
    pdf_directory = '/Users/Laura/Desktop/ECE496-project-2/syllabus/plainpdf/pdf'
    txt_directory = '/Users/Laura/Desktop/ECE496-project-2/syllabus/plainpdf/text_doc'
    output_directory = '/Users/Laura/Desktop/ECE496-project-2/pymupdf_plain'

    main(pdf_directory, txt_directory, output_directory)


 Capstone_Syllabus_Summer_2.pdf: CER = 4.48%, WER = 0.00%
 MECH-45X-Syllabus-2019.pdf: CER = 3.10%, WER = 0.07%
 CS4704Proposal.pdf: CER = 6.37%, WER = 0.35%
 ASUCapstoneDesign.pdf: CER = 2.60%, WER = 7.29%
 CapstoneSyllabus2017.pdf: CER = 3.04%, WER = 0.00%
 TAMUT_Capstone.pdf: CER = 7.21%, WER = 0.21%
 ELT-495_syllabus_ELT-495.pdf: CER = 0.96%, WER = 11.92%
 CE438W.pdf: CER = 5.97%, WER = 0.00%
 syllabus-ce 495B.pdf: CER = 11.26%, WER = 0.00%
Overall CER = 4.07%, Overall WER = 3.27%
