In [None]:
# Colab setup: install the third-party packages used by the cells below
# (pdf2image, editdistance, and PyMuPDF, which provides the `fitz` module).
!pip install pdf2image
!pip install editdistance
!pip install pymupdf
# NOTE(review): no visible cell imports `tools` -- confirm this install is still needed.
!pip install tools




In [None]:
# Mount Google Drive at /content/drive; every path used by the later cells
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import fitz
import pdf2image
from pdf2image import convert_from_path
import editdistance
import pandas as pd
import time
from collections import defaultdict
import codecs
import re
import tabulate
from tabulate import tabulate
# Directory that receives one extracted .txt file per input document
# (created on first run, left alone if it already exists).
output = r'/content/drive/MyDrive/ECE496/ECE496-project/pymupdf'
os.makedirs(output, exist_ok=True)
# Directory whose files are handed to write_files() for text extraction.
pdfs = r"/content/drive/MyDrive/ECE496/ECE496-project/syllabus/tables/pdf"


def pdf_to_text(pdf_path):
    """Extract the plain text of every page of a document, in page order.

    Args:
        pdf_path: Path of the file to open with PyMuPDF (``fitz.open``).

    Returns:
        str: The ``get_text()`` output of all pages concatenated without any
        separator; an empty string when the document has no pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return ''.join(page.get_text() for page in doc)

def write_files(dir):
    """Extract the text of every file in ``dir`` and save it as a ``.txt`` file.

    Sub-directories are skipped; every other directory entry is passed to
    ``pdf_to_text``. Each result is written to the module-level ``output``
    directory under the source file's name with its extension replaced by
    ``.txt``, overwriting any existing file of that name.

    Args:
        dir: Path of the directory whose files should be converted.

    Returns:
        None.
    """
    for file in os.listdir(dir):
        pdf = os.path.join(dir, file)
        if os.path.isdir(pdf):
            continue
        text = pdf_to_text(pdf)
        output_file = os.path.join(output, f'{os.path.splitext(file)[0]}.txt')
        # Write UTF-8 explicitly: read_and_normalize() decodes these files as
        # UTF-8, and the platform default encoding may not be able to encode
        # every character extracted from a document.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)

# Run the extraction now: convert every file in `pdfs` to a .txt file in `output`.
write_files(pdfs)

def bag_of_characters_error_rate(output_text, ref_text):
    """Order-insensitive character error rate of ``output_text``, in percent.

    Both texts are reduced to character-frequency tables ("bags"). The error
    is the sum, over every character seen in either text, of the absolute
    difference between its two counts, divided by the number of characters in
    the reference. Character order is ignored, and the result can exceed 100
    when the output contains many characters the reference lacks.

    Args:
        output_text: Text produced by the extractor.
        ref_text: Ground-truth text to compare against.

    Returns:
        float: The error rate multiplied by 100. Returns 0.0 when ``ref_text``
        is empty, matching how ``calculate_wer`` treats an empty reference,
        instead of raising ``ZeroDivisionError``.
    """
    from collections import Counter  # local import keeps this block self-contained

    total_chars = len(ref_text)
    if total_chars == 0:
        return 0.0

    output_freq = Counter(output_text)
    ref_freq = Counter(ref_text)

    # Counter returns 0 for characters it has not seen, so the union of both
    # key sets covers characters that are missing from either side.
    incorrect_chars = sum(
        abs(output_freq[char] - ref_freq[char])
        for char in output_freq.keys() | ref_freq.keys()
    )

    cer = incorrect_chars / total_chars
    return cer * 100

def read_and_normalize(file_path):
    """Load a text file and return its normalized content as a single line.

    The file is decoded as UTF-8 with undecodable bytes dropped. Lines that
    contain only whitespace are discarded; each remaining line is passed
    through ``normalize_text`` and the results are joined with one space.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The normalized lines joined by single spaces.
    """
    normalized_lines = []
    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle.readlines():
            if not raw_line.strip():
                # Blank line: contributes nothing to the joined text.
                continue
            normalized_lines.append(normalize_text(raw_line))
    return ' '.join(normalized_lines)

def normalize_text(text):
    """Reduce text to lowercase alphanumerics separated by whitespace.

    Newlines are turned into spaces, every character that is neither
    alphanumeric nor whitespace is removed, and the result is lower-cased and
    stripped of leading and trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        str: The normalized string.
    """
    # Strings are immutable, so the newline replacement has to produce a new
    # string; assigning to a loop variable would leave ``text`` unchanged.
    text = text.replace('\n', ' ')
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text.lower().strip()

def calculate_wer(output_text, ref_text):
    """Word error rate of ``output_text`` against ``ref_text``, in percent.

    Both texts are normalized with ``normalize_text`` and split on whitespace.
    The edit distance between the two word lists, as computed by
    ``editdistance.eval``, is divided by the number of reference words.

    Args:
        output_text: Text produced by the extractor.
        ref_text: Ground-truth text to compare against.

    Returns:
        float: The word error rate multiplied by 100, or 0.0 when the
        reference contains no words.
    """
    hypothesis_tokens = normalize_text(output_text).split()
    reference_tokens = normalize_text(ref_text).split()

    # Word-level edit distance between the two token sequences.
    edits = editdistance.eval(hypothesis_tokens, reference_tokens)

    reference_count = len(reference_tokens)
    if reference_count > 0:
        ratio = edits / reference_count
    else:
        # No reference words: report zero rather than dividing by zero.
        ratio = 0.0
    return ratio * 100

def extract_number(filename):
    """Return the first run of digits found in ``filename`` as an integer.

    Args:
        filename: The string to scan for digits.

    Returns:
        int | float: The leftmost digit run converted with ``int``, or
        ``float('inf')`` when the string contains no digits.
    """
    first_digits = re.search(r'\d+', filename)
    if first_digits is None:
        return float('inf')
    return int(first_digits.group())

In [None]:
import os
import pandas as pd

def extract_basename(file_name):
    """Return ``file_name`` with its final extension removed.

    Args:
        file_name: A file name or path.

    Returns:
        str: Everything before the last extension, as split by
        ``os.path.splitext``.
    """
    stem, _extension = os.path.splitext(file_name)
    return stem

# Folder holding the reference texts and folder holding the extracted texts
# that are scored against them.
ref_folder = r"/content/drive/MyDrive/ECE496/ECE496-project/syllabus/tables/text_doc"
output_folder = r"/content/drive/MyDrive/ECE496/ECE496-project/pymupdf"

ref_files = os.listdir(ref_folder)
output_files = os.listdir(output_folder)

# Index both folders by basename (file name without extension) so the
# reference and the extraction of the same document can be paired.
ref_dict = {extract_basename(f): f for f in ref_files}
output_dict = {extract_basename(f): f for f in output_files}

# The two middle columns hold the normalized texts themselves, not file
# names, so the headers are labelled accordingly.
table_headers = ["File", "Reference Text", "Output Text", "CER Score", "WER Score"]
table_data = []
total_cer = 0.0
total_wer = 0.0
num_pairs = 0

# Score each reference that has a matching output. Iterating in sorted order
# keeps the row order reproducible, since os.listdir() order is arbitrary.
for basename in sorted(ref_dict):
    ref_file = ref_dict[basename]
    if basename not in output_dict:
        # Make unmatched references visible instead of dropping them silently.
        print(f"Skipping {ref_file}: no matching output file in {output_folder}")
        continue
    output_file = output_dict[basename]

    ref_path = os.path.join(ref_folder, ref_file)
    output_path = os.path.join(output_folder, output_file)

    # Read and normalize reference and output texts
    ref_text = read_and_normalize(ref_path)
    output_text = read_and_normalize(output_path)

    # Both scores are percentages (already multiplied by 100).
    cer_score = bag_of_characters_error_rate(output_text, ref_text)
    wer_score = calculate_wer(output_text, ref_text)

    # Accumulate scores and count pairs
    total_cer += cer_score
    total_wer += wer_score
    num_pairs += 1

    table_data.append([basename, ref_text, output_text, cer_score, wer_score])

# Create DataFrame for results
df = pd.DataFrame(table_data, columns=table_headers)

# Save to Excel
excel_file = r"/content/drive/MyDrive/ECE496/ECE496-project/pymupdf_scores.xlsx"
df.to_excel(excel_file, sheet_name='Scores', index=False)

print(f"Results saved to Excel file: {excel_file}")

# Overall scores are unweighted means over the scored pairs; fall back to 0.0
# when nothing was paired so the summary line can still be printed.
if num_pairs > 0:
    overall_cer = total_cer / num_pairs
    overall_wer = total_wer / num_pairs
else:
    overall_cer = 0.0
    overall_wer = 0.0

print(f"\nOverall CER and WER for {num_pairs} pairs: {overall_cer:.4f}, {overall_wer:.4f}")


{'1-syllabus': '1-syllabus.txt', 'CIVE-ENVE-4918': 'CIVE-ENVE-4918.txt', 'APS111': 'APS111.txt', 'APS112': 'APS112.txt', 'ASU_human_system_capstone': 'ASU_human_system_capstone.txt'}
{'1-syllabus': '1-syllabus.txt', 'ASU_human_system_capstone': 'ASU_human_system_capstone.txt', 'APS111': 'APS111.txt', 'APS112': 'APS112.txt', 'CIVE-ENVE-4918': 'CIVE-ENVE-4918.txt'}
1-syllabus
CIVE-ENVE-4918
APS111
APS112
ASU_human_system_capstone
Results saved to Excel file: /content/drive/MyDrive/ECE496/ECE496-project/pymupdf_scores.xlsx

Overall CER and WER for 5 pairs: 0.1815, 0.1787
