In [None]:
import csv
import re
from paddleocr import PaddleOCR
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from tensorflow.python.framework.test_util import use_gpu

# Initialize PaddleOCR: English models, text-angle classifier enabled, GPU inference.
ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True)

# Input PDF and the window of pages to OCR.
pdf_path = r"Primer.pdf"
START_PAGE = 30  # 0-based index of the first page to process
PAGE_NUM = 846  # Exclusive 0-based upper bound; clamped to the real page count below

# Output locations for the exercise CSV and the full-text dump.
csv_path = r"extracted_exercises.csv"
text_path = r"extracted_content.txt"

# NOTE: the CSV is rewritten on every run, while the text dump is opened in
# append mode, so re-running this cell appends another copy of the pages to it.
with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file, \
     open(text_path, mode='a', encoding='utf-8') as text_file:

    writer = csv.writer(csv_file)
    writer.writerow(["Exercise Number", "Question Text"])

    # Matches an OCR line starting with e.g. "Exercise 2.1" (optional colon).
    # Group 1 is the exercise label, group 2 is the remainder of that line.
    exercise_pattern = re.compile(r"^(Exercise\s+\d+\.\d+):?\s*(.*)")

    # Process each page for OCR
    with fitz.open(pdf_path) as pdf:
        # Clamp the upper bound so a PDF shorter than PAGE_NUM does not raise
        # IndexError when indexing pdf[pg_num].
        last_page = min(PAGE_NUM, pdf.page_count)

        for pg_num in range(START_PAGE, last_page):
            page = pdf[pg_num]

            # Render at 2x zoom on both axes to give the OCR sharper glyphs
            # (PyMuPDF's base resolution is 72 dpi, so this is about 144 dpi).
            mat = fitz.Matrix(2.0, 2.0)
            pm = page.get_pixmap(matrix=mat, alpha=False)

            # Convert the RGB pixmap to a numpy array for PaddleOCR.
            img = np.array(Image.frombytes("RGB", [pm.width, pm.height], pm.samples))

            # OCR on the image
            result = ocr.ocr(img, cls=True)
            print(f"[DEBUG] OCR result for page {pg_num + 1}:", result)  # Debug line

            # The result holds one item per input image, and that item is None
            # (not an empty list) when nothing was detected, so `not result`
            # alone lets [None] through and the inner loop would crash.
            entries = [entry for line in (result or []) if line for entry in line]
            if not entries:
                print(f"[DEBUG] No text found on page {pg_num + 1}")
                continue

            # Append page header to the text file
            text_file.write(f"Page {pg_num + 1}:\n")

            # Each entry is [box, (text, confidence)]; only the text is used.
            for entry in entries:
                text = entry[1][0].strip()
                text_file.write(text + "\n")  # Every OCR line goes to the text dump
                print(f"[DEBUG] Text added to txt file: {text}")  # Debug line

                # Lines that start with an exercise label also go to the CSV.
                match = exercise_pattern.match(text)
                if match:
                    exercise_num = match.group(1)  # e.g. "Exercise 2.1"
                    question_text = match.group(2)  # Rest of the same OCR line
                    writer.writerow([exercise_num, question_text])
                    print(f"[DEBUG] Writing exercise to CSV: {exercise_num} - {question_text}")  # Debug line

            text_file.write("\n")  # Blank line between pages in the text file

print("Exercises dataset created successfully at:", csv_path)
print("Full text content saved successfully at:", text_path)

[2024/11/12 13:29:02] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\rathe/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\rathe/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6

In [2]:
import paddle
# is_compiled_with_cuda() only reports whether this Paddle build has CUDA
# support; it is True even on a machine with no usable GPU. Also require at
# least one visible CUDA device before reporting a GPU as available.
gpu_available = (
    paddle.device.is_compiled_with_cuda()
    and paddle.device.cuda.device_count() > 0
)
# Device Paddle is currently set to run on (e.g. "gpu:0" or "cpu").
name = paddle.device.get_device()
print("GPU available:", gpu_available)
print(name)

GPU available: True
gpu:0


In [None]:
import csv
import re
from paddleocr import PaddleOCR
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from tensorflow.python.framework.test_util import use_gpu

# Initialize PaddleOCR: English models, text-angle classifier enabled, GPU inference.
ocr = PaddleOCR(use_angle_cls=True, lang="en", use_gpu=True)

# Input PDF and the window of pages to OCR.
pdf_path = r"Primer.pdf"
START_PAGE = 30  # 0-based index of the first page to process
PAGE_NUM = 846  # Exclusive 0-based upper bound; clamped to the real page count below

# Output locations for the exercise CSV and the full-text dump.
csv_path = r"extracted_exercises.csv"
text_path = r"extracted_content.txt"

# NOTE: the CSV is rewritten on every run, while the text dump is opened in
# append mode, so re-running this cell appends another copy of the pages to it.
with open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file, \
     open(text_path, mode='a', encoding='utf-8') as text_file:

    writer = csv.writer(csv_file)
    writer.writerow(["Exercise Number", "Question Text"])

    # Matches an OCR line starting with e.g. "Exercise 2.1" (optional colon).
    # Group 1 is the exercise label, group 2 is the remainder of that line.
    exercise_pattern = re.compile(r"^(Exercise\s+\d+\.\d+):?\s*(.*)")

    # Process each page for OCR
    with fitz.open(pdf_path) as pdf:
        # Clamp the upper bound so a PDF shorter than PAGE_NUM does not raise
        # IndexError when indexing pdf[pg_num].
        last_page = min(PAGE_NUM, pdf.page_count)

        for pg_num in range(START_PAGE, last_page):
            page = pdf[pg_num]

            # Render at 2x zoom on both axes to give the OCR sharper glyphs
            # (PyMuPDF's base resolution is 72 dpi, so this is about 144 dpi).
            mat = fitz.Matrix(2.0, 2.0)
            pm = page.get_pixmap(matrix=mat, alpha=False)

            # Convert the RGB pixmap to a numpy array for PaddleOCR.
            img = np.array(Image.frombytes("RGB", [pm.width, pm.height], pm.samples))

            # Best-effort OCR: a failure on one page is logged and skipped so
            # the rest of the document is still processed.
            try:
                result = ocr.ocr(img, cls=True)
            except Exception as e:
                print(f"[ERROR] OCR failed on page {pg_num + 1}: {e}")
                continue  # Skip to the next page if OCR fails

            # The result holds one item per input image, and that item is None
            # (not an empty list) when nothing was detected, so `not result`
            # alone lets [None] through and the inner loop would crash.
            entries = [entry for line in (result or []) if line for entry in line]
            if not entries:
                print(f"[DEBUG] No text found on page {pg_num + 1}")
                continue
            print(f"[DEBUG] OCR result for page {pg_num + 1}:", result)

            # Append page header to the text file
            text_file.write(f"Page {pg_num + 1}:\n")

            # Each entry is [box, (text, confidence)]; only the text is used.
            for entry in entries:
                text = entry[1][0].strip()
                text_file.write(text + "\n")  # Every OCR line goes to the text dump
                text_file.flush()  # Keep the dump current if the run is interrupted
                print(f"[DEBUG] Text added to txt file: {text}")  # Debug line

                # Lines that start with an exercise label also go to the CSV.
                match = exercise_pattern.match(text)
                if match:
                    exercise_num = match.group(1)  # e.g. "Exercise 2.1"
                    question_text = match.group(2)  # Rest of the same OCR line
                    writer.writerow([exercise_num, question_text])
                    csv_file.flush()  # Keep the CSV current if the run is interrupted
                    print(f"[DEBUG] Writing exercise to CSV: {exercise_num} - {question_text}")  # Debug line

            text_file.write("\n")  # Blank line between pages in the text file
            text_file.flush()

print("Exercises dataset created successfully at:", csv_path)
print("Full text content saved successfully at:", text_path)


[2024/11/12 13:48:22] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\rathe/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\rathe/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6