In [1]:
from pdf_handler import PDFHandler

import fitz
import pandas as pd
import numpy as np
import os
import sqlite3
import time


In [2]:
def find_all_paths_and_names(folder_path):
    """Recursively list every file found under ``folder_path``.

    Args:
        folder_path: Directory to walk (sub-directories are included).

    Returns:
        A ``(all_paths, all_names)`` tuple of two parallel lists: the joined
        path of each file and its bare file name, in ``os.walk`` order.
    """
    # Gather (joined path, file name) pairs in a single pass over the tree.
    found = [
        (os.path.join(current_dir, filename), filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]
    all_paths = [joined for joined, _ in found]
    all_names = [bare for _, bare in found]
    return all_paths, all_names

In [3]:
# Folder that holds the PDF files to process -- change this to your own path.
pdf_folder = "/home/PUC/Documentos/AutoSLR/papers_pdf"
all_file_paths, all_file_names = find_all_paths_and_names(pdf_folder)

In [None]:
#1) generic regex for all pdfs
# For every file found above: open it, make sure it has a row in the `pdfs`
# table, extract and clean its text, match the generic section-title pattern
# and store the resulting sections in generic_regex.db.
db_name = "generic_regex.db"
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

PDFHandler.create_tables(conn)
start = time.time()  # NOTE(review): never read in this cell -- confirm whether timing output is still wanted
try:
    for path, name in zip(all_file_paths, all_file_names):
        doc = PDFHandler.try_open(path)
        if doc is None:
            print(f"Error opening the PDF: {name}")
            continue

        pdf_name = os.path.basename(path)

        # fetchone() returns a row tuple like (7,) or None. Unpack the id so
        # pdf_id is always a plain integer, exactly as it is on the
        # lastrowid branch (previously a re-run passed the tuple downstream).
        cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
        row = cursor.fetchone()
        if row:
            pdf_id = row[0]
        else:
            cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
            pdf_id = cursor.lastrowid

        # Only the text is used here. Star-unpacking accepts however many
        # extra values simple_extraction returns (this notebook unpacks it
        # as both a 2-tuple and a 3-tuple in different cells).
        text, *_ = PDFHandler.simple_extraction(doc)
        text = PDFHandler.default_pdf_cleaning(text)
        try:
            sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])

            PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

        except Exception as e:
            # Best effort: report the failing PDF and keep going.
            print(f"Error processing {name}: {e}")

    # Persist any `pdfs` rows inserted above that were not committed yet.
    conn.commit()
finally:
    # Release the database file; the following cells open their own connection.
    conn.close()

In [None]:
#2) tagged regex extraction for all pdfs
# Same pipeline as the generic run, but the text comes from
# tagged_text_extraction and matches are kept only when they are bold or at
# least as large as size_mode. Results go to tagged_regex.db.
db_name = "tagged_regex.db"
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)
try:
    for path, name in zip(all_file_paths, all_file_names):

        doc = PDFHandler.try_open(path)
        if doc is None:
            # Files that cannot be opened are skipped silently in this run.
            continue

        pdf_name = os.path.basename(path)

        # fetchone() returns a row tuple like (7,) or None. Unpack the id so
        # pdf_id is always a plain integer, exactly as it is on the
        # lastrowid branch (previously a re-run passed the tuple downstream).
        cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
        row = cursor.fetchone()
        if row:
            pdf_id = row[0]
        else:
            cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
            pdf_id = cursor.lastrowid

        text, page_count, size_mode = PDFHandler.tagged_text_extraction(doc)
        text = PDFHandler.default_pdf_cleaning(text)

        try:
            sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])
            # Keep a match if it is flagged bold, or if its size reaches
            # size_mode; matches lacking those attributes fall back to
            # False / 0.
            sections = [
                section
                for section in sections
                if getattr(section, 'is_bold', False) or getattr(section, 'size', 0) >= size_mode
            ]

            PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

        except Exception as e:
            # Best effort: report the failing PDF and keep going.
            print(f"Error processing {name}: {e}")

    # Persist any `pdfs` rows inserted above that were not committed yet.
    conn.commit()
finally:
    # Release the database file; the following cells open their own connection.
    conn.close()

In [6]:
#3) specific regex extraction for all pdfs
# Each PDF is matched with the regex named in the `section_type` column of
# regex_validation.csv (looked up by the file name without its extension).
# Results go to specific_regex.db.
db_name = "specific_regex.db"
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)
df = pd.read_csv("regex_validation.csv")
try:
    for path, name in zip(all_file_paths, all_file_names):
        doc = PDFHandler.try_open(path)
        if doc is None:
            print(f"Error opening the PDF: {name}")
            continue

        pdf_name = os.path.basename(path)

        # fetchone() returns a row tuple like (7,) or None. Unpack the id so
        # pdf_id is always a plain integer, exactly as it is on the
        # lastrowid branch (previously a re-run passed the tuple downstream).
        cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
        row = cursor.fetchone()
        if row:
            pdf_id = row[0]
        else:
            cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
            pdf_id = cursor.lastrowid

        # Only the text is used here. Star-unpacking accepts however many
        # extra values simple_extraction returns (this notebook unpacks it
        # as both a 2-tuple and a 3-tuple in different cells).
        text, *_ = PDFHandler.simple_extraction(doc)
        text = PDFHandler.default_pdf_cleaning(text)

        try:
            # Strip the real extension instead of assuming it is exactly
            # four characters long.
            pdf_stem = os.path.splitext(name)[0]
            matching_types = df.loc[df['pdf_name'] == pdf_stem, 'section_type']
            if matching_types.empty:
                # Report the missing CSV entry explicitly instead of
                # surfacing an opaque IndexError message.
                print(f"Error processing {name}: no section_type entry in regex_validation.csv")
                continue
            section_type = matching_types.iloc[0]

            if section_type not in PDFHandler.regex_patterns:
                # No regex is registered under this section type; skip the PDF.
                continue
            # NOTE(review): debug=True is presumably what prints the
            # "Found N matches:" lines for every PDF -- confirm it is still wanted.
            sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns[section_type],  debug=True)

            PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

        except Exception as e:
            # Best effort: report the failing PDF and keep going.
            print(f"Error processing {name}: {e}")

    # Persist any `pdfs` rows inserted above that were not committed yet.
    conn.commit()
finally:
    # Release the database file; the following cells open their own connection.
    conn.close()

Found 10 matches:
Found 1 matches:
Found 11 matches:
Found 6 matches:
Found 12 matches:
Found 6 matches:
Found 6 matches:
Found 27 matches:
Found 9 matches:
Found 20 matches:
Found 5 matches:
Found 9 matches:
Found 15 matches:
Found 7 matches:
Found 7 matches:
Found 7 matches:
Found 62 matches:
Found 22 matches:
Found 4 matches:
Found 6 matches:
Found 6 matches:
Found 8 matches:
Found 10 matches:
Found 45 matches:
Found 10 matches:
Found 6 matches:
Found 6 matches:
Found 10 matches:
Found 5 matches:
Found 10 matches:
Found 8 matches:
Found 9 matches:
Found 13 matches:
Found 10 matches:
Found 8 matches:
Found 8 matches:
Found 12 matches:
Found 10 matches:
Found 12 matches:
Found 9 matches:
Found 5 matches:
Found 22 matches:
Found 5 matches:
Found 4 matches:
Found 6 matches:
Found 8 matches:
Found 6 matches:
Found 55 matches:
Found 36 matches:
Found 19 matches:
Found 16 matches:
Found 14 matches:
Found 9 matches:
Found 12 matches:
Found 6 matches:
Found 3 matches:
Found 26 matches:
Found

In [None]:
#4) voting policy for all pdfs
# Extract tagged text from every PDF, match the generic section-title
# pattern, keep matches that are bold or at least as large as size_mode, and
# store them in voting_policy.db.
# NOTE(review): no voting logic is visible in this cell -- the extraction and
# filter steps are the same as in the tagged-regex run. Confirm whether the
# voting policy is still to be implemented.
db_name = "voting_policy.db"
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)

try:
    for path, name in zip(all_file_paths, all_file_names):
        doc = PDFHandler.try_open(path)
        if doc is None:
            print(f"Error opening the PDF: {name}")
            continue

        pdf_name = os.path.basename(path)

        # fetchone() returns a row tuple like (7,) or None. Unpack the id so
        # pdf_id is always a plain integer, exactly as it is on the
        # lastrowid branch (previously a re-run passed the tuple downstream).
        cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
        row = cursor.fetchone()
        if row:
            pdf_id = row[0]
        else:
            cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
            pdf_id = cursor.lastrowid

        text, page_count, size_mode = PDFHandler.tagged_text_extraction(doc)
        text = PDFHandler.default_pdf_cleaning(text)

        try:
            sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])
            # Keep a match if it is flagged bold, or if its size reaches
            # size_mode; matches lacking those attributes fall back to
            # False / 0.
            sections = [
                section
                for section in sections
                if getattr(section, 'is_bold', False) or getattr(section, 'size', 0) >= size_mode
            ]

            PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

        except Exception as e:
            # Best effort: report the failing PDF and keep going.
            print(f"Error processing {name}: {e}")

    # Persist any `pdfs` rows inserted above that were not committed yet.
    conn.commit()
finally:
    # Release the database file once this run is finished.
    conn.close()