In [1]:
from pdf_handler import PDFHandler

import fitz
import pandas as pd
import numpy as np
import os
import sqlite3
import time
import json
import requests
import re
import google.generativeai as genai
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv


  backends.update(_get_backends("networkx.backends"))
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def find_all_paths_and_names(folder_path):
    """Recursively list every file below ``folder_path``.

    Args:
        folder_path: Root directory to walk.

    Returns:
        A ``(all_paths, all_names)`` tuple of two parallel lists: the path of
        each file (``folder_path`` joined with its sub-directory and name) and
        the bare file name, in ``os.walk`` order. Both lists are empty when
        the walk yields no files.
    """
    found = [
        (os.path.join(current_dir, filename), filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]
    all_paths = [full_path for full_path, _ in found]
    all_names = [filename for _, filename in found]
    return all_paths, all_names

In [3]:
# Folder with the PDF files to process. Set the AUTOSLR_PAPERS_DIR environment
# variable to point somewhere else without editing the notebook; the default
# below is the original author's local path.
PAPERS_DIR = os.environ.get("AUTOSLR_PAPERS_DIR", "/home/pramos/Documents/AutoSLR/papers_pdf")
all_file_paths, all_file_names = find_all_paths_and_names(PAPERS_DIR)

In [4]:
# Validation table; later cells read its 'pdf_name' and 'section_type' columns
# to pick the regex pattern to apply to each PDF.
df = pd.read_csv("regex_validation.csv")

In [4]:
#1) generic regex for all pdfs
# Applies the "generic_section_title" pattern to the plain text of every file
# and stores the matches in results/generic_regex.db.
db_name = "results/generic_regex.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

PDFHandler.create_tables(conn)
start = time.time()
for path, name in zip(all_file_paths, all_file_names):
    doc = PDFHandler.try_open(path)
    if doc is None:
        print(f"Error opening the PDF: {name}")
        continue

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    text, page_count = PDFHandler.simple_extraction(doc)
    text = PDFHandler.default_pdf_cleaning(text)
    try:
        sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])

        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()

In [None]:
#2) tagged regex extraction for all pdfs
# Same generic pattern as step 1, but run on the tagged extraction; a match is
# kept only when it is bold or its font size is at least the document's
# `size_mode` value returned by the extraction.
db_name = "results/tagged_regex.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)
for path, name in zip(all_file_paths, all_file_names):

    doc = PDFHandler.try_open(path)
    if doc is None:
        # Report the failure like the sibling cells do instead of skipping silently.
        print(f"Error opening the PDF: {name}")
        continue

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    text, page_count, size_mode = PDFHandler.tagged_text_extraction(doc)
    text = PDFHandler.default_pdf_cleaning(text)

    try:
        sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])
        # Keep only candidates that look like headings: bold, or at least as
        # large as size_mode. Missing attributes count as "not a heading".
        sections = [
            section
            for section in sections
            if getattr(section, 'is_bold', False) or getattr(section, 'size', 0) >= size_mode
        ]

        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()

In [None]:
#3) specific regex extraction for all pdfs
# Picks the regex for each PDF from the 'section_type' column of the
# validation table (`df`) and stores the matches in results/specific_regex.db.
db_name = "results/specific_regex.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)

for path, name in zip(all_file_paths, all_file_names):
    doc = PDFHandler.try_open(path)
    if doc is None:
        print(f"Error opening the PDF: {name}")
        continue

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    text, page_count = PDFHandler.simple_extraction(doc)
    text = PDFHandler.default_pdf_cleaning(text)

    try:
        # The table is keyed by file name without extension; splitext avoids
        # assuming the extension is exactly four characters long.
        stem = os.path.splitext(name)[0]
        type_rows = df.loc[df['pdf_name'] == stem, 'section_type']
        if type_rows.empty:
            print(f"Error processing {name}: no section_type entry found in the validation table")
            continue
        section_type = type_rows.iloc[0]
        if section_type not in PDFHandler.regex_patterns:
            # No pattern registered for this section type; nothing to extract.
            continue
        sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns[section_type],  debug=True)

        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()

Found 10 matches:
Found 1 matches:
Found 11 matches:
Found 6 matches:
Found 12 matches:
Found 6 matches:
Found 6 matches:
Found 27 matches:
Found 9 matches:
Found 20 matches:
Found 5 matches:
Found 9 matches:
Found 15 matches:
Found 7 matches:
Found 7 matches:
Found 7 matches:
Found 62 matches:
Found 22 matches:
Found 4 matches:
Found 6 matches:
Found 6 matches:
Found 8 matches:
Found 10 matches:
Found 45 matches:
Found 10 matches:
Found 6 matches:
Found 6 matches:
Found 10 matches:
Found 5 matches:
Found 10 matches:
Found 8 matches:
Found 9 matches:
Found 13 matches:
Found 10 matches:
Found 8 matches:
Found 8 matches:
Found 12 matches:
Found 10 matches:
Found 12 matches:
Found 9 matches:
Found 5 matches:
Found 22 matches:
Found 5 matches:
Found 4 matches:
Found 6 matches:
Found 8 matches:
Found 6 matches:
Found 55 matches:
Found 36 matches:
Found 19 matches:
Found 16 matches:
Found 14 matches:
Found 9 matches:
Found 12 matches:
Found 6 matches:
Found 3 matches:
Found 26 matches:
Found

In [None]:
#4) voting policy for all pdfs
# Per-PDF regex (chosen through the validation table `df`) on the tagged
# extraction, filtered by PDFHandler.voting_policy before being stored in
# results/voting_policy.db.
db_name = "results/voting_policy.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
PDFHandler.create_tables(conn)

for path, name in zip(all_file_paths, all_file_names):
    doc = PDFHandler.try_open(path)
    if doc is None:
        print(f"Error opening the PDF: {name}")
        continue

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    text, page_count, size_mode = PDFHandler.tagged_text_extraction(doc)
    text = PDFHandler.default_pdf_cleaning(text)

    try:
        # The table is keyed by file name without extension; splitext avoids
        # assuming the extension is exactly four characters long.
        stem = os.path.splitext(name)[0]
        type_rows = df.loc[df['pdf_name'] == stem, 'section_type']
        if type_rows.empty:
            print(f"Error processing {name}: no section_type entry found in the validation table")
            continue
        section_type = type_rows.iloc[0]
        if section_type not in PDFHandler.regex_patterns:
            # No pattern registered for this section type; nothing to extract.
            continue
        sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns[section_type], debug=True)
        sections = PDFHandler.voting_policy(sections, size_mode)

        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()

In [5]:
def segment2prompt(sections: list) -> str:
    """Build the LLM prompt that asks which regex matches are real section titles.

    Args:
        sections: Candidate sections; each item must expose ``section_number``
            and ``section_title`` attributes.

    Returns:
        The prompt text: a fixed introduction, one ``(index) number. title``
        line per candidate (indices start at 0), then the evaluation criteria
        and the JSON answer format the model is asked to follow.
    """
    header = """Analyze the following potential section titles extracted by regex pattern matching. Some may be actual section headers while others could be false positives (table entries, references, footnotes, etc.).

Please identify which ones are most likely to be legitimate section titles for an academic paper or document:

"""

    # One numbered line per candidate; the index is what the model echoes back.
    candidate_lines = [
        f"({position}) {candidate.section_number}. {candidate.section_title}\n"
        for position, candidate in enumerate(sections)
    ]

    footer = """
            EVALUATION CRITERIA:
            - Look for typical section patterns (Introduction, Methods, Results, Discussion, Conclusion, etc.)
            - Consider formatting consistency and logical flow
            - Exclude obvious false positives like:
            - Table captions or figure titles
            - Reference entries or citations
            - Page headers/footers
            - Numbered lists within paragraphs
            - Partial sentences or fragments

            Please respond with a JSON object containing the indices of legitimate section titles:

            {
            "selected_sections": [0, 2, 5, 8]
            }

            Response:"""

    return header + "".join(candidate_lines) + footer

In [6]:
def extract_selected_sections(response_text: str) -> list:
    """Extract the ``selected_sections`` index list from an LLM reply.

    The reply may wrap the JSON object in free text, so every
    ``{... "selected_sections": [...] ...}`` fragment is tried in order until
    one parses as JSON and holds a list under that key.

    Args:
        response_text: Raw text returned by the model (``None`` or an empty
            string is treated as "no answer").

    Returns:
        The non-negative integer indices found in the first usable fragment.
        Entries that are not plain integers (strings, floats, booleans) and
        negative numbers are dropped, because callers use the values to index
        a list. Returns ``[]`` when nothing usable is found.
    """
    # Look for a JSON object containing the "selected_sections" array.
    json_pattern = r'\{[^}]*"selected_sections"[^}]*\[[^\]]*\][^}]*\}'

    for candidate in re.finditer(json_pattern, response_text or ""):
        try:
            payload = json.loads(candidate.group())
        except json.JSONDecodeError:
            # Malformed fragment (e.g. trailing comma); try the next one.
            continue

        selected = payload.get("selected_sections", [])
        if not isinstance(selected, list):
            continue

        # bool is a subclass of int, so it has to be excluded explicitly.
        return [
            index
            for index in selected
            if isinstance(index, int) and not isinstance(index, bool) and index >= 0
        ]

    return []

In [7]:
OLLAMA_URL = "http://localhost:11434"

def ask_llm(prompt, context: "list[str] | None" = None, model="deepseek-r1:1.5b", timeout=10):
    """Send a prompt to the local Ollama server and return the generated text.

    Args:
        prompt: Prompt text, or a list of items that are joined with newlines.
        context: Optional extra text (string or list of items joined with
            newlines) placed before the prompt. Omitted or empty means the
            prompt is sent unchanged.
        model: Name of the Ollama model to query.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The concatenation of every ``response`` field found in the
        newline-delimited JSON body. On failure a string starting with
        ``"Error:"`` is returned instead of raising, so callers can test
        ``startswith("Error:")``.
    """
    if isinstance(prompt, list):
        prompt = "\n".join(str(item) for item in prompt)

    # None replaces the former mutable default ([]); both mean "no context".
    if isinstance(context, list):
        context = "\n".join(str(item) for item in context)

    # Previously the context was built and then dropped; prepend it so that a
    # caller who supplies one actually gets it in front of the model.
    if context:
        prompt = f"{context}\n\n{prompt}"

    data = {
        "model": model,
        "prompt": prompt
    }

    try:
        response = requests.post(
            f'{OLLAMA_URL}/api/generate',
            json=data,
            timeout=timeout,
            stream=False
        )
        response.raise_for_status()

        # The body is one JSON object per line; stitch the text chunks together.
        chunks = []
        for line in response.text.splitlines():
            if not line.strip():
                continue
            try:
                json_response = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(json_response, dict):
                continue
            if 'error' in json_response:
                # Surface an error object found in the body instead of
                # returning a silently truncated answer.
                return f"Error: {json_response['error']}"
            if 'response' in json_response:
                chunks.append(json_response['response'])

        return "".join(chunks)

    except requests.exceptions.ConnectionError:
        return "Error: Cannot connect to Ollama server"
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"

In [8]:
#5) Local llms to analyze the generic regex
# Runs the generic regex, then asks a local Ollama model which of the matches
# are real section titles and stores only those.

model = "llama3:8b"
db_name = f"results/local_llms{model}.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

PDFHandler.create_tables(conn)
start = time.time()
for path, name in zip(all_file_paths, all_file_names):
    doc = PDFHandler.try_open(path)
    if doc is None:
        print(f"Error opening the PDF: {name}")
        continue

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    text, page_count = PDFHandler.simple_extraction(doc)
    text = PDFHandler.default_pdf_cleaning(text)
    try:
        sections = PDFHandler.find_pattern_in_text(text, PDFHandler.regex_patterns["generic_section_title"])
        prompt = segment2prompt(sections)
        # Query the same model the database file is named after.
        answer = ask_llm(prompt, model=model)
        if answer.startswith("Error:") or answer == "":
            print(f"LLM error for {name}: {answer}")
            continue
        position_list = extract_selected_sections(answer)
        if not position_list:
            print(f"No valid sections found for {name} using LLM.")
            continue
        # Only trust in-range integer positions: a negative value would index
        # from the end of the list and a non-integer cannot be compared.
        sections = [
            sections[i]
            for i in position_list
            if isinstance(i, int) and not isinstance(i, bool) and 0 <= i < len(sections)
        ]
        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()

No valid sections found for Arcaini2020.pdf using LLM.
No valid sections found for Švogor2019.pdf using LLM.
No valid sections found for Ha2019-icse.pdf using LLM.
No valid sections found for Liu2022.pdf using LLM.
No valid sections found for Temple2021.pdf using LLM.
No valid sections found for Weber2021.pdf using LLM.
No valid sections found for schmid2022.pdf using LLM.
No valid sections found for hugo2021-tse.pdf using LLM.
No valid sections found for shaghayegh2022-splc.pdf using LLM.
No valid sections found for liang2024cc.pdf using LLM.


In [None]:
def upload_pdf_to_gemini(pdf_path, poll_interval=2, max_wait=300):
    """Upload a PDF to Gemini and wait until it has been processed.

    Args:
        pdf_path: Path of the PDF file to upload.
        poll_interval: Seconds to sleep between two state checks.
        max_wait: Maximum number of seconds to keep polling while the file is
            in the ``PROCESSING`` state; ``None`` waits indefinitely.

    Returns:
        The uploaded file object once it has left the ``PROCESSING`` state, or
        ``None`` when the upload fails, the file ends in the ``FAILED`` state
        or the wait limit is exceeded (the error is printed, not raised).
    """
    try:
        # Upload the file to Gemini.
        uploaded_file = genai.upload_file(pdf_path)
        print(f"Arquivo enviado: {uploaded_file.name}")

        # Wait for the file to be processed, but never spin forever on a
        # file that stays stuck in PROCESSING.
        deadline = None if max_wait is None else time.monotonic() + max_wait
        while uploaded_file.state.name == "PROCESSING":
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError("Tempo limite excedido no processamento do arquivo")
            print("Processando arquivo...")
            time.sleep(poll_interval)
            uploaded_file = genai.get_file(uploaded_file.name)

        if uploaded_file.state.name == "FAILED":
            raise ValueError("Falha no processamento do arquivo")

        return uploaded_file
    except Exception as e:
        print(f"Erro no upload do PDF: {e}")
        return None

In [None]:

#6) global llms
# Sends every PDF to Gemini with PDFHandler.llm_prompt, parses the JSON object
# in the reply and stores the sections it lists in results/extern_llm-gemini.db.
db_name = "results/extern_llm-gemini.db"
model_names = ["gemini-2.5-flash-preview-05-20", "gemini-2.0-flash", "gemini-2.0-flash-lite"]
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()
load_dotenv()
api_key = os.getenv("GEMINI_API")

start = time.time()
genai.configure(api_key=api_key)

# Only the first entry of model_names is queried.
model = genai.GenerativeModel(model_names[0])
PDFHandler.create_tables(conn)

prompt = PDFHandler.llm_prompt

for path, name in zip(all_file_paths, all_file_names):
    # Check that the file opens locally before spending an upload on it.
    doc = PDFHandler.try_open(path)
    if doc is None:
        print(f"Error opening the PDF: {name}")
        continue

    upload_pdf = upload_pdf_to_gemini(path)
    if upload_pdf is None:
        print(f"Error uploading PDF: {name}")
        continue

    try:
        text, page_count = PDFHandler.simple_extraction(doc)
        response = model.generate_content([prompt, upload_pdf])
        response_text = response.text

        # The reply may wrap the JSON in prose or a code fence, so take the
        # span from the first "{" to the last "}". A non-greedy pattern would
        # stop at the first "}" and cut nested objects short.
        match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if not match:
            print(f"No valid JSON sections found in response for {name}.")
            continue

        try:
            retrived_json = json.loads(match.group())
        except json.JSONDecodeError as e:
            print(f"Could not load json for {name}: {e}")
            continue

        section_arr = []
        for section in retrived_json.get("sections", []):
            # find_pattern_in_text returns a list of matches elsewhere in this
            # notebook, so extend to keep section_arr flat.
            # NOTE(review): `section` is passed as the pattern exactly as the
            # model returned it; confirm PDFHandler.llm_prompt asks for
            # regex-safe strings.
            section_arr.extend(PDFHandler.find_pattern_in_text(text, section))

        pdf_name = os.path.basename(path)

        # Reuse the row of an already-registered PDF, otherwise register it.
        cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
        row = cursor.fetchone()
        if row:
            # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an
            # int, exactly like the lastrowid branch below.
            pdf_id = row[0]
        else:
            cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
            pdf_id = cursor.lastrowid

        PDFHandler.insert_section_into_sqlite(conn, section_arr, pdf_id)
    except Exception as e:
        print(f"Error processing {name} with Gemini: {e}")
        continue

# Persist any rows still pending.
conn.commit()

2 + 2 = 4


In [None]:
#7) ORC extraction
# Extracts sections with PDFHandler.orc_extraction_html, drops entries whose
# title is blank and stores the rest in results/ORC_tag_extraction.db.
db_name = "results/ORC_tag_extraction.db"
os.makedirs(os.path.dirname(db_name), exist_ok=True)  # sqlite3 does not create missing folders
conn = sqlite3.connect(db_name)
cursor = conn.cursor()

PDFHandler.create_tables(conn)
start = time.time()
for path, name in zip(all_file_paths, all_file_names):

    pdf_name = os.path.basename(path)

    # Reuse the row of an already-registered PDF, otherwise register it.
    cursor.execute("SELECT id FROM pdfs WHERE pdf_name = ?", (pdf_name,))
    row = cursor.fetchone()
    if row:
        # fetchone() yields a 1-tuple; unwrap it so pdf_id is always an int,
        # exactly like the lastrowid branch below.
        pdf_id = row[0]
    else:
        cursor.execute("INSERT INTO pdfs (pdf_name) VALUES (?)", (pdf_name,))
        pdf_id = cursor.lastrowid

    try:
        sections = PDFHandler.orc_extraction_html(path)
        if not sections:
            print(f"No sections found for {name} using ORC extraction.")
            continue
        # Discard entries whose title is empty or whitespace only.
        sections = [section for section in sections if section.section_title.strip() != ""]

        PDFHandler.insert_section_into_sqlite(conn, sections, pdf_id)

    except Exception as e:
        print(f"Error processing {name}: {e}")

# Persist any rows still pending (e.g. PDFs registered but skipped afterwards).
conn.commit()