In [None]:
# Pin the version that this notebook was run with so a fresh kernel reproduces it;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import re
import json
from PyPDF2 import PdfReader

# Institutional names to look for in the text. Each one is searched
# independently, so the short label "Consiglio" is also reported for every
# occurrence of the longer "Consiglio superiore della magistratura".
entities = [
    "Camera dei Deputati",
    "Senato della Repubblica",
    "Presidente della Repubblica",
    "Governo",
    "Regione",
    "Parlamento",
    "Consiglio superiore della magistratura",
    "Consiglio"
]

# Matches constitutional-law citations such as "l. cost. 9 febbraio 1963, n. 2".
# Whitespace is matched with \s+ / \s* instead of a single literal space so that
# citations written "n. 2" (space after "n.") and citations split across a line
# break by PDF text extraction are still recognised. Every string accepted by
# the previous, stricter pattern is still accepted.
legal_ref_pattern = r'l\.\s*cost\.\s+\d{1,2}\s+\w+\s+\d{4},\s*n\.\s*\d+'

def read_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The per-page texts are concatenated back to back, in page order, with no
    separator inserted between pages.
    """
    pdf = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

def find_entities(text, entity_list=None):
    """Locate every literal occurrence of each entity label in ``text``.

    Args:
        text: The string to search.
        entity_list: Optional iterable of labels to look for. When omitted,
            the module-level ``entities`` list is used.

    Returns:
        A list of dicts with keys ``"label"``, ``"span_start"`` and
        ``"span_end"`` (character offsets into ``text``, end exclusive).
        Results are grouped by label in the order the labels are given, and
        within a label in order of appearance. Labels are searched
        independently, so overlapping labels each produce their own entries.
    """
    labels = entities if entity_list is None else entity_list
    found_entities = []
    for label in labels:
        # An empty pattern would "match" at every position in the text.
        if not label:
            continue
        # Labels are plain phrases, not regexes: escape them so characters
        # such as "." or "(" are matched literally.
        for match in re.finditer(re.escape(label), text):
            found_entities.append({
                "label": label,
                "span_start": match.start(),
                "span_end": match.end()
            })
    return found_entities

def find_legal_references(text):
    """Collect every substring of ``text`` matched by ``legal_ref_pattern``.

    Each hit is returned as a dict holding the matched string under
    ``"reference"`` and its character offsets under ``"span_start"`` and
    ``"span_end"``, in order of appearance.
    """
    return [
        {
            "reference": hit.group(),
            "span_start": hit.start(),
            "span_end": hit.end(),
        }
        for hit in re.finditer(legal_ref_pattern, text)
    ]

def main(pdf_file="/content/Costituzione_ITALIANO.pdf", output_file="output.json"):
    """Extract entities and legal references from a PDF and save them as JSON.

    Args:
        pdf_file: Path of the PDF to analyse. Defaults to the path this
            notebook was originally written for.
        output_file: Path of the JSON file to write. Defaults to
            ``"output.json"`` in the current working directory.

    The JSON document has two top-level keys, ``"entities"`` and
    ``"legal_references"``, holding the lists returned by ``find_entities``
    and ``find_legal_references`` respectively.
    """
    # Read the PDF text once and run both extractors over the same string so
    # that all reported offsets refer to the same text.
    text = read_pdf(pdf_file)

    result = {
        "entities": find_entities(text),
        "legal_references": find_legal_references(text)
    }

    # The file is opened as UTF-8, so write non-ASCII characters as-is
    # instead of as \uXXXX escape sequences.
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    # Let the user know that the work is done
    print(f"Done! The results are saved in '{output_file}'.")

# Entry point: run the whole extraction pipeline when this code is executed
# directly (as opposed to being imported as a module).
if __name__ == "__main__":
    main()


Done! The results are saved in 'output.json'.
