In [1]:
import csv
import json
import os
from tkinter import Tk, filedialog

import PyPDF2
import spacy
from spacy import displacy

# Load the larger pre-trained spaCy model ("en_core_web_lg") once at module level.
# recognize_entities() and display_entities() below both call this shared `nlp`
# object, so the load happens a single time rather than on every call.
nlp = spacy.load("en_core_web_lg")



In [2]:
# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's ``extract_text()`` result joined
        with no separator between pages.
    """
    with open(file_path, "rb") as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        # Pull the text while the stream is still open: the reader is built
        # on top of the open file handle.
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    return "".join(page_texts)


In [3]:

# Function for named entity recognition
def recognize_entities(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its entities.

    Args:
        text: The text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents``, in the order the pipeline reports them.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]


In [4]:

# Function to display named entities and save to a file
def display_entities(text, output_file):
    """Render the entities found in ``text`` with displaCy and save the markup.

    Args:
        text: The text to run through the module-level ``nlp`` pipeline.
        output_file: Path of the file to write; it is overwritten and
            encoded as UTF-8.
    """
    # jupyter=False: the value returned by render() is what gets written to
    # disk below, rather than relying on inline notebook display.
    markup = displacy.render(nlp(text), style='ent', jupyter=False)
    with open(output_file, 'w', encoding='utf-8') as html_file:
        html_file.write(markup)


In [5]:

# Function to save entities as JSON
def save_entities_as_json(entities, output_file):
    """Serialise the recognised entities to a JSON file.

    Args:
        entities: JSON-serialisable collection of entities, e.g. the list of
            ``(text, label)`` tuples returned by ``recognize_entities``
            (tuples are written as JSON arrays).
        output_file: Path of the file to write; it is overwritten.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding and consistent with display_entities().
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(entities, file)


In [6]:

# Function to save entities as CSV
def save_entities_as_csv(entities, output_file):
    """Write the recognised entities to a CSV file with a header row.

    Args:
        entities: Iterable of two-item rows ``(entity_text, entity_label)``,
            e.g. the list returned by ``recognize_entities``.
        output_file: Path of the file to write; it is overwritten.
    """
    # newline='' lets the csv module control line endings; explicit UTF-8
    # avoids UnicodeEncodeError for non-ASCII entity text on platforms whose
    # default encoding is not UTF-8.
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Entity', 'Label'])
        writer.writerows(entities)


In [None]:

def _save_entities(pdf_text, entities, output_file, output_format):
    """Write the NER result in the requested format and return the path written.

    Args:
        pdf_text: Raw text of the PDF (needed for the HTML rendering).
        entities: Entities recognised in ``pdf_text`` (used for JSON and CSV).
        output_file: Output path without extension.
        output_format: One of ``'html'``, ``'json'`` or ``'csv'``.

    Returns:
        The full output path, i.e. ``output_file`` plus the format extension.

    Raises:
        ValueError: If ``output_format`` is not a supported format.
    """
    output_path = f"{output_file}.{output_format}"
    if output_format == 'html':
        display_entities(pdf_text, output_path)
    elif output_format == 'json':
        save_entities_as_json(entities, output_path)
    elif output_format == 'csv':
        save_entities_as_csv(entities, output_path)
    else:
        # Fail loudly instead of silently producing no output.
        raise ValueError(f"Unsupported output format: {output_format!r}")
    return output_path


# Create the Tkinter root window and hide it so only the dialog is shown
root = Tk()
root.withdraw()

try:
    # Show a file dialog to select the input PDF file.
    # NOTE(review): askopenfilename only lets the user pick a file, so the
    # directory (batch) branch below is not reachable through this dialog;
    # consider filedialog.askdirectory if batch mode is wanted.
    file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])

    if file_path:
        # Provide the output format: 'html', 'json' or 'csv'
        output_format = "html"

        if os.path.isfile(file_path):
            # Single file processing
            try:
                # Extract text from the PDF
                pdf_text = extract_text_from_pdf(file_path)

                # Perform named entity recognition on the extracted text
                entities = recognize_entities(pdf_text)
                print(entities)

                # Output the recognized entities based on the specified format
                output_file = 'entities_output'
                saved_path = _save_entities(pdf_text, entities, output_file, output_format)
                print(f"Named entities saved to {saved_path}")
            except Exception as e:
                # Report what went wrong, not just that something did.
                print(f"An error occurred while processing the file: {file_path} ({type(e).__name__}: {e})")
        elif os.path.isdir(file_path):
            # Batch processing for a directory
            try:
                for file_name in os.listdir(file_path):
                    file = os.path.join(file_path, file_name)
                    # Case-insensitive so that e.g. "REPORT.PDF" is not skipped.
                    if file_name.lower().endswith(".pdf") and os.path.isfile(file):
                        try:
                            # Extract text from the PDF
                            pdf_text = extract_text_from_pdf(file)

                            # Perform named entity recognition on the extracted text
                            entities = recognize_entities(pdf_text)

                            # Output the recognized entities based on the specified format
                            output_file = os.path.splitext(file_name)[0] + '_entities_output'
                            saved_path = _save_entities(pdf_text, entities, output_file, output_format)
                            print(f"Named entities for {file_name} saved to {saved_path}")
                        except Exception as e:
                            # One bad PDF must not stop the rest of the batch.
                            print(f"An error occurred while processing the file: {file_name} ({type(e).__name__}: {e})")
                    else:
                        print(f"Skipping {file_name} as it is not a valid PDF file.")
            except Exception as e:
                print(f"An error occurred while processing the directory: {file_path} ({type(e).__name__}: {e})")
        else:
            print("Invalid input. Please select a valid PDF file or directory.")
    else:
        print("No file or directory selected.")
finally:
    # Close the Tkinter root window even if something above raised
    root.destroy()
