In [None]:
import os
import fitz  # PyMuPDF for PDF processing
import pytesseract  # Tesseract for OCR
import textract  # Extract text from various document formats
import spacy  # NLP library
from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer
from spacy import displacy
import re

In [None]:
# Load spaCy's "en_core_web_sm" pipeline once at module level.
# nlp_processing() below calls this shared `nlp` object, so this cell must
# run before any document is processed.
# NOTE(review): presumably requires the en_core_web_sm model package to be
# installed in the kernel's environment — confirm in the setup instructions.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define a function to extract text from various document formats
def extract_text_from_document(file_path):
    """Return the text content of the document at ``file_path``.

    The extraction backend is chosen from the file extension, compared
    case-insensitively so that e.g. ``REPORT.PDF`` is treated as a PDF:

    * ``.pdf`` -> PyMuPDF (``fitz``), concatenating the text of every page.
    * ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.xlsx``, ``.xls`` ->
      ``textract``, decoded as UTF-8.
    * anything else -> handed to Tesseract OCR via ``pytesseract``.

    Args:
        file_path: Path to the document, as a string.

    Returns:
        The extracted text as a ``str``.
    """
    office_extensions = (".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls")
    lowered_path = file_path.lower()

    # Extract text from PDFs using PyMuPDF
    if lowered_path.endswith(".pdf"):
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with fitz.open(file_path) as pdf_document:
            return "".join(page.get_text() for page in pdf_document)

    # Extract text from common document formats using textract
    if lowered_path.endswith(office_extensions):
        return textract.process(file_path).decode("utf-8")

    # If none of the above formats, try using Tesseract for OCR
    return pytesseract.image_to_string(file_path)

In [None]:
# Run the spaCy pipeline over extracted text and collect entities and keywords
def nlp_processing(text):
    """Analyse ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to analyse.

    Returns:
        A dict with two keys:
        ``"entities"`` - one ``{"text", "label"}`` dict per entity in
        ``doc.ents``;
        ``"keywords"`` - the text of every token that is not a stop word,
        punctuation, or whitespace, in document order.
    """
    parsed = nlp(text)

    # Named entities as reported by the pipeline
    found_entities = [
        {"text": span.text, "label": span.label_} for span in parsed.ents
    ]

    # Keywords: every token that survives the stop-word/punctuation/space filter
    found_keywords = [
        tok.text
        for tok in parsed
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]

    # Further NLP tasks (e.g. sentiment analysis, summarization) can be added here

    return {"entities": found_entities, "keywords": found_keywords}

In [None]:
# Extract a document's text and run the NLP step on it
def process_document(file_path):
    """Extract the text of ``file_path`` and analyse it.

    Args:
        file_path: Path of the document, passed to
            ``extract_text_from_document``.

    Returns:
        A dict with ``"text"`` (the extracted text) and ``"nlp_result"``
        (the value returned by ``nlp_processing`` for that text).
    """
    document_text = extract_text_from_document(file_path)
    return {
        "text": document_text,
        "nlp_result": nlp_processing(document_text),
    }

In [None]:
# Example usage:
def write_results_to_file(output_path, result):
    """Write a ``process_document`` result to a UTF-8 text file.

    The file holds the extracted text first, followed by one line per
    entity (``text (label)``) and a comma-separated keyword list.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        result: Dict with a ``"text"`` string and an ``"nlp_result"`` dict
            holding ``"entities"`` and ``"keywords"``. Missing keys are
            treated as empty.
    """
    nlp_result = result.get("nlp_result", {})
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write("=== Extracted text ===\n")
        output_file.write(result.get("text", ""))
        output_file.write("\n\n=== Entities ===\n")
        for entity in nlp_result.get("entities", []):
            output_file.write(f"{entity['text']} ({entity['label']})\n")
        output_file.write("\n=== Keywords ===\n")
        output_file.write(", ".join(nlp_result.get("keywords", [])))
        output_file.write("\n")


if __name__ == "__main__":
    document_path = "path_to_your_document.pdf"
    result = process_document(document_path)

    # Specify the file path to save the results
    output_file_path = "output_results.txt"

    # Write the results to the specified file
    write_results_to_file(output_file_path, result)

    print(f"Results saved to {output_file_path}")

In [None]:
# This part of the notebook loads plain-text (.txt) documents from a specified directory, so place your documents in that directory.
# Users can enter queries, and the bot searches for an answer in every loaded document.
# Each answer is a span of text extracted from the document by a BERT question-answering model.

In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
import os

In [None]:
# Load the pre-trained BERT question-answering checkpoint.
# Tokenizer and model are loaded from the same checkpoint name.
QA_CHECKPOINT = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

In [None]:
# Function to process a document and return its text
def extract_text_from_nlp_document(document_path_of):
    """Read a UTF-8 text file and return its full contents.

    Args:
        document_path_of: Path of the text file to read.

    Returns:
        The file's contents as a ``str``.
    """
    # Read from the path that was passed in, not from a same-looking global:
    # the function must work on a fresh kernel and for any caller.
    with open(document_path_of, "r", encoding="utf-8") as file:
        return file.read()


In [None]:
# Function to answer user queries based on document content
def answer_user_query(document_text, user_query):
    """Extract the span of ``document_text`` that best answers ``user_query``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        document_text: The context to search for an answer.
        user_query: The question to answer.

    Returns:
        The predicted answer span as a string, or ``""`` when the predicted
        end position precedes the predicted start position.
    """
    # Encode question and context as a sentence pair so the model also gets
    # token_type_ids. Only the context is truncated, keeping the sequence
    # within the model's position-embedding limit.
    encoded = tokenizer(
        user_query,
        document_text,
        truncation="only_second",
        max_length=model.config.max_position_embeddings,
        return_tensors="pt",
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**encoded)

    # Positional indexing works both for a plain tuple and for a ModelOutput,
    # whereas tuple-unpacking a ModelOutput would yield its keys.
    start_scores, end_scores = outputs[0], outputs[1]

    # Find the answer span with the highest score
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # Start and end are chosen independently and may describe an empty span.
    if answer_end < answer_start:
        return ""

    # Convert token IDs back to text
    answer_ids = encoded["input_ids"][0][answer_start:answer_end + 1].tolist()
    return tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )

# Interactive question-answering loop over every .txt file in a directory.
if __name__ == "__main__":
    # Specify the directory where your documents are located
    documents_directory = "path_to_documents_directory"

    # Load all documents from the specified directory, keyed by file name
    document_texts = {}
    for filename in os.listdir(documents_directory):
        if filename.endswith(".txt"):
            document_path = os.path.join(documents_directory, filename)
            # Read the file that was just discovered, so that each entry
            # holds its own document's text.
            document_texts[filename] = extract_text_from_nlp_document(document_path)

    while True:
        user_query = input("Ask a question (or type 'exit' to quit): ")

        if user_query.lower() == "exit":
            break

        # Iterate through the loaded documents and find answers
        for doc_filename, doc_text in document_texts.items():
            answer = answer_user_query(doc_text, user_query)
            print(f"Document: {doc_filename}")
            print("Answer:", answer)