In [2]:
# Read all pdf in knowledge base and convert to text
import os
import re
import sys

import fitz  # PyMuPDF
from tqdm import tqdm

def get_knowledge_base_path():
    """
    Get the path to the knowledge base directory.
    """
    # Replace with your actual knowledge base path
    return "Knowledge Base/"

def get_knowledge_base_files(path, extension):
    """
    Get all files in the knowledge base directory with the specified extension.
    """
    return [f for f in os.listdir(path) if f.endswith(extension)]

def get_knowledge_base_text_path():
    """
    Get the path to the directory where text files will be saved.
    """
    # Replace with your actual text file path
    return "/"
def get_knowledge_base_text_file_path(file_name):
    """
    Get the full path for the text file to be saved.
    """
    # Replace with your actual text file path
    return os.path.join(get_knowledge_base_text_path(), file_name)

def write_to_file(file_path, content):
    """
    Write content to a file.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        print(f"Error writing to {file_path}: {e}")

def convert_pdf_to_text(pdf_path):
    """
    Convert a PDF file to text using PyMuPDF.
    """
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

def main():
    # Get the knowledge base path
    knowledge_base_path = get_knowledge_base_path()
    if not os.path.exists(knowledge_base_path):
        print(f"Knowledge base path does not exist: {knowledge_base_path}")
        sys.exit(1)

    # Get all PDF files in the knowledge base
    pdf_files = get_knowledge_base_files(knowledge_base_path, ".pdf")
    if not pdf_files:
        print("No PDF files found in the knowledge base.")
        sys.exit(0)

    # Create a directory for the text files
    text_dir = get_knowledge_base_text_path()
    os.makedirs(text_dir, exist_ok=True)

    # Convert each PDF file to text
    for pdf_file in tqdm(pdf_files, desc="Converting PDFs to text"):
        pdf_path = os.path.join(knowledge_base_path, pdf_file)
        text = convert_pdf_to_text(pdf_path)
        if text:
            text_file_name = re.sub(r'\.pdf$', '.txt', pdf_file)
            text_file_path = get_knowledge_base_text_file_path(text_file_name)
            write_to_file(text_file_path, text)  # Save the text to a file
            print(f"Converted {pdf_file} to {text_file_name}")
        else:
            print(f"Failed to convert {pdf_file} to text.")

if __name__ == "__main__":
    main()


Converting PDFs to text:  33%|███▎      | 3/9 [00:00<00:00, 20.59it/s]

Error writing to /Bangalore _ Bengaluru - Indiranagar _ Barbeque Nation.txt: [Errno 13] Permission denied: '/Bangalore _ Bengaluru - Indiranagar _ Barbeque Nation.txt'
Converted Bangalore _ Bengaluru - Indiranagar _ Barbeque Nation.pdf to Bangalore _ Bengaluru - Indiranagar _ Barbeque Nation.txt
Error writing to /Bangalore _ Electronic City _ Barbeque Nation.txt: [Errno 13] Permission denied: '/Bangalore _ Electronic City _ Barbeque Nation.txt'
Converted Bangalore _ Electronic City _ Barbeque Nation.pdf to Bangalore _ Electronic City _ Barbeque Nation.txt
Error writing to /Bangalore _ JP Nagar _ Barbeque Nation.txt: [Errno 13] Permission denied: '/Bangalore _ JP Nagar _ Barbeque Nation.txt'
Converted Bangalore _ JP Nagar _ Barbeque Nation.pdf to Bangalore _ JP Nagar _ Barbeque Nation.txt
Error writing to /Bangalore _ Koramangala 1st Block _ Barbeque Nation.txt: [Errno 13] Permission denied: '/Bangalore _ Koramangala 1st Block _ Barbeque Nation.txt'
Converted Bangalore _ Koramangala 1st

Converting PDFs to text: 100%|██████████| 9/9 [00:00<00:00, 21.51it/s]

Error writing to /Menu List _ Barbeque Nation.txt: [Errno 13] Permission denied: '/Menu List _ Barbeque Nation.txt'
Converted Menu List _ Barbeque Nation.pdf to Menu List _ Barbeque Nation.txt
Error writing to /New Delhi - Connaught Place _ CP _ cp _ Barbeque Nation.txt: [Errno 13] Permission denied: '/New Delhi - Connaught Place _ CP _ cp _ Barbeque Nation.txt'
Converted New Delhi - Connaught Place _ CP _ cp _ Barbeque Nation.pdf to New Delhi - Connaught Place _ CP _ cp _ Barbeque Nation.txt
Error writing to /New Delhi - Sector C, Vasant Kunj _ Barbeque Nation.txt: [Errno 13] Permission denied: '/New Delhi - Sector C, Vasant Kunj _ Barbeque Nation.txt'
Converted New Delhi - Sector C, Vasant Kunj _ Barbeque Nation.pdf to New Delhi - Sector C, Vasant Kunj _ Barbeque Nation.txt
Error writing to /New Delhi - Unity Mall, Janakpuri _ Barbeque Nation.txt: [Errno 13] Permission denied: '/New Delhi - Unity Mall, Janakpuri _ Barbeque Nation.txt'
Converted New Delhi - Unity Mall, Janakpuri _ Bar


