In [11]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core import KnowledgeGraphIndex
from llama_index.core import StorageContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.core import Document

from PIL import Image
import pytesseract
import os
import fitz

In [12]:
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset the original hard-coded location is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Users\Yazat\AppData\Local\tesseract.exe"
)

def ocr_image(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of an image file that ``PIL.Image.open`` can read.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Open the image in a ``with`` block so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

def read_docx(file_path):
    """Extract the body text of a .docx file, one line per paragraph.

    The name ``Document`` in this notebook refers to the llama_index class,
    which cannot open Word files and has no ``paragraphs`` attribute, so the
    file is parsed directly: a .docx is a zip archive whose main part,
    ``word/document.xml``, holds the paragraphs.

    Args:
        file_path: Path of the .docx file.

    Returns:
        The text of every top-level body paragraph joined with ``"\\n"``.
        Text inside tables is not included. An archive with no body yields
        an empty string.

    Raises:
        zipfile.BadZipFile: If the file is not a zip archive.
        KeyError: If the archive has no ``word/document.xml`` part.
    """
    import zipfile
    import xml.etree.ElementTree as ET

    # WordprocessingML namespace, in ElementTree's "{uri}tag" notation.
    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    with zipfile.ZipFile(file_path) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    body = root.find(f"{w}body")
    if body is None:
        return ""

    paragraphs = []
    for paragraph in body.findall(f"{w}p"):
        pieces = []
        # Only look inside runs (<w:r>): <w:tab> also appears in paragraph
        # properties as a tab-stop definition, which is not text.
        for run in paragraph.iter(f"{w}r"):
            for child in run:
                if child.tag == f"{w}t":
                    pieces.append(child.text or "")
                elif child.tag == f"{w}tab":
                    pieces.append("\t")
                elif child.tag in (f"{w}br", f"{w}cr"):
                    pieces.append("\n")
        paragraphs.append("".join(pieces))
    return "\n".join(paragraphs)

def read_txt(file_path):
    """Return the entire contents of a UTF-8 encoded text file as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    
def read_pdf(file_path):
    """Extract text from a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The ``get_text()`` output of every page, concatenated in page order.
        A document with no pages yields an empty string.
    """
    # The ``with`` block closes the document even if a page fails to parse;
    # previously the handle was never closed.
    with fitz.open(file_path) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

def load_documents_with_ocr(directory):
    """Build one ``Document`` per supported file found directly in ``directory``.

    Supported types are PDF, common image formats (read through OCR), DOCX and
    TXT. Extensions are matched case-insensitively. Sub-directories and files
    with any other extension are ignored.

    A file that cannot be read is reported with ``print`` and skipped, so one
    bad file does not abort the whole load.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        A list of ``Document`` objects, ordered by file name, each carrying the
        extracted text and ``{'file_name': <name>}`` as metadata.
    """
    # (label used in the error message, accepted suffixes, extractor function)
    readers = (
        ("PDF", (".pdf",), read_pdf),
        ("image", ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'), ocr_image),
        ("DOCX", (".docx",), read_docx),
        ("TXT", (".txt",), read_txt),
    )

    documents = []
    # os.listdir() returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        if os.path.isdir(file_path):
            continue  # Skip directories within the main directory

        lowered = filename.lower()
        for label, suffixes, extract in readers:
            if not lowered.endswith(suffixes):
                continue
            try:
                text = extract(file_path)
                documents.append(Document(text=text, metadata={'file_name': filename}))
            except Exception as e:
                print(f"Error loading {label}: {file_path}, {e}")
            break  # a file matches at most one reader
    return documents

# Folder containing the files to index. Set the RAG_DATA_DIR environment
# variable to use a different folder; when it is unset the original
# hard-coded location is used.
DATA_DIR = os.environ.get("RAG_DATA_DIR", r"C:\Users\Yazat\Desktop\RAG\pdf")
documents = load_documents_with_ocr(DATA_DIR)

print(documents)

[Document(id_='b502bb76-7c73-4bfd-8b54-d5f0ee4efc1b', embedding=None, metadata={'file_name': 'draft.txt'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Yazat's full name is yazat mishra. He is 21 years old", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='4df27428-ddfb-48dd-bc35-087848299b85', embedding=None, metadata={'file_name': 'image.png'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Yazat is from Delhi, he has pursued his high school education from Amity Noida.|\n\n', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='c5f6618a-a40f-468e-a199-e56f8ec04f7a', embedding=None, metadata={'file_name': 'rank.pdf'}, excluded_embed_metadata

In [13]:
# Instruction text describing how the assistant should answer.
# NOTE(review): no cell visible here passes `system_prompt` to the LLM or the
# query engine, so it currently appears to have no effect — confirm.
system_prompt = """
    You are a technical assistant whose goal is to answer 
    questions based on the instructions 
    and the context provided. 
    If you do not have an answer from 
    the provided information say so.
    """
    
# Template that wraps the query text ({query_str}) in <|USER|> / <|ASSISTANT|> markers.
# NOTE(review): `query_wrapper_prompt` is likewise not referenced by any cell
# visible here — confirm whether it should be wired into the query engine.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [14]:
# Ollama-backed LLM used for triplet extraction and answering.
llm = Ollama(
    model="llama3:8b",
    request_timeout=20000,
)
# HuggingFace sentence-transformers model used to embed text.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
chunk_overlap = 30

# Register the models and the overlap on llama_index's global Settings object.
Settings.llm, Settings.embed_model, Settings.chunk_overlap = llm, embed_model, chunk_overlap


In [18]:
import getpass

# Neo4j connection settings are read from the environment so that credentials
# are not stored in the notebook. Username, URL and database fall back to the
# previous literal values; the password has no stored fallback and is prompted
# for when NEO4J_PASSWORD is unset.
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
url = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
database = os.environ.get("NEO4J_DATABASE", "neo4j")

graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)

# Route the index's graph storage to the Neo4j store created above.
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# Build the knowledge graph from the loaded documents. The recorded run of
# this cell spent about 27 minutes in "Processing nodes" for three documents.
index = KnowledgeGraphIndex.from_documents(
    documents, 
    storage_context=storage_context,
    llm=llm,
    embed_model=embed_model,
    show_progress=True,
    include_embeddings=True
)

print("KnowledgeGraphIndex created successfully")

Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 143.32it/s]
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  2.56it/s]
Generating embeddings: 100%|██████████| 2/2 [00:00<00:00,  4.59it/s]
Generating embeddings: 100%|██████████| 8/8 [00:00<00:00, 10.78it/s]
Processing nodes: 100%|██████████| 3/3 [27:39<00:00, 553.01s/it]

KnowledgeGraphIndex created successfully





In [19]:
# Ask the knowledge-graph index a question and print the answer as it streams in.
question = "Tell me where has Yazat undergone his high school and where is he currently studying from?"

query_engine = index.as_query_engine(streaming=True)
streaming_response = query_engine.query(question)
streaming_response.print_response_stream()

Yazat underwent his high school at Amity Noida. He is currently studying at National Institute of Technology Mizoram.