# Libraries and Dependencies

In [None]:
# One-time environment setup: uncomment the lines below to install the Python
# packages, the spaCy English model, and the Tesseract OCR system libraries.
# NOTE(review): this notebook's `import fitz` is satisfied by PyMuPDF (last line);
# the separately named `fitz` package in the first command may be unnecessary — confirm.
# !pip install pytesseract fitz pillow spacy -q pypdf python-dotenv langchain-community chromadb huggingface_hub -q
# !python -m spacy download en_core_web_sm -q
# !apt install tesseract-ocr
# !apt install libtesseract-dev
# !pip install pymupdf --no-cache-dir # Reinstall, using --no-cache-dir to skip cached installations

# Converting the Scanned PDF into Structured PDF

In [None]:
import io
import os
import textwrap

import fitz
import pytesseract
import spacy
from PIL import Image

## Process Input File
The following function handles several file formats (PDF, JPG/JPEG, PNG, TXT) and extracts their text content: PDF pages and images are run through Tesseract OCR, while text files are read directly.

Arguments: 
- file_path : str : The path to the file to be processed

Returns:
- str : Extracted text from the file

In [None]:
def process_file(file_path, encoding="utf-8"):
    """Extract text from a PDF, image, or plain-text file.

    PDFs are rasterised page by page and each page image is passed to
    Tesseract; images are passed to Tesseract directly; text files are read
    as-is. The file type is chosen from the (case-insensitive) extension.

    Args:
        file_path (str): Path to the file to be processed.
        encoding (str): Text encoding used for ``.txt`` files. Defaults to
            UTF-8 so the result does not depend on the platform's locale.

    Returns:
        str: Extracted text from the file. For PDFs this is the OCR output of
        every page concatenated in page order.

    Raises:
        ValueError: If the extension is not .pdf, .jpg, .jpeg, .png or .txt.
    """
    # Get the file extension to determine file type
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        # Render every page to PNG bytes in memory and OCR them. Nothing is
        # written to the working directory, so no stray page_N.png files are
        # left behind and no existing file of that name can be overwritten.
        page_texts = []
        with fitz.open(file_path) as doc:  # closes the document even on error
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                png_bytes = page.get_pixmap().tobytes("png")
                with Image.open(io.BytesIO(png_bytes)) as page_image:
                    page_texts.append(pytesseract.image_to_string(page_image))
        return "".join(page_texts)

    elif ext in ('.jpg', '.jpeg', '.png'):
        # If it's an image, directly use tesseract; the context manager
        # releases the underlying file handle once OCR has finished.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    elif ext == '.txt':
        # If it's a text file, simply read the text
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read()

    else:
        raise ValueError("Unsupported file type")

Set `file_path` to the file you want to process (a `.pdf`, `.jpg`/`.jpeg`, `.png`, or `.txt` file).

In [6]:
# Example usage: choose the input document, then extract its text.
# Alternative sample input: "scansmpl.pdf"
file_path = "scansmpl.jpg"
text = process_file(file_path)

In [7]:
# Analyze the extracted text with spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Structure the text as a list holding each detected sentence, in document order
structured_text = [sent.text for sent in doc.sents]


## QA System
The following section sets up a question-answering system over the document, using LangChain for retrieval and chaining and HuggingFace for the embedding model and the LLM.

In [21]:
import sys
from dotenv import load_dotenv, find_dotenv

from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceEmbeddings

# Add the directory two levels up to the module search path, then load
# environment variables from the .env file located by find_dotenv()
sys.path.append('../..')
_ = load_dotenv(find_dotenv())

# Set up HuggingFace embeddings.
# The model is loaded and run with device 'cuda', so this cell expects a
# CUDA-capable GPU to be available.
embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'cuda'},
    encode_kwargs={'device': 'cuda'}
)

# Retrieve HuggingFace API token from environment variables (TKN is the key in the .env file)
HUGGINGFACEHUB_API_TOKEN = os.getenv("TKN")
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Hand the token to the endpoint explicitly; previously it was read but never
# used, so authentication silently depended on a separately exported
# HUGGINGFACEHUB_API_TOKEN variable. When TKN is unset this passes None and
# the endpoint falls back to its own environment lookup.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

# Define the directory to persist vectorstore data
persist_directory = "./p_dir"

# Load and split the PDF document
# NOTE(review): no visible cell writes "output.pdf"; it must already exist in
# the working directory for a fresh Restart & Run All to succeed — confirm.
loader = PyPDFLoader(file_path="output.pdf")
documents = loader.load()

# Split on newlines into chunks of at most ~1000 characters with a 30-character overlap
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=30,
    separator="\n"
)
documents = text_splitter.split_documents(documents=documents)

# Create a Chroma vectorstore from the documents
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory
)

# Set up the retriever from the vectorstore (only the single best-matching chunk is returned)
retriever = vectordb.as_retriever(search_kwargs={"k": 1})

# Create the RetrievalQA chain; "stuff" inserts the retrieved chunk directly into the prompt
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)


Example use:

In [24]:
# Ask the chain a question. `Chain.run` is deprecated in LangChain, so call
# `invoke` with the chain's "query" input and read the answer from "result".
question = "Where is the registration office located?"
response = qa.invoke({"query": question})["result"]

print(response)

 The registration office for the CROSS Group is located at 40 Vicara Lane, Ilford, Essex.
