In [1]:
# Code taken from the Unstructured library https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/cleaners/core.py

import re

# Characters treated as list bullets. Each entry is joined into a regex
# alternation below, so every entry must already be a valid regex fragment.
UNICODE_BULLETS = [
    "\u0095",
    "\u2022",
    "\u2023",
    "\u2043",
    "\u3164",
    "\u204C",
    "\u204D",
    "\u2219",
    "\u25CB",
    "\u25CF",
    "\u25D8",
    "\u25E6",
    "\u2619",
    "\u2765",
    "\u2767",
    "\u29BE",
    "\u29BF",
    "\u002D",  # plain hyphen: a leading "-" is therefore treated as a bullet
    # Private-use bullet glyph written as an explicit escape. NOTE(review): the
    # entry was an (apparently) empty literal; presumably the invisible U+F0B7
    # glyph was lost in copy/paste. An empty entry adds an empty alternative to
    # the pattern, which makes the negative lookaheads below always fail.
    "\uf0b7",
    r"\*",  # raw string: "\*" in a normal literal is an invalid escape sequence
    "\x95",
    "·",
]

# Alternation of all bullet characters, reused by the compiled patterns below.
BULLETS_PATTERN = "|".join(UNICODE_BULLETS)

# One bullet character that is NOT immediately followed by another bullet.
UNICODE_BULLETS_RE = re.compile(f"(?:{BULLETS_PATTERN})(?!{BULLETS_PATTERN})")

# A line break with any surrounding whitespace.
PARAGRAPH_PATTERN = r"\s*\n\s*"  # noqa: W605 NOTE(harrell)

# A bullet or a line break, provided it is not followed by a bullet or end of text.
PARAGRAPH_PATTERN_RE = re.compile(
    f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
)
# Two consecutive line breaks, i.e. a blank-line paragraph separator.
DOUBLE_PARAGRAPH_PATTERN_RE = re.compile("(" + PARAGRAPH_PATTERN + "){2}")

# A lone "e" at the start of a line followed by whitespace.
E_BULLET_PATTERN = re.compile(r"^e(?=\s)", re.MULTILINE)


def clean_non_ascii_chars(text) -> str:
    r"""Drop every character of ``text`` that has no ASCII encoding.

    The string is encoded to ASCII with the ``"ignore"`` error handler, which
    discards unencodable characters, and the resulting bytes are decoded back.

    Example
    -------
    \x88This text contains non-ascii characters!\x88
        -> This text contains non-ascii characters!
    """
    return text.encode("ascii", "ignore").decode()

def clean_bullets(text: str) -> str:
    """Remove a single leading bullet character from ``text``.

    The text is returned unchanged unless ``UNICODE_BULLETS_RE`` matches at its
    very first character. Otherwise the first match is replaced by a space and
    the result is stripped of surrounding whitespace.

    Example
    -------
    ●  This is an excellent point! -> This is an excellent point!
    """
    if UNICODE_BULLETS_RE.match(text) is None:
        return text

    # `count` is passed by keyword: passing it positionally is deprecated
    # since Python 3.13.
    return UNICODE_BULLETS_RE.sub(" ", text, count=1).strip()

def clean_extra_whitespace(text: str) -> str:
    """Collapse runs of spaces and turn newlines / non-breaking spaces into spaces.

    Newlines and ``\\xa0`` become a single space first; any run of two or more
    spaces is then reduced to one, and the result is stripped. Other whitespace
    inside the text (tabs, for instance) is left alone.

    Example
    -------
    ITEM 1.     BUSINESS -> ITEM 1. BUSINESS
    """
    return re.sub(r"([ ]{2,})", " ", re.sub(r"[\xa0\n]", " ", text)).strip()

def group_broken_paragraphs(
    text: str,
    line_split: re.Pattern[str] = PARAGRAPH_PATTERN_RE,
    paragraph_split: re.Pattern[str] = DOUBLE_PARAGRAPH_PATTERN_RE,
) -> str:
    """Re-join paragraphs whose lines were broken only for layout reasons.

    ``text`` is split into paragraphs with ``paragraph_split``; blank pieces are
    dropped. Each remaining paragraph is handled in one of three ways:

    * it starts with a bullet (or a lone leading "e"): it is handed to
      ``group_bullet_paragraph`` and the returned items are kept;
    * every piece produced by ``line_split`` has fewer than five
      space-separated words: the non-blank pieces are kept as separate entries;
    * otherwise: its line breaks are replaced by single spaces.

    The kept entries are joined with a blank line between them.

    For example:

    '''The big red fox
    is walking down the lane.

    At the end of the lane
    the fox met a bear.'''

    Gets converted to

    '''The big red fox is walking down the lane.
    At the end of the land the fox met a bear.'''

    NOTE(review): ``group_bullet_paragraph`` is not defined in the visible part
    of this notebook; unless it is defined elsewhere, the bullet branch raises
    ``NameError``.
    """
    grouped = []
    for chunk in paragraph_split.split(text):
        stripped = chunk.strip()
        if not stripped:
            continue

        pieces = line_split.split(chunk)
        every_piece_short = all(len(piece.strip().split(" ")) < 5 for piece in pieces)
        starts_with_bullet = UNICODE_BULLETS_RE.match(stripped) or E_BULLET_PATTERN.match(stripped)

        if starts_with_bullet:
            grouped.extend(group_bullet_paragraph(chunk))
        elif every_piece_short:
            grouped.extend(piece for piece in pieces if piece.strip())
        else:
            grouped.append(re.sub(PARAGRAPH_PATTERN, " ", chunk))

    return "\n\n".join(grouped)

def merge_hyphenated_words(text):
    """
    Re-join words that were split by a hyphen followed by whitespace.

    Every occurrence of "<word chars>-<whitespace><word chars>" (for example
    'import- ant', or a hyphen at a line end followed by the rest of the word)
    is replaced by the two word parts concatenated, with the hyphen and the
    whitespace removed. A hyphen that is not followed by whitespace, as in
    'well-known', is left untouched.

    Parameters:
        text (str): The text containing hyphenated words to be merged.

    Returns:
        str: The text with all such split words merged.

    Example:
        corrected_text = merge_hyphenated_words("The document was import- ant for the meeting.")
        print(corrected_text)  # Output: "The document was important for the meeting."
    """
    # Group 1 is the part before the hyphen, group 2 the part after the gap.
    return re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

def remove_citations(text):
    """Delete bracketed numeric citation markers such as ``[7]`` or ``[123]``.

    Only brackets that contain one to three digits are removed; the text around
    them, including any surrounding spaces, is left as is.

    Parameters:
        text (str): The text to clean.

    Returns:
        str: ``text`` without the citation markers.
    """
    # Raw string: "\[" and "\d" are invalid escape sequences in a normal
    # literal and trigger a SyntaxWarning.
    return re.sub(r"\[\d{1,3}\]", "", text)

def clean(
    text: str,
    extra_whitespace: bool = False,
    broken_paragraphs: bool = False,
    bullets: bool = False,
    ascii: bool = False,
    lowercase: bool = False,
    citations: bool = False,
    merge_split_words: bool = False,
) -> str:
    """Apply the selected cleaning steps to ``text`` and strip the result.

    Enabled steps always run in this fixed order: lowercase, non-ASCII removal,
    citation removal, whitespace normalisation, leading-bullet removal,
    hyphenated-word merging.

    Parameters:
        text: The text to clean.
        extra_whitespace: Run ``clean_extra_whitespace``.
        broken_paragraphs: Accepted for callers that pass it, but not read by
            this function; it currently has no effect.
        bullets: Run ``clean_bullets``.
        ascii: Run ``clean_non_ascii_chars``.
        lowercase: Lower-case the text.
        citations: Run ``remove_citations``.
        merge_split_words: Run ``merge_hyphenated_words``.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    result = text
    if lowercase:
        result = result.lower()
    if ascii:
        result = clean_non_ascii_chars(result)
    if citations:
        result = remove_citations(result)
    if extra_whitespace:
        result = clean_extra_whitespace(result)
    if bullets:
        result = clean_bullets(result)
    if merge_split_words:
        result = merge_hyphenated_words(result)
    return result.strip()

  "\*",
  remove_citations = lambda text: re.sub("\[\d{1,3}\]", "", text)


In [2]:
import os
import sys
import getpass
import nest_asyncio
import fitz
from dotenv import load_dotenv 

# Presumably patches asyncio so event loops can be nested inside the notebook's
# already-running loop — confirm against the nest_asyncio documentation.
nest_asyncio.apply()

# Load settings from a .env file (python-dotenv); the environment variables read
# in later cells (API keys, Qdrant URL) may be supplied this way.
load_dotenv()

#sys.path.append('../helpers')

#from text_cleaning_helpers import clean

True

In [None]:
#pip install frontend

In [3]:
# PDF that the loader and PyMuPDF cells below operate on (relative to the working directory).
PDF_PATH = "FAQ.pdf"

# Parser endpoint for SmartPDFLoader; only referenced by a commented-out line in a later cell.
LLMSHERPA_API_URL = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

In [None]:
#!pip install llama_index

In [None]:
#!pip install llama_index.readers.smart_pdf_loader

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PDFReader
from llama_index.readers.smart_pdf_loader import SmartPDFLoader


#pdf_reader_docs = PDFReader().load_data(PDF_PATH)
#smart_pdf_loader_docs = SmartPDFLoader(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(PDF_PATH)
# Load PDF_PATH through llama_index's SimpleDirectoryReader.
# NOTE(review): the result is not referenced by any later visible cell.
simple_directory_reader_docs = SimpleDirectoryReader(input_files=[PDF_PATH]).load_data()

In [None]:
#pip install fitz

In [9]:
#mkdir static


A subdirectory or file static already exists.


In [None]:
#pip install tools

In [None]:
#pip install pymupdf

In [None]:
#pip install --upgrade PyMuPDF


In [5]:
import fitz
# Open the PDF with PyMuPDF; the handle stays open for the rest of the session (it is never closed here).
document = fitz.open(PDF_PATH)

def extract_text(document, opt="text"):
    '''Return the text of one page as a list of lines.

    Despite its name, ``document`` is the object whose ``get_text`` is called;
    the cell below passes individual pages. The text is requested with
    ``sort=True`` and split on newline characters.

    A later cell defines another ``extract_text`` with a different signature,
    which replaces this one once that cell has run.
    '''
    return document.get_text(opt, sort=True).split("\n")

# One list of text lines per page of the opened document.
pages = [extract_text(page) for page in document]

In [6]:
def get_document(file_path, pages=None):
    """
    Open a PDF with ``fitz`` and optionally restrict it to selected pages.

    Parameters:
        file_path (str): The path to the PDF file to be opened.
        pages (list of int, optional): Page numbers to keep, passed unchanged to
            the document's ``select`` method. Defaults to ``None``, which keeps
            the whole document.
            NOTE(review): PyMuPDF page numbers are presumably 0-based — confirm
            that callers account for this.

    Returns:
        The opened ``fitz`` document. When ``pages`` is given, the document has
        been narrowed in place to those pages. It is not closed here; that is
        left to the caller.
    """
    document = fitz.open(file_path)
    if pages is not None:
        # Keep only the requested pages.
        document.select(pages)
    return document


def handle_chapter_headers_footers(strings, flag):
    """
    Drop header/footer entries from a list of strings and join the rest.

    Empty strings are discarded first. If more than three strings remain, the
    list is trimmed according to ``flag``; shorter lists are never trimmed.
    The surviving strings are joined with single spaces and the result is
    stripped.

    Parameters:
        strings (list of str): The strings to process (e.g. the lines of a page).
        flag (str): Which entries to trim:
            - 'remove_first': drop the first entry.
            - 'remove_last': drop the last entry.
            - 'remove_first_last': drop both the first and the last entry.
            - 'remove_first_two': drop the first two entries.
            - Any other value leaves the list unchanged.

    Returns:
        str: The remaining entries joined by spaces.
    """
    kept = [item for item in strings if item]

    # Trimming is only attempted when more than three non-empty strings remain.
    if len(kept) > 3:
        trim_rules = (
            ('remove_first', slice(1, None)),
            ('remove_last', slice(None, -1)),
            ('remove_first_last', slice(1, -1)),
            ('remove_first_two', slice(2, None)),
        )
        for rule_name, window in trim_rules:
            if flag == rule_name:
                kept = kept[window]
                break

    return ' '.join(kept).strip()

def extract_text(page, file_name, title, author, flag, opt="text"):
    """
    Extract and clean the text of one page and bundle it with metadata.

    The page text is fetched with ``page.get_text(opt, sort=True)``, split into
    lines, passed through ``handle_chapter_headers_footers`` (which trims
    entries according to ``flag`` and joins the lines with spaces) and finally
    through ``clean`` with a fixed set of options.

    Parameters:
        page (fitz.Page): The page object from which to extract text.
        file_name (str): Stored as-is under ``metadata["file_name"]``.
        title (str): The title of the document.
        author (str): The author of the document.
        flag (str): Forwarded to ``handle_chapter_headers_footers``.
        opt (str, optional): Extraction mode forwarded to ``get_text``.
            Defaults to "text".

    Returns:
        dict: A dictionary with two keys:
            - 'text': the cleaned page text.
            - 'metadata': a dictionary holding ``page_number`` (``page.number``),
              ``file_name``, ``title`` and ``author``.
    """
    raw_lines = page.get_text(opt, sort=True).split("\n")
    joined = handle_chapter_headers_footers(raw_lines, flag)

    page_text = clean(
        joined,
        extra_whitespace=True,
        broken_paragraphs=True,
        bullets=True,
        ascii=True,
        lowercase=False,
        citations=True,
        merge_split_words=True,
    )

    metadata = {
        "page_number": page.number,
        "file_name": file_name,
        "title": title,
        "author": author
    }
    return {"text": page_text, "metadata": metadata}

def extract_texts_from_pdf(file_path, title, author, pages, flag):
    """
    Extract cleaned text and metadata for the selected pages of a PDF.

    Parameters:
        file_path (str): Path of the PDF to open.
        title (str): Document title stored in each page's metadata.
        author (str): Document author stored in each page's metadata.
        pages (list of int or None): Page selection forwarded to ``get_document``.
        flag (str): Header/footer handling flag forwarded to ``extract_text``.

    Returns:
        list of dict: One ``extract_text`` result per page, in document order.
    """
    document = get_document(file_path, pages)
    # The metadata carries the bare file name, not the full path; the base name
    # was previously computed but the path was passed instead.
    file_name = os.path.basename(file_path)
    try:
        return [extract_text(page, file_name, title, author, flag) for page in document]
    finally:
        # All page text has been copied out by now, so release the file handle.
        document.close()

In [4]:
import os
# Extraction jobs: one dict per PDF holding the arguments for extract_texts_from_pdf.
pdf_files = [
    {
        "file_path": "FAQ.pdf", 
        "title": "FAQ.pdf", 
        "author": "OECD", 
        # NOTE(review): these numbers are passed to the document's select();
        # PyMuPDF pages are presumably 0-based, so this range would skip the
        # first page and requires at least ten pages — confirm intent.
        "pages": list(range(1,10)),
        "flag": "remove_last"
        },
    
    ]
    
# Accumulates the per-page dicts ({"text": ..., "metadata": ...}) of every job.
all_texts = []

for pdf in pdf_files:
    print(f"Extracting texts from {pdf['title']} by {pdf['author']}...")
    texts = extract_texts_from_pdf(pdf["file_path"], pdf["title"], pdf["author"], pdf["pages"], pdf["flag"])
    print(f"Finished extracting texts from {pdf['title']}.")
    all_texts.extend(texts)

  "file_path": "GLP Documents\GLP_doc_2.pdf",
  "file_path": "GLP Documents\GLP_doc_3.pdf",
  "file_path": "GLP Documents\GLP_doc_4.pdf",
  "file_path": "GLP Documents\GLP_doc_5.pdf",
  "file_path": "GLP Documents\GLP_doc_6.pdf",
  "file_path": "GLP Documents\GLP_doc_7.pdf",
  "file_path": "GLP Documents\GLP_doc_8.pdf",
  "file_path": "GLP Documents\GLP_doc_9.pdf",
  "file_path": "GLP Documents\GLP_doc_10.pdf",
  "file_path": "GLP Documents\GLP_doc_11.pdf",
  "file_path": "GLP Documents\GLP_doc_12.pdf",
  "file_path": "GLP Documents\GLP_doc_13.pdf",
  "file_path": "GLP Documents\GLP_doc_14.pdf",
  "file_path": "GLP Documents\GLP_doc_15.pdf",
  "file_path": "GLP Documents\GLP_doc_16.pdf",
  "file_path": "GLP Documents\GLP_doc_17.pdf",
  "file_path": "GLP Documents\GLP_doc_18.pdf",
  "file_path": "GLP Documents\GLP_doc_19.pdf",
  "file_path": "GLP Documents\GLP_doc_20.pdf",
  "file_path": "GLP Documents\GLP_doc_21.pdf",
  "file_path": "GLP Documents\GLP_doc_22.pdf",
  "file_path": "GLP D

Extracting texts from GLP_doc_2 by NGCMA...
Finished extracting texts from GLP_doc_2.
Extracting texts from GLP_doc_3 by NGCMA...
Finished extracting texts from GLP_doc_3.
Extracting texts from GLP_doc_4 by NGCMA...
Finished extracting texts from GLP_doc_4.
Extracting texts from GLP_doc_5 by NGCMA...
Finished extracting texts from GLP_doc_5.
Extracting texts from GLP_doc_6 by NGCMA...
Finished extracting texts from GLP_doc_6.
Extracting texts from GLP_doc_7 by NGCMA...
Finished extracting texts from GLP_doc_7.
Extracting texts from GLP_doc_8 by NGCMA...
Finished extracting texts from GLP_doc_8.
Extracting texts from GLP_doc_9 by NGCMA...
Finished extracting texts from GLP_doc_9.
Extracting texts from GLP_doc_10 by NGCMA...
Finished extracting texts from GLP_doc_10.
Extracting texts from GLP_doc_11 by NGCMA...
Finished extracting texts from GLP_doc_11.
Extracting texts from GLP_doc_12 by NGCMA...
Finished extracting texts from GLP_doc_12.
Extracting texts from GLP_doc_13 by NGCMA...
Fin

In [5]:
from llama_index.core import Document

# Wrap each extracted page dict in a llama_index Document, carrying its text and metadata over.
llama_index_docs = [Document(text=doc["text"], metadata=doc["metadata"]) for doc in all_texts]

In [6]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.storage import StorageContext

# Create a SimpleDocumentStore and add the documents
# Create a SimpleDocumentStore and add the documents
docstore = SimpleDocumentStore()
docstore.add_documents(llama_index_docs)

# Create a storage context
storage_context = StorageContext.from_defaults(docstore=docstore)

# Persist the document store to disk; a later cell reloads it from this same
# path via get_documents_from_docstore.
storage_context.persist("data/words-of-the-sequence")

In [59]:
%%capture
%pip install llama-index==0.10.37 cohere==5.5.0 openai==1.30.1 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-cohere==0.2.0

In [None]:
#!pip install datasets
#!pip install llama_index.embeddings.fastembed 

In [7]:
import random
import time
from datasets import Dataset
from tqdm import tqdm
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#pip install python-dotenv


In [None]:
#!pip install llama-index
#!pip install cohere


In [None]:
#pip install llama_index.embeddings.cohere 


In [None]:
#!pip install llama_index.llms.mistralai 

In [None]:
#pip install --upgrade llama-index openai pydantic


In [None]:
#pip install --upgrade llama-index cohere pydantic


In [8]:
import os
import sys
from getpass import getpass
import nest_asyncio
import llama_index.embeddings.cohere 


from IPython.display import Markdown, display

from dotenv import load_dotenv

# Same setup as the earlier cell, repeated so that this part of the notebook can
# be run without it. Presumably patches asyncio for nested event loops — confirm
# against the nest_asyncio documentation.
nest_asyncio.apply()

# Load settings from a .env file so the credential cells below can read them
# from the environment.
load_dotenv()

#sys.path.append('../helpers')

from llama_def import setup_llm, setup_embed_model, setup_vector_store

In [9]:
# Cohere API key: taken from the environment, otherwise prompted for.
# `.get` is required for the fallback to work: `os.environ[...]` raises KeyError
# when the variable is unset, so the prompt would never be reached.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter your Cohere API key: ")

In [10]:
# OpenAI API key: taken from the environment, otherwise prompted for.
# `.get` is required for the fallback to work: `os.environ[...]` raises KeyError
# when the variable is unset, so the prompt would never be reached.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [11]:
# Qdrant endpoint URL: taken from the environment, otherwise prompted for.
# `.get` is required for the fallback to work: `os.environ[...]` raises KeyError
# when the variable is unset, so the prompt would never be reached.
QDRANT_URL = os.environ.get('QDRANT_URL') or getpass("Enter your Qdrant URL:")

In [12]:
# Qdrant API key: taken from the environment, otherwise prompted for.
# `.get` is required for the fallback to work: `os.environ[...]` raises KeyError
# when the variable is unset, so the prompt would never be reached.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [13]:
from llama_index.core.settings import Settings
from llama_def import setup_llm, setup_embed_model, setup_vector_store

# Name of the Qdrant collection handed to setup_vector_store below.
COLLECTION_NAME = "words-of-the-sequence"

# setup_llm / setup_embed_model come from the project-local llama_def module.
# Presumably they register the models on llama_index's global Settings (a later
# cell reads Settings.embed_model) — confirm in llama_def.
setup_llm(
    provider="cohere", 
    model="command-r-plus", 
    api_key=CO_API_KEY
    )

setup_embed_model(
    provider="openai", 
    model_name="text-embedding-3-large",
    api_key=OPENAI_API_KEY
    )

# Vector store handle for the collection, built from the Qdrant URL and API key read above.
vector_store = setup_vector_store(QDRANT_URL, QDRANT_API_KEY, COLLECTION_NAME)

Both client and aclient are provided. If using `:memory:` mode, the data between clients is not synced.


In [14]:
from llama_def import get_documents_from_docstore

# Reload the documents from the path the docstore was persisted to earlier.
documents = get_documents_from_docstore("data/words-of-the-sequence")

In [15]:
from llama_def import get_documents_from_docstore

# NOTE(review): this cell repeats the previous one verbatim and reloads the same documents.
documents = get_documents_from_docstore("data/words-of-the-sequence")

In [16]:
from llama_index.core.constants import DEFAULT_CHUNK_SIZE
from llama_index.core.node_parser.text import SentenceSplitter
from llama_index.core import StorageContext
from llama_def import ingest

print(f"This is the chunk size: {DEFAULT_CHUNK_SIZE}")

# Transformations handed to ingest: a sentence splitter using the default chunk
# size, then the configured embedding model.
# (`tranforms` is a misspelling of "transforms"; the name is left as is.)
tranforms = [
    SentenceSplitter(chunk_size=DEFAULT_CHUNK_SIZE), 
    Settings.embed_model
    ]

# ingest is a llama_def helper; presumably it runs the transformations over the
# documents and writes the resulting nodes to the vector store — confirm there.
nodes = ingest(
    documents=documents,
    transformations=tranforms,
    vector_store=vector_store,
)

This is the chunk size: 1024


In [17]:
from llama_def import create_index, create_query_engine

# NOTE(review): this storage context is only referenced by the commented-out
# argument below, so it is currently unused.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store
    )

# Build the index on top of the existing vector store (llama_def helper).
index = create_index(
    from_where="vector_store",
    embed_model=Settings.embed_model, 
    vector_store=vector_store, 
    # storage_context=storage_context
    )

# Query engine over that index; the llm argument is commented out, so whatever
# default create_query_engine uses applies.
query_engine = create_query_engine(
    index=index, 
    mode="query",
    # llm=Settings.llm
    )

In [18]:
from llama_def import create_query_pipeline

from llama_index.core.query_pipeline import InputComponent

# Entry point of the pipeline; receives the `input` keyword passed to run().
input_component = InputComponent()

# Two-step chain: the pipeline input, then the query engine.
chain = [input_component, query_engine]

query_pipeline = create_query_pipeline(chain)

In [21]:
# Run one question through the pipeline; the captured output shows each module logging its input.
response = query_pipeline.run(input='How should the frequency of QA audits be determined?')

[1;3;38;2;155;135;227m> Running module 34176422-4338-4854-aa3a-f0e474a7100e with input: 
input: How should the frequency of QA audits be determined?

[0m[1;3;38;2;155;135;227m> Running module 9a3256be-d287-405d-b4a8-b89ef904339c with input: 
input: How should the frequency of QA audits be determined?

[0m

In [22]:
# Show the answer text produced for the query above.
print(response)

The frequency of QA audits should be determined by the type of inspection being carried out and the associated risks. A risk-based approach allows QA personnel to determine the type of inspection, when to carry it out, and how to allocate resources effectively.
