## Install Dependencies

In [2]:
%pip install -r requirements.txt

DEPRECATION: omegaconf 2.0.6 has a non-standard dependency specifier PyYAML>=5.1.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of omegaconf or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
Note: you may need to restart the kernel to use updated packages.


## Import

In [3]:
import base64
import os
import re

import openai

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from pypdf import PdfReader, PdfWriter

import dotenv 
#load the environment variables of .env file
%load_ext dotenv
%dotenv


## Set up the search index

In [None]:
# Set up the credentials required for Azure Cognitive Search.
# AZURE_SEARCH_ENDPOINT is interpolated as the service name; the full URL is built here.
search_endpoint = f"https://{os.getenv('AZURE_SEARCH_ENDPOINT')}.search.windows.net/"
search_creds = AzureKeyCredential(os.getenv("AZURE_SEARCH_KEY"))
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_creds)

# Define the search index: a string key, the searchable section text, its
# embedding vector, and two filterable/facetable fields recording the source.
index = SearchIndex(
    name=os.getenv("AZURE_SEARCH_INDEX"),
    fields=[
        SimpleField(name="id", type="Edm.String", key=True),
        SearchableField(name="content", type="Edm.String", analyzer_name="en.microsoft"),
        # NOTE(review): 1536 must equal the length of the vectors stored in
        # "embedding" (presumably text-embedding-ada-002 output) - confirm
        # against the deployed embedding model.
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            hidden=False,
            searchable=True,
            filterable=False,
            sortable=False,
            facetable=False,
            vector_search_dimensions=1536,
            vector_search_configuration="default",
        ),
        SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
        SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
    ],
    semantic_settings=SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name='default',
                prioritized_fields=PrioritizedFields(
                    title_field=None,
                    prioritized_content_fields=[SemanticField(field_name='content')],
                ),
            )
        ]
    ),
    # vector_search is an argument of SearchIndex (a sibling of semantic_settings);
    # its "default" configuration is the one referenced by the "embedding" field.
    vector_search=VectorSearch(
        algorithm_configurations=[
            VectorSearchAlgorithmConfiguration(
                name="default",
                kind="hnsw",
                hnsw_parameters=HnswParameters(metric="cosine"),
            )
        ]
    ),
)

# Create the index only when it does not exist yet, so that re-running this cell
# (e.g. Restart Kernel -> Run All) does not fail on an already-created index.
if index.name in index_client.list_index_names():
    print(f"Search index '{index.name}' already exists, skipping creation")
else:
    index_client.create_index(index)

## Set up the embedding model

In [None]:
# Configure the openai SDK for Azure OpenAI. The resource name comes from
# AZURE_OPENAI_SERVICE and the key from AZURE_OPENAI_KEY.
openai_Service = os.getenv("AZURE_OPENAI_SERVICE")

openai.api_type = "azure"
openai.api_version = "2023-05-15"
openai.api_base = f"https://{openai_Service}.openai.azure.com"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")

def compute_embedding(text, engine="embedding"):
    """Return the embedding vector for ``text`` from the configured OpenAI endpoint.

    Args:
        text: The string to embed.
        engine: Name of the deployment passed to ``openai.Embedding.create``.
            Defaults to ``"embedding"``, the name that used to be hard-coded,
            so existing single-argument calls behave as before.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(engine=engine, input=text)
    return response["data"][0]["embedding"]

## Extract data from documents

In [None]:
filename = "./data/" + " "  # your file name

print(f"Extracting text from '{filename}' using PdfReader")

reader = PdfReader(filename)
pages = reader.pages

# Pull the text of every page first, then record for each page its index and
# the character offset at which its text starts in the concatenated document.
page_texts = [p.extract_text() for p in pages]

page_map = []
offset = 0
for page_num, page_text in enumerate(page_texts):
    page_map.append((page_num, offset, page_text))
    offset += len(page_text)

page_map

## Section the extracted text

In [None]:
# Chunking parameters read by split_text (all measured in characters).
SECTION_OVERLAP = 100         # characters shared between consecutive sections
SENTENCE_SEARCH_LIMIT = 100   # how far past the nominal cut to look for a sentence ending
MAX_SECTION_LENGTH = 1_000    # nominal section size before boundary adjustment


def filename_to_id(filename):
    """Build a document-id prefix from a file name.

    The result is ``file-<sanitised name>-<hex>``: the sanitised name has every
    character other than ASCII letters, digits, ``_`` and ``-`` replaced by
    ``_``; the hex part is the upper-case base16 encoding of the UTF-8 bytes of
    the original name, which keeps ids distinct when two names sanitise to the
    same string.
    """
    utf8_bytes = filename.encode('utf-8')
    filename_hash = base64.b16encode(utf8_bytes).decode('ascii')
    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    return f"file-{filename_ascii}-{filename_hash}"

def split_text(page_map):
    """Yield overlapping text sections cut from the concatenated page texts.

    Args:
        page_map: Sequence of ``(page_num, offset, page_text)`` tuples, where
            ``offset`` is the position of the page's first character in the
            concatenation of all page texts.

    Yields:
        ``(section_text, page_index)`` tuples. ``page_index`` is the position in
        ``page_map`` of the page containing the section's first character.

    Sections are nominally ``MAX_SECTION_LENGTH`` characters long. Each boundary
    is moved to a nearby sentence ending when one is found within the search
    window, otherwise to a word break. Consecutive sections overlap by
    ``SECTION_OVERLAP`` characters, except that a section ending inside an
    unclosed ``<table`` makes the next section start at that table. If the whole
    text is no longer than ``SECTION_OVERLAP`` characters, nothing is yielded.
    """
    SENTENCE_ENDINGS = [".", "!", "?"]
    WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]

    def find_page(offset):
        """Return the index in page_map of the page whose text contains ``offset``."""
        l = len(page_map)
        for i in range(l - 1):
            # Page i spans from its own start offset up to the next page's start offset.
            if offset >= page_map[i][1] and offset < page_map[i + 1][1]:
                return i
        # No earlier page matched: the offset belongs to the last page.
        return l - 1

    all_text = "".join(p[2] for p in page_map)
    length = len(all_text)
    start = 0
    end = length
    while start + SECTION_OVERLAP < length:
        last_word = -1
        end = start + MAX_SECTION_LENGTH

        if end > length:
            end = length
        else:
            # Try to find the end of the sentence
            while end < length and (end - start - MAX_SECTION_LENGTH) < SENTENCE_SEARCH_LIMIT and all_text[end] not in SENTENCE_ENDINGS:
                if all_text[end] in WORDS_BREAKS:
                    last_word = end
                end += 1
            if end < length and all_text[end] not in SENTENCE_ENDINGS and last_word > 0:
                end = last_word # Fall back to at least keeping a whole word
        if end < length:
            # Step past the boundary character so the slice below includes it.
            end += 1

        # Try to find the start of the sentence or at least a whole word boundary
        last_word = -1
        while start > 0 and start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT and all_text[start] not in SENTENCE_ENDINGS:
            if all_text[start] in WORDS_BREAKS:
                last_word = start
            start -= 1
        if all_text[start] not in SENTENCE_ENDINGS and last_word > 0:
            start = last_word
        if start > 0:
            # Skip the boundary character itself so the section begins after it.
            start += 1

        section_text = all_text[start:end]
        yield (section_text, find_page(start))

        last_table_start = section_text.rfind("<table")
        if (last_table_start > 2 * SENTENCE_SEARCH_LIMIT and last_table_start > section_text.rfind("</table")):
            # If the section ends with an unclosed table, we need to start the next section with the table.
            # If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
            # If last table starts inside SECTION_OVERLAP, keep overlapping
            start = min(end - SECTION_OVERLAP, start + last_table_start)
        else:
            start = end - SECTION_OVERLAP
        
    # Emit the remaining tail when the loop exits with more than SECTION_OVERLAP
    # characters left between start and end.
    if start + SECTION_OVERLAP < end:
        yield (all_text[start:end], find_page(start))

In [None]:
file_id = filename_to_id(filename)
# File name without directory or extension; every "sourcepage" value is built from it.
source_stem = os.path.splitext(os.path.basename(filename))[0]

# One document per text section: its id, text, embedding, and the page/file it came from.
sections = []
for i, (content, pagenum) in enumerate(split_text(page_map)):
    sections.append({
        "id": f"{file_id}-page-{i}",
        "content": content,
        "embedding": compute_embedding(content),
        "sourcepage": source_stem + f"-{pagenum}" + ".pdf",
        "sourcefile": filename,
    })

## Index sections

In [None]:
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=os.getenv("AZURE_SEARCH_INDEX"),
    credential=search_creds,
)

# Upload the sections in consecutive slices of at most 1000 documents; the last
# slice carries whatever remains.
for batch_start in range(0, len(sections), 1000):
    batch = sections[batch_start:batch_start + 1000]
    results = search_client.upload_documents(documents=batch)
    succeeded = sum(1 for r in results if r.succeeded)
    print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")


## Split PDF into blobs

In [None]:
# Write every page of the source PDF to its own single-page PDF. The file names
# follow the same "<name>-<page index>.pdf" pattern as the "sourcepage" field of
# the indexed sections.
output_dir = "../DataBase/"
# Create the target folder when it is missing so the first write does not fail
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

reader = PdfReader(filename)
pages = reader.pages
base_name = os.path.splitext(os.path.basename(filename))[0]
for i, page in enumerate(pages):
    blob_name = f"{base_name}-{i}.pdf"
    print(f"\tCreating blob for page {i} -> {blob_name}")
    writer = PdfWriter()
    try:
        writer.add_page(page)
        writer.write(os.path.join(output_dir, blob_name))
    finally:
        # Close the writer even if adding or writing the page raised.
        writer.close()

## Searching the Index

In [None]:
# Free-text query to run against the index; replace the placeholder with your own keywords.
query = " "  # your query keywords
# Embed the query with compute_embedding so it can be sent as the vector part of the search.
query_vector = compute_embedding(query)

def nonewlines(s: str) -> str:
    """Return ``s`` with each line feed and carriage return replaced by one space."""
    return s.translate({ord('\n'): ' ', ord('\r'): ' '})

# Send the query text together with its embedding, targeting the "embedding" field.
r = search_client.search(
    query,
    top=3,
    vector=query_vector,
    top_k=50,
    vector_fields="embedding",
)

# One line per hit: the source page name followed by the section text flattened
# onto a single line.
results = []
for doc in r:
    results.append(doc["sourcepage"] + ": " + nonewlines(doc["content"]))

for result in results:
    print(result)