## Install Dependencies

In [2]:
# Install the packages listed in requirements.txt; the %pip magic targets the running kernel's environment.
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## Imports and Credential Setup

In [3]:
import argparse
import base64
import glob
import html
import io
import os
import re
import time

import openai
from azure.core.credentials import AzureKeyCredential
from azure.identity import AzureDeveloperCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.storage.blob import BlobServiceClient
from pypdf import PdfReader, PdfWriter
from tenacity import retry, stop_after_attempt, wait_random_exponential
from dotenv import load_dotenv

In [None]:
# Read settings from the .env file located one directory above the notebook.
load_dotenv(dotenv_path="../.env")

# Point the module-level openai configuration at Azure and supply the API key
# taken from the environment (None when AZURE_OPENAI_KEY is not defined).
# NOTE(review): Azure mode usually also needs openai.api_base / openai.api_version;
# they are not set in this cell -- confirm they are configured elsewhere.
openai.api_key = os.environ.get("AZURE_OPENAI_KEY")
openai.api_type = "azure"

## Set Up the Search Index

In [None]:
# AZURE_SEARCH_ENDPOINT is interpolated as the service *name* (not a full URL):
# it becomes the host prefix of the search endpoint built here.
search_endpoint = "https://{}.search.windows.net/".format(os.getenv("AZURE_SEARCH_ENDPOINT"))

# Key-based credential for the search service, read from the environment.
search_creds = AzureKeyCredential(os.getenv("AZURE_SEARCH_KEY"))

# Client used below to manage (create) indexes on the service.
index_client = SearchIndexClient(credential=search_creds, endpoint=search_endpoint)

In [None]:
# Index schema: a string key, analyzed text content, a vector field for
# embeddings, and two filterable/facetable source-tracking fields.
index = SearchIndex(
    name=os.getenv("AZURE_SEARCH_INDEX"),
    fields=[
        SimpleField(name="id", type="Edm.String", key=True),
        SearchableField(name="content", type="Edm.String", analyzer_name="en.microsoft"),
        # Vector field; refers to the "default" vector search configuration declared below.
        # NOTE(review): 1536 presumably matches the embedding model's output size -- confirm.
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            hidden=False,
            searchable=True,
            filterable=False,
            sortable=False,
            facetable=False,
            vector_search_dimensions=1536,
            vector_search_configuration="default",
        ),
        SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
        SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
    ],
    # Semantic configuration that prioritizes the "content" field.
    semantic_settings=SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name='default',
                prioritized_fields=PrioritizedFields(
                    title_field=None,
                    prioritized_content_fields=[SemanticField(field_name='content')],
                ),
            )
        ]
    ),
    # HNSW vector search using cosine similarity, registered under the name "default".
    vector_search=VectorSearch(
        algorithm_configurations=[
            VectorSearchAlgorithmConfiguration(
                name="default",
                kind="hnsw",
                hnsw_parameters=HnswParameters(metric="cosine"),
            )
        ]
    ),
)

# Creating an index whose name is already taken fails, which would break
# re-running this cell (e.g. Restart & Run All). Only create it when absent.
if index.name not in index_client.list_index_names():
    index_client.create_index(index)
    print(f"Created search index '{index.name}'")
else:
    print(f"Search index '{index.name}' already exists; skipping creation")

## Extract text from documents using pypdf

In [None]:
def get_document_text(filename):
    """Extract the text of every page of a PDF.

    Args:
        filename: Path of the PDF, passed directly to ``PdfReader``.

    Returns:
        A list with one ``(page_num, offset, page_text)`` tuple per page, where
        ``page_num`` is the zero-based page index and ``offset`` is the total
        number of characters extracted from all preceding pages.
    """
    print(f"Extracting text from '{filename}' using PdfReader")

    page_map = []
    chars_so_far = 0
    for index, page in enumerate(PdfReader(filename).pages):
        text = page.extract_text()
        page_map.append((index, chars_so_far, text))
        # The next page starts where this page's text ends.
        chars_so_far += len(text)

    return page_map

## Section of extracted text

In [4]:
def filename_to_id(filename):
    """Build an id string of the form ``file-<sanitized name>-<hex>`` from a filename.

    The middle part is the filename with every character other than ASCII
    letters, digits, ``_`` and ``-`` replaced by ``_``. The last part is the
    upper-case hexadecimal encoding of the filename's UTF-8 bytes, which keeps
    ids distinct for names that sanitize to the same text.
    """
    safe_name = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    hex_suffix = filename.encode('utf-8').hex().upper()
    return "-".join(("file", safe_name, hex_suffix))

filename_to_id("test.pdf")

'file-test_pdf-746573742E706466'