In [4]:
! pip install azure-core
! pip install openai
! pip install azure-search-documents --pre
! pip install PyPDF2
! pip install tenacity

Collecting azure-search-documents
  Obtaining dependency information for azure-search-documents from https://files.pythonhosted.org/packages/25/f4/ec7c1d6bafb037d3017db93ef44e18efe84e6d4e7b8906153a9bb777786e/azure_search_documents-11.4.0b8-py3-none-any.whl.metadata
  Using cached azure_search_documents-11.4.0b8-py3-none-any.whl.metadata (22 kB)
Using cached azure_search_documents-11.4.0b8-py3-none-any.whl (305 kB)
Installing collected packages: azure-search-documents
Successfully installed azure-search-documents-11.4.0b8


In [3]:
! pip uninstall azure-search-documents -y

Found existing installation: azure-search-documents 11.4.0b8
Uninstalling azure-search-documents-11.4.0b8:
  Successfully uninstalled azure-search-documents-11.4.0b8


In [7]:
import json
import openai
import os
import sys
import time
import PyPDF2
import fnmatch
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ScoringProfile,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

In [3]:
# Load secrets and config from .env file
load_dotenv()

# OpenAI API
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL")
print("OpenAI API key: {}".format(openai.api_key[:5] + '...' + openai.api_key[-5:]))
print("OpenAI API base: {}".format(openai.api_base))
print("OpenAI API version: {}".format(openai.api_version))
print("OpenAI API type: {}".format(openai.api_type))

# Azure Search API
search_service_name = os.getenv("SEARCH_SERVICE_NAME")
search_service_key = os.getenv("SEARCH_SERVICE_KEY")
search_index_name = os.getenv("SEARCH_INDEX_NAME")
search_endpoint = "https://{}.search.windows.net/".format(search_service_name)
print("Azure Search service name: {}".format(search_service_name))
print("Azure Search service key: {}".format(search_service_key[:5] + '...' + search_service_key[-5:]))
print("Azure Search index name: {}".format(search_index_name))
print("Azure Search endpoint: {}".format(search_endpoint))

# Other variables
data_directory = os.getenv("FILEPATH_TO_DATA")
index_schema = os.getenv("FILEPATH_TO_INDEX_SCHEMA")
print("Data directory: {}".format(data_directory))
print("Index schema: {}".format(index_schema))

OpenAI API key: 1c7f2...3613f
OpenAI API base: https://wsl-openai-canada.openai.azure.com/
OpenAI API version: 2023-03-15-preview
OpenAI API type: azure
Azure Search service name: wsl-cog-search-test-2
Azure Search service key: 9GUM9...z1hrb
Azure Search index name: cogsrch-index
Azure Search endpoint: https://wsl-cog-search-test-2.search.windows.net/
Data directory: E:/AI/PDFs/Colbun/
Index schema: ./index_schema.json


In [5]:
# Instantiate a client
class CreateClient(object):
    def __init__(self, endpoint, key, index_name):
        self.endpoint = endpoint
        self.index_name = index_name
        self.key = key
        self.credentials = AzureKeyCredential(key)

    # Create a SearchClient
    # Use this to upload docs to the Index
    def create_search_client(self):
        return SearchClient(
            endpoint=self.endpoint,
            index_name=self.index_name,
            credential=self.credentials,
        )

    # Create a SearchIndexClient
    # This is used to create, manage, and delete an index
    def create_admin_client(self):
        return SearchIndexClient(endpoint=self.endpoint, credential=self.credentials)

In [6]:
# Create Search Index from the schema
# If reading the schema from a URL, set url=True
def create_schema_from_json_and_upload(schema, index_name, admin_client):
    # Check if the file exists
    if not os.path.isfile(schema):
        raise ValueError(f"Schema file '{schema}' does not exist")
    
    # Open the file and load the schema data
    with open(schema) as json_file:
        schema_data = json.load(json_file)
    
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    scoring_profiles = []

    index = SearchIndex(
        name=index_name,
        fields=schema_data["fields"],
        scoring_profiles=scoring_profiles,
        suggesters=schema_data["suggesters"],
        cors_options=cors_options,
    )

    try:
        upload_schema = admin_client.create_index(index)
        if upload_schema:
            print(f"Schema uploaded; Index created for {index_name}.")
        else:
            exit(0)
    except:
        print("Unexpected error:", sys.exc_info()[0])

In [7]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(5))
def generate_embeddings(text):
    response = openai.Embedding.create(input=text, engine=embedding_model)
    embeddings = response["data"][0]['embedding']
    return embeddings

In [1]:
def convert_date_to_iso8601(date):
    try:
        date = date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except:
        print("Error converting date to ISO8601 format")
        date = None
    return date

In [9]:
def convert_pdfs_and_upload_to_index(root_dir, client):
    data_list = []
    id_count = 1
    total_files = sum(len(files) for _, _, files in os.walk(root_dir))
    count = 0
    
    for dirpath, dirs, files in os.walk(root_dir):
        for filename in fnmatch.filter(files, '*.pdf'):
            count += 1
            print(f"Processing {filename} {count}/{total_files}")
            pdf_file = os.path.join(dirpath, filename)
            
            with open(pdf_file, 'rb') as fileobj:
                pdf = PyPDF2.PdfReader(fileobj)
                info = pdf.metadata
                author = info.author
                created_date = info.creation_date
                mod_date = info.modification_date
                num_pages = len(pdf.pages)

                # loop through each page of the PDF
                for i in range(num_pages):
                    text = pdf.pages[i].extract_text()
                    
                    try:
                        embeddings = generate_embeddings(text)
                    except:
                        print(f"Error generating embeddings for {pdf_file}")
                        continue

                    data = {
                        "id": str(id_count),
                        "filename": os.path.basename(pdf_file),
                        "author": author,
                        "page_number": i+1,
                        "total_pages": num_pages,
                        "content": text,
                        "embeddings": embeddings
                    }
                    
                    data["created_date"] = convert_date_to_iso8601(created_date)
                    if not data["created_date"]:
                        continue
                    
                    data["last_modified_date"] = convert_date_to_iso8601(mod_date)
                    if not data["last_modified_date"]:
                        continue

                    data_list.append(data)
                    id_count += 1
                
    client.upload_documents(documents=data_list)
    print("Done!")

In [13]:
schema = create_schema_from_json_and_upload(index_schema, search_index_name, admin_client)
convert_pdfs_and_upload_to_index(data_directory, search_client)
print("Upload complete")

Schema uploaded; Index created for cogsrch-index.
Processing DE03587-23.pdf 1/100
Processing DE03590-23.pdf 2/100
Processing DE03591-23.pdf 3/100
Processing DE03592-23.pdf 4/100
Processing DE03596-23.pdf 5/100
Processing DE03597-23.pdf 6/100
Processing DE03598-23.pdf 7/100
Processing DE03599-23.pdf 8/100
Processing DE03610-23.pdf 9/100
Processing DE03612-23.pdf 10/100
Processing DE03613-23.pdf 11/100
Error converting created_date for E:/AI/PDFs/Colbun/DE03613-23.pdf
Error converting created_date for E:/AI/PDFs/Colbun/DE03613-23.pdf
Processing DE03616-23.pdf 12/100
Processing DE03617-23.pdf 13/100
Processing DE03634-23.pdf 14/100
Processing DE03641-23.pdf 15/100
Processing DE03642-23.pdf 16/100
Processing DE03643-23.pdf 17/100
Processing DE03644-23.pdf 18/100
Processing DE03646-23.pdf 19/100
Processing DE03647-23.pdf 20/100
Processing DE03655-23.pdf 21/100
Processing DE03661-23.pdf 22/100
Processing DE03663-23.pdf 23/100
Processing DE03664-23.pdf 24/100
Error converting created_date for

In [13]:
# Clean up Azure resources
admin_client.delete_index(search_index_name)
print("Index deleted")


Index deleted
