In [1]:
! pip install azure-core
! pip install openai
! pip install azure-search-documents --pre
! pip install PyPDF2
! pip install tenacity
! pip install openai[datalib]



In [3]:
import json
import openai
import os
import sys
import time
import PyPDF2
import fnmatch
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ScoringProfile,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
)

In [4]:
# Load secrets and config from .env file
load_dotenv()

# OpenAI API
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL")
print("OpenAI API key: {}".format(openai.api_key[:5] + '...' + openai.api_key[-5:]))
print("OpenAI API base: {}".format(openai.api_base))
print("OpenAI API version: {}".format(openai.api_version))
print("OpenAI API type: {}".format(openai.api_type))

# Azure Search API
search_service_name = os.getenv("SEARCH_SERVICE_NAME")
search_service_key = os.getenv("SEARCH_SERVICE_KEY")
search_index_name = os.getenv("SEARCH_INDEX_NAME")
search_endpoint = "https://{}.search.windows.net/".format(search_service_name)
search_vector_config_name = os.getenv("SEARCH_VECTOR_CONFIG_NAME")
search_semantic_config_name = os.getenv("SEARCH_SEMANTIC_CONFIG_NAME")
print("Azure Search service name: {}".format(search_service_name))
print("Azure Search service key: {}".format(search_service_key[:5] + '...' + search_service_key[-5:]))
print("Azure Search index name: {}".format(search_index_name))
print("Azure Search endpoint: {}".format(search_endpoint))
print("Azure Search vector config name: {}".format(search_vector_config_name))
print("Azure Search semantic config name: {}".format(search_semantic_config_name))

# Other variables
data_directory = os.getenv("FILEPATH_TO_DATA")
index_schema = os.getenv("FILEPATH_TO_INDEX_SCHEMA")
print("Data directory: {}".format(data_directory))
print("Index schema: {}".format(index_schema))


OpenAI API key: 1c7f2...3613f
OpenAI API base: https://wsl-openai-canada.openai.azure.com/
OpenAI API version: 2023-03-15-preview
OpenAI API type: azure
Azure Search service name: wsl-cog-search-test-2
Azure Search service key: 9GUM9...z1hrb
Azure Search index name: cogsrch-index
Azure Search endpoint: https://wsl-cog-search-test-2.search.windows.net/
Azure Search vector config name: test-search-vector-config
Azure Search semantic config name: test-search-semantic-config
Data directory: E:/PDFs
Index schema: ./index_schema.json


In [5]:
# Instantiate a client
class CreateClient(object):
    def __init__(self, endpoint, key, index_name):
        self.endpoint = endpoint
        self.index_name = index_name
        self.key = key
        self.credentials = AzureKeyCredential(key)

    # Create a SearchClient
    # Use this to upload docs to the Index
    def create_search_client(self):
        return SearchClient(
            endpoint=self.endpoint,
            index_name=self.index_name,
            credential=self.credentials,
        )

    # Create a SearchIndexClient
    # This is used to create, manage, and delete an index
    def create_admin_client(self):
        return SearchIndexClient(endpoint=self.endpoint, credential=self.credentials)

In [8]:
# Create Search Index
def create_schema_from_json_and_upload(index_name, admin_client):
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    scoring_profiles = []
    
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
        SearchableField(name="filename", type=SearchFieldDataType.String),
        SearchableField(name="author", type=SearchFieldDataType.String),
        SearchableField(name="created_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="last_modified_date", type=SearchFieldDataType.DateTimeOffset),
        SearchableField(name="page_number", type=SearchFieldDataType.Int32),
        SearchableField(name="total_pages", type=SearchFieldDataType.Int32),
        SearchableField(name="content", type=SearchFieldDataType.String),
        SearchField(name="embeddings", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_configuration=search_vector_config_name),
    ]
    
    vector_search = VectorSearch(
        algorithm_configurations=[
            HnswVectorSearchAlgorithmConfiguration(
                name=search_vector_config_name,
                kind="hnsw",
                parameters={
                    "m": 4,
                    "efConstruction": 400,
                    "efSearch": 500,
                    "metric": "cosine"
                }
            )
        ]
    )
    
    semantic_config = SemanticConfiguration(
        name=search_semantic_config_name,
        prioritized_fields=PrioritizedFields(
            title_field=SemanticField(field_name="filename"),
            prioritized_content_fields=[SemanticField(field_name="content")]
        )
    )
    
    # Create the semantic settings with the configuration
    semantic_settings = SemanticSettings(configurations=[semantic_config])

    index = SearchIndex(
        name=index_name,
        fields=fields,
        scoring_profiles=scoring_profiles,
        semantic_settings=semantic_settings,
        vector_search=vector_search,
        cors_options=cors_options,
    )

    try:
        upload_schema = admin_client.create_index(index)
        if upload_schema:
            print("Schema uploaded successfully.")
        else:
            raise Exception("Schema upload failed.")
    except:
        raise Exception("Unexpected Error. Schema upload failed.")

In [9]:
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(5))
def generate_embeddings(text):
    try:
        logging.info("Generating embeddings for text of length %d", len(text))
        response = openai.Embedding.create(input=text, engine=embedding_model)
        embeddings = response["data"][0]['embedding']
        return embeddings
    except Exception as e:
        logging.error("Error generating embeddings: %s", str(e))
        raise
        
def convert_date_to_iso8601(date):
    try:
        date = date.strftime("%Y-%m-%dT%H:%M:%SZ")
    except:
        print("Error converting date to ISO8601 format")
        date = None
    return date

In [10]:
def convert_pdfs_and_upload_to_index(root_dir, client):
    data_list = []
    total_files = sum(len(files) for _, _, files in os.walk(root_dir))
    count = 0
    
    for dirpath, dirs, files in os.walk(root_dir):
        for filename in fnmatch.filter(files, '*.pdf'):
            count += 1
            logging.info(f"Processing {filename} {count}/{total_files}")

            print(f"Processing {filename} {count}/{total_files}")
            pdf_file = os.path.join(dirpath, filename)
            
            with open(pdf_file, 'rb') as fileobj:
                pdf = PyPDF2.PdfReader(fileobj)
                info = pdf.metadata
                author = info.author
                created_date = info.creation_date
                mod_date = info.modification_date
                num_pages = len(pdf.pages)

                # loop through each page of the PDF
                for i in range(num_pages):
                    text = pdf.pages[i].extract_text()
                    
                    try:
                        embeddings = generate_embeddings(text)
                    except:
                        logging.error(f"Error generating embeddings for {pdf_file}")

                        print(f"Error generating embeddings for {pdf_file}")
                        continue
                    
                    # create unique id for each page
                    id_str = str(count) + "_" + str(i)
                    
                    data = {
                        "id": id_str,
                        "filename": os.path.basename(pdf_file),
                        "author": author,
                        "page_number": str(i+1),
                        "total_pages": str(num_pages),
                        "content": text,
                        "embeddings": embeddings
                    }
                    
                    data["created_date"] = convert_date_to_iso8601(created_date)
                    if not data["created_date"]:
                        continue
                    
                    data["last_modified_date"] = convert_date_to_iso8601(mod_date)
                    if not data["last_modified_date"]:
                        continue

                    data_list.append(data)
                
    client.upload_documents(documents=data_list)
    print("Done!")

In [11]:
base_client = CreateClient(search_endpoint, search_service_key, search_index_name)
search_client = base_client.create_search_client()
admin_client = base_client.create_admin_client()

In [14]:
schema = create_schema_from_json_and_upload(search_index_name, admin_client)
convert_pdfs_and_upload_to_index(data_directory, search_client)
print("Upload complete")

2023-08-28 17:20:28,461 - INFO - Request URL: 'https://wsl-cog-search-test-2.search.windows.net/indexes?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'Content-Type': 'application/json'
    'Content-Length': '1946'
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'b614d4c4-45e8-11ee-beb6-c89402167d5b'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
A body is sent with the request
2023-08-28 17:20:29,573 - INFO - Response status: 201
Response headers:
    'Transfer-Encoding': 'chunked'
    'Content-Type': 'application/json; odata.metadata=minimal; odata.streaming=true; charset=utf-8'
    'ETag': '"0x8DBA80C9A5EB010"'
    'Location': 'REDACTED'
    'Server': 'Microsoft-IIS/10.0'
    'Strict-Transport-Security': 'REDACTED'
    'Preference-Applied': 'REDACTED'
    'OData-Version': 'REDACTED'
    'request-id': 'b614d4c4-45e8-11ee-beb6-c89402167d5b'
    'elap

Schema uploaded successfully.
Processing DE03234-23.pdf 1/10


NameError: name 'convert_date_to_iso8601' is not defined

In [13]:
# Clean up Azure resources
admin_client.delete_index(search_index_name)
print("Index deleted")

2023-08-28 17:20:25,386 - INFO - Request URL: 'https://wsl-cog-search-test-2.search.windows.net/indexes('cogsrch-index')?api-version=REDACTED'
Request method: 'DELETE'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'b43f9fed-45e8-11ee-a689-c89402167d5b'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
2023-08-28 17:20:25,739 - INFO - Response status: 204
Response headers:
    'Server': 'Microsoft-IIS/10.0'
    'Strict-Transport-Security': 'REDACTED'
    'Preference-Applied': 'REDACTED'
    'request-id': 'b43f9fed-45e8-11ee-a689-c89402167d5b'
    'elapsed-time': 'REDACTED'
    'Date': 'Mon, 28 Aug 2023 21:20:24 GMT'


Index deleted
