## This notebook creates a search index of the OEWG Space Threat meetings on Azure AI Search
The following steps were implemented:
1. Connected to Azure AI Search
2. Created an index named 'oewg-speech-meeeting-index'
3. Loaded the CSV file and checked that there are no duplicate values in the id column
4. Saved the CSV data into a list of dictionaries (dictionary keys are the column names)
5. Uploaded the data into the index as new items

In [1]:
import json
import re

import pandas as pd
import requests
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from dotenv import dotenv_values



In [2]:
# Load the Azure AI Search settings from the local credentials file.
config = dotenv_values('/Users/pelumioluwaabiola/Desktop/Transcriptions/credential.env')
ai_search_location, ai_search_key, ai_search_url = [
    config[setting]
    for setting in ('ai_search_location', 'ai_search_key', 'ai_search_url')
]

# Service name (used as the host prefix of the REST URLs below) and target index name.
ai_search_name = 'aicpcigi'
ai_search_index = 'oewg-speech-meeeting-index'

# Number of dimensions in each embedding vector.
embedding_length = 768

In [3]:
#connect to AI search
# SearchIndexClient takes a credential object (AzureKeyCredential or a token
# credential), not the raw key string, so the API key is wrapped before use.
index_client = SearchIndexClient(
    endpoint=ai_search_url,
    credential=AzureKeyCredential(ai_search_key),
)

In [5]:
# Length of the embedding vector
EMBEDDING_LENGTH = 768


def _edm_string_field(name, **attributes):
    """Build an Edm.String field definition.

    The attributes are emitted after "name" and "type", in the order they are passed.
    """
    return {"name": name, "type": "Edm.String", **attributes}


# Fields of the AI Search index (note the datatype of each field).
# NOTE(review): the upload cell further down sends a 'SpeakerEmbeddings' property that is
# not declared here and never sends 'ClusterLabel' — confirm this schema matches the CSV.
index_fields = [
    # Unique identifier for each document
    _edm_string_field("id", key=True, filterable=True, sortable=True, retrievable=True),
    # session name of the document
    _edm_string_field("Session", filterable=True, sortable=True, retrievable=True),
    # meeting number of the document
    _edm_string_field("Meeting", filterable=True, sortable=True, retrievable=True),
    # name of speaker
    _edm_string_field("Speaker", filterable=True, searchable=True, sortable=True, retrievable=True),
    # Speech Text
    _edm_string_field("Text", filterable=True, sortable=True, searchable=True, retrievable=True),
    # Cluster Label
    _edm_string_field("ClusterLabel", filterable=True, sortable=True, searchable=True, retrievable=True),
    # Vector embedding of the text content
    {
        "name": "TextEmbeddings",
        "type": "Collection(Edm.Single)",
        "searchable": True,
        "retrievable": True,
        "dimensions": EMBEDDING_LENGTH,
        "vectorSearchProfile": "vectorConfig",
    },
]

# HNSW vector search with cosine distance, exposed through the "vectorConfig" profile
# that the TextEmbeddings field refers to.
vector_search = {
    "algorithms": [{"name": "hnswConfig", "kind": "hnsw", "hnswParameters": {"metric": "cosine"}}],
    "profiles": [{"name": "vectorConfig", "algorithm": "hnswConfig"}],
}

url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}?api-version=2023-11-01"
headers = {"Content-Type": "application/json", "api-key": ai_search_key}
payload = json.dumps(
    {
        "name": ai_search_index,
        "fields": index_fields,
        "vectorSearch": vector_search,
    }
)

# PUT the index definition and report the outcome based on the HTTP status code.
response = requests.put(url, headers=headers, data=payload)
outcome = {201: "Index created!", 204: "Index updated!"}.get(response.status_code)
if outcome is not None:
    print(outcome)
else:
    print(f"HTTP request failed with status code {response.status_code}")
    print(f"HTTP response body: {response.text}")

Index created!


In [159]:
# Load the Meeting_5_Session_3 CSV file into a DataFrame.
csv_path = '/Users/pelumioluwaabiola/Desktop/Transcriptions/detailed_data/Meeting_5_Session_3.csv'
df = pd.read_csv(csv_path)

In [160]:
# Select the rows whose id repeats the id of an earlier row and display them;
# an empty result means every id in the file is unique.
is_repeated_id = df.duplicated(subset=['id'])
duplicate = df[is_repeated_id]
duplicate

Unnamed: 0,id,Session,Meeting,Speaker,Text,TextEmbeddings,SpeakerEmbeddings


In [172]:
# Convert every CSV row into one upload document (a dict keyed by field name).
def _text_or_none(value):
    """Return `value` as a str, or None when pandas read the cell as missing (NaN)."""
    return None if pd.isna(value) else str(value)


data = []
for _, row in df.iterrows():
    data.append({
        # The index key field is declared as Edm.String, and a numeric id read by
        # pandas is not JSON-serialisable, so the id is always sent as a string.
        'id': str(row['id']),
        'Session': str(row['Session']),
        'Meeting': str(row['Meeting']),
        # An empty cell is read as NaN, which json.dumps would write as the invalid
        # JSON token NaN; send null for missing text instead.
        'Speaker': _text_or_none(row['Speaker']),
        'Text': _text_or_none(row['Text']),
        # The embeddings are stored in the CSV as JSON text; parse them into lists.
        'TextEmbeddings': json.loads(row['TextEmbeddings']),
        # NOTE(review): the index definition in this notebook declares no
        # 'SpeakerEmbeddings' field — confirm the index has one, or drop this key.
        'SpeakerEmbeddings': json.loads(row['SpeakerEmbeddings']),
        '@search.action': 'upload',
    })

In [178]:

def insert_into_index(documents):
    """Send `documents` to the Azure AI Search index in one indexing request.

    Args:
        documents: list of JSON-serialisable dicts, posted as the "value" array
            of the request body.

    Returns:
        "Success" when the service answers with status 200 or 201, otherwise
        "Failure: " followed by the response body.
    """
    endpoint = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}/docs/index?api-version=2023-11-01"
    request_headers = {"Content-Type": "application/json", "api-key": ai_search_key}
    body = json.dumps({"value": documents})

    reply = requests.post(endpoint, headers=request_headers, data=body)

    # Guard clause: anything other than 200/201 is reported with the raw body.
    if reply.status_code not in (200, 201):
        return f"Failure: {reply.text}"
    return "Success"

def make_safe_id(row_id: str):
    """Return `row_id` with each character outside 0-9, a-z, A-Z, '_' and '-' replaced by '_'.

    Meant to produce a string usable as an Azure AI Search document ID.
    """
    disallowed = re.compile("[^0-9a-zA-Z_-]")
    return disallowed.sub("_", row_id)

# Upload the documents in consecutive slices of at most BATCH_SIZE items,
# printing the status string returned for each slice.
BATCH_SIZE = 1000
batch_starts = range(0, len(data), BATCH_SIZE)
for start in batch_starts:
    status = insert_into_index(data[start:start + BATCH_SIZE])
    print(f'upload status: {status}')


upload status: Success
