## This notebook creates a search index of the OEWG Space Threat meetings on Azure AI Search
The following steps were implemented:
1. Connected to Azure AI Search
2. Created an index named 'oewg-speech-meeeting-index'
3. Loaded the CSV file and checked that there are no duplicate values in the Id column
4. Converted the CSV data into a list of dictionaries (the dictionary keys are the index field names)
5. Uploaded the data into the index as new documents

In [2]:
import json
import re

import pandas as pd
import requests
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from dotenv import dotenv_values



In [4]:
# Identity of the search service and of the index managed by this notebook
ai_search_name = 'aicpcigi'
ai_search_index = 'oewg-speech-meeeting-index'

# Length of the embedding vectors
embedding_length = 768

# Azure AI Search credentials are read from a local .env file
# (replace the path below with the location of your own credentials file)
credentials_file = '/Users/pelumioluwaabiola/Desktop/Transcriptions/credential.env'
config = dotenv_values(credentials_file)

# Every key below must be present in the .env file; a missing key raises KeyError
ai_search_location = config['ai_search_location']
ai_search_key = config['ai_search_key']
ai_search_url = config['ai_search_url']

In [5]:
# Connect to Azure AI Search.
# The SDK expects a credential object rather than the raw key string, so the
# admin key is wrapped in AzureKeyCredential before it is handed to the client.
index_client = SearchIndexClient(
    endpoint=ai_search_url,
    credential=AzureKeyCredential(ai_search_key),
)

In [1]:
# Length of the embedding vectors stored in the index
EMBEDDING_LENGTH = 768

# Maximum time (in seconds) to wait on the search service, so that a stalled
# connection raises an error instead of hanging the cell indefinitely
REQUEST_TIMEOUT_SECONDS = 60

# Create (or update) the index through the Azure AI Search REST API.
# Note the datatypes and attributes for each field below.
url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}?api-version=2023-11-01"
payload = json.dumps(
    {
        "name": ai_search_index,
        "fields": [
            # Unique identifier for each document (the index key)
            {
                "name": "id",
                "type": "Edm.String",
                "key": True,
                "filterable": True,
                "sortable": True,
                "retrievable": True,
            },
            # Session name of the document
            {
                "name": "Session",
                "type": "Edm.String",
                "filterable": True,
                "sortable": True,
                "retrievable": True,
            },
            # Meeting number of the document
            {
                "name": "Meeting",
                "type": "Edm.String",
                "filterable": True,
                "sortable": True,
                "retrievable": True,
            },
            # Name of the speaker
            {
                "name": "Speaker",
                "type": "Edm.String",
                "filterable": True,
                "searchable": True,
                "sortable": True,
                "retrievable": True,
            },
            # Speech text
            # NOTE(review): filterable/sortable string fields may be subject to a
            # term-size limit in Azure AI Search - confirm text chunks stay below it.
            {
                "name": "Text",
                "type": "Edm.String",
                "filterable": True,
                "sortable": True,
                "searchable": True,
                "retrievable": True,
            },
            # Speaker cluster label
            {
                "name": "ClusterLabel",
                "type": "Edm.String",
                "filterable": True,
                "sortable": True,
                "searchable": True,
                "retrievable": True,
            },
            # Vector embedding of the speaker name
            {
                "name": "SpeakerEmbeddings",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "retrievable": True,
                "dimensions": EMBEDDING_LENGTH,
                "vectorSearchProfile": "vectorConfig",
            },
            # Vector embedding of the text content
            {
                "name": "TextEmbeddings",
                "type": "Collection(Edm.Single)",
                "searchable": True,
                "retrievable": True,
                "dimensions": EMBEDDING_LENGTH,
                "vectorSearchProfile": "vectorConfig",
            },
        ],
        # Both vector fields share one HNSW configuration that uses cosine similarity
        "vectorSearch": {
            "algorithms": [{"name": "hnswConfig", "kind": "hnsw", "hnswParameters": {"metric": "cosine"}}],
            "profiles": [{"name": "vectorConfig", "algorithm": "hnswConfig"}],
        },
    }
)
headers = {"Content-Type": "application/json", "api-key": ai_search_key}

# PUT is create-or-update: this cell reports 201 as "created" and 204 as "updated"
response = requests.request(
    "PUT", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT_SECONDS
)
if response.status_code == 201:
    print("Index created!")
elif response.status_code == 204:
    print("Index updated!")
else:
    print(f"HTTP request failed with status code {response.status_code}")
    print(f"HTTP response body: {response.text}")

NameError: name 'ai_search_name' is not defined

## Upload documents into AI Search Index
1. Read the CSV file saved by the ingestion notebook.
2. Convert the data frame into a list of dictionaries, one for each row.
3. Insert the data into the AI Search index. The data is uploaded in batches of 1000 documents (the maximum number of documents that can be uploaded in a single request).

In [6]:
# Load the cleaned meeting transcript (one row per text chunk) saved as CSV
meeting_csv_path = '/Users/pelumioluwaabiola/Desktop/Transcriptions/session2/CleanedMeeting1.csv'
df = pd.read_csv(meeting_csv_path)

In [7]:
# Preview the first five rows to confirm the expected columns were loaded
df.head(n=5)

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings
0,S2M11_1,2,1,Chairman,"Good morning, ladies and gentlemen.Excellencie...","[0.06073196232318878, 0.03783389925956726, 0.0..."
1,S2M11_2,2,1,Chairman,behaviors. But before we proceed with our agen...,"[0.030805662274360657, 0.019306646659970284, 0..."
2,S2M11_3,2,1,Chairman,aspects and proceedings with our substantive w...,"[0.058311160653829575, 0.03847544640302658, 0...."
3,S2M11_4,2,1,Chairman,"of responsible behaviors. During this session,...","[-0.007044042926281691, 0.07030506432056427, 0..."
4,S2M11_5,2,1,Chairman,from the outcome of this session will in large...,"[0.062239743769168854, 0.027549689635634422, 0..."


In [8]:
# Check for duplicate values in the Id column and show the offending rows
# (each row that repeats an earlier Id is listed; an empty frame means no duplicates)
is_repeated_id = df.duplicated(subset=['Id'])
duplicate = df.loc[is_repeated_id]
duplicate

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings


In [9]:
def _none_if_missing(value):
    """Return None for a missing (NaN/NA) scalar, otherwise the value unchanged.

    json.dumps would serialize a NaN as the bare token ``NaN``, which is not
    valid JSON; None is serialized as ``null`` instead.
    """
    return None if pd.isna(value) else value


# Convert every dataframe row into one upload action for the search index.
# The dictionary keys are the index field names; Session and Meeting are sent
# as strings because the index declares those fields as Edm.String.
data = [
    {
        'id': record['Id'],
        'Session': str(record['Session']),
        'Meeting': str(record['Meeting']),
        'Speaker': _none_if_missing(record['Speaker']),
        'Text': _none_if_missing(record['Text']),
        # The embeddings are stored in the CSV as a JSON array string; parse to a list
        'TextEmbeddings': json.loads(record['TextEmbeddings']),
        '@search.action': 'upload',
    }
    for record in df.to_dict('records')
]

In [10]:
# Sanity check: view the first element of data (one upload action) to confirm
# the field names and that TextEmbeddings was parsed into a list of floats
data[0]

{'id': 'S2M11_1',
 'Session': '2',
 'Meeting': '1',
 'Speaker': 'Chairman',
 'Text': 'Good morning, ladies and gentlemen.Excellencies, distinguished delegates, ladies and gentlemen.I apologize for our late start, but I hope that we can start on time the rest of the week. Kindly take your seats, please.It is my honor and pleasure to declare open the second session of the open-ended working group on reducing space threats through norms, rules, and principles of responsible behaviors. But before we proceed with our agenda, I would like to express to the governments of the United Kingdom, of',
 'TextEmbeddings': [0.06073196232318878,
  0.03783389925956726,
  0.020920174196362495,
  0.010521446354687214,
  -0.025783361867070198,
  -0.009848172776401043,
  0.02830221876502037,
  -0.0006439101416617632,
  0.03423537686467171,
  0.021092427894473076,
  0.007729066535830498,
  -0.013106061145663261,
  -0.05244646221399307,
  -0.0018999157473444939,
  0.06203974783420563,
  -0.06058746948838234,

In [11]:

def insert_into_index(documents, timeout=120):
    """Upload a list of documents to the Azure AI Search index.

    Args:
        documents: List of dictionaries, one per document. Each dictionary is
            sent as-is inside the ``value`` array of the request body, so it
            must be JSON-serializable.
        timeout: Seconds to wait on the service before the request is
            aborted. Without it a stalled connection would block forever.

    Returns:
        "Success" when the service answers with HTTP 200 or 201, otherwise
        "Failure: <response body>".

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (for example on a connection error or timeout).
    """
    url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}/docs/index?api-version=2023-11-01"

    payload = json.dumps({"value": documents})
    headers = {
        "Content-Type": "application/json",
        "api-key": ai_search_key,
    }

    response = requests.post(url, headers=headers, data=payload, timeout=timeout)

    # Any other status code is reported back to the caller together with the
    # response body so the reason for the failure is visible.
    if response.status_code in (200, 201):
        return "Success"
    return f"Failure: {response.text}"

def make_safe_id(row_id: str):
    """Return row_id with every character outside 0-9, a-z, A-Z, '_' and '-' replaced by '_'.

    Intended to turn a row id into a value usable as an Azure AI Search document ID.
    """
    disallowed_chars = re.compile("[^0-9a-zA-Z_-]")
    return disallowed_chars.sub("_", row_id)

# Upload the data in batches of 1000 documents and report the outcome of each batch
BATCH_SIZE = 1000
for batch_start in range(0, len(data), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    status = insert_into_index(data[batch_start:batch_end])
    print(f'upload status: {status}')


upload status: Success
