## This notebook uploads data into Azure AI Search
1. Read the CSV file into a DataFrame.
2. Split the text into chunks in the wanted format and save them in a DataFrame.
3. Read the rows of the DataFrame into a list of dictionaries.
4. Upload the list into the Azure AI Search index.

It is important to note that multiple CSV files with the same column headers and the same number of columns can be read and aggregated into one DataFrame, and a single bulk upload can then be done into Azure AI Search. However, this notebook uploads each CSV file into Azure AI Search individually.


In [10]:
#import libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import json
import pandas as pd
from dotenv import dotenv_values
from azure.search.documents.indexes import SearchIndexClient
import requests
import re


In [11]:
# Load the Azure AI Search credentials from a local .env file
# (replace the path with the location of your own credentials file).
config = dotenv_values('/Users/pelumioluwaabiola/Desktop/Transcriptions/credential.env')
ai_search_location, ai_search_key, ai_search_url = (
    config[setting]
    for setting in ('ai_search_location', 'ai_search_key', 'ai_search_url')
)

# Fixed settings: target index, search service name, and embedding vector length.
ai_search_index = 'oewg-speech-meeeting-index'
ai_search_name = 'aicpcigi'
embedding_length = 768

In [154]:
# Path of the CSV file to process (change this path to upload your own CSV file).
csv_file = '/Users/pelumioluwaabiola/Desktop/Transcriptions/csv/Meeting_7_Session_3.csv'


In [156]:
# Read the CSV at csv_file into a DataFrame and preview its first rows.
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S3M71,3,7,Chairman,Good morning. Your Excellencies Distinguished ...
1,S3M72,3,7,Russia,By Shiva. Thank you Mr. Chairman. Before movin...
2,S3M73,3,7,Chairman,I thank the distinguished representative of th...
3,S3M74,3,7,Canada,"Thank you, Mr. Chair. Good morning, everyone. ..."
4,S3M75,3,7,Chairman,I thank the distinguished representative of Ca...


In [135]:
#define the text splitter
def chucking_text(text, chunk_size=512, chunk_overlap=128):
    """Split one text into overlapping character chunks.

    Args:
        text: The text to split. Missing values (``None`` or the float NaN
            that pandas uses for an empty CSV cell) and blank strings yield
            no chunks.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of chunk strings; empty when there is nothing to split.
    """
    # An empty CSV cell is read by pandas as a float NaN, which is not text;
    # treat every non-string or blank value as "nothing to chunk".
    if not isinstance(text, str) or not text.strip():
        return []

    textsplitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return textsplitter.split_text(text)

In [157]:

# Split every row's text into chunks and build one DataFrame in which each
# chunk carries the Session, Meeting and Speaker of the row it came from.
chunk_columns = ['Id', 'Session', 'Meeting', 'Speaker', 'Text']
chunk_records = []

for id_, session, meeting, speaker, text in zip(
    df['Id'], df['Session'], df['Meeting'], df['Speaker'], df['Text']
):
    # Chunks are numbered from 1 within their source row, so the chunk Id
    # '<row Id>_<n>' is unique as long as the row Ids are unique.
    for chunk_number, chunk in enumerate(chucking_text(text), start=1):
        chunk_records.append({
            'Id': f'{id_}_{chunk_number}',
            'Session': session,
            'Meeting': meeting,
            'Speaker': speaker,
            'Text': chunk,
        })

# Build the DataFrame once from the collected records: concatenating inside
# the loop would re-copy all accumulated rows on every iteration. Passing the
# column list keeps the expected columns even when there are no chunks.
df_chunks = pd.DataFrame(chunk_records, columns=chunk_columns)

# Preview the resulting DataFrame
df_chunks.head()


Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S3M71_1,3,7,Chairman,Good morning. Your Excellencies Distinguished ...
1,S3M71_2,3,7,Chairman,"legally binding instruments, including on the ..."
2,S3M71_3,3,7,Chairman,and the secretariat will inscribe you in the l...
3,S3M72_1,3,7,Russia,By Shiva. Thank you Mr. Chairman. Before movin...
4,S3M72_2,3,7,Russia,reference to the Antarctic Treaty. Article 1 t...


In [158]:
# List every row whose Id occurs more than once; an empty result means all
# chunk Ids are unique.
id_is_duplicated = df_chunks['Id'].duplicated(keep=False)
duplicate_rows = df_chunks[id_is_duplicated]
duplicate_rows

Unnamed: 0,Id,Session,Meeting,Speaker,Text


In [159]:
#convert data to vector embeddings
# Embedding models that have already been loaded, keyed by model name, so a
# model is instantiated once instead of once per embedded text.
_EMBEDDING_MODELS = {}


def generate_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Return the embedding vector of ``text``.

    Args:
        text: The text to embed.
        model_name: Name of the HuggingFace model to embed with. The model is
            created on first use and reused by later calls with the same name.

    Returns:
        Whatever ``HuggingFaceEmbeddings.embed_query`` returns for ``text``
        (a list of floats in the output shown in this notebook).
    """
    embedding_model = _EMBEDDING_MODELS.get(model_name)
    if embedding_model is None:
        embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        _EMBEDDING_MODELS[model_name] = embedding_model
    return embedding_model.embed_query(text)


In [161]:
# Embed the text of every chunk and keep the vectors in a new
# 'TextEmbeddings' column of df_chunks.
chunk_texts = df_chunks['Text']
df_chunks['TextEmbeddings'] = chunk_texts.map(generate_embeddings)

In [162]:
# Preview the first rows of df_chunks.
df_chunks.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings
0,S3M71_1,3,7,Chairman,Good morning. Your Excellencies Distinguished ...,"[0.07009992748498917, -0.005061207339167595, 0..."
1,S3M71_2,3,7,Chairman,"legally binding instruments, including on the ...","[0.06273496150970459, -0.03469785302877426, 0...."
2,S3M71_3,3,7,Chairman,and the secretariat will inscribe you in the l...,"[0.0334402434527874, 0.04423074796795845, -0.0..."
3,S3M72_1,3,7,Russia,By Shiva. Thank you Mr. Chairman. Before movin...,"[0.07069221884012222, -0.0010275797685608268, ..."
4,S3M72_2,3,7,Russia,reference to the Antarctic Treaty. Article 1 t...,"[0.08134905248880386, -0.02174389734864235, 0...."


In [163]:
# Save df_chunks as a CSV file without the index column. Rename the file for
# each upload as s<session number>m<meeting number>.csv.
# NOTE(review): the target folder is 'session2' while the file is named for
# session 3 (s3m7) - confirm the intended directory.
df_chunks.to_csv('/Users/pelumioluwaabiola/Desktop/Transcriptions/session2/s3m7.csv', index=False)

In [164]:
# Turn every row of df_chunks into one document dictionary for the index.
# Session and Meeting are sent as strings, the chunk Id becomes the 'id'
# entry, and '@search.action' marks each document as an upload.
data = [
    {
        'id': chunk_id,
        'Session': str(session_no),
        'Meeting': str(meeting_no),
        'Speaker': speaker_name,
        'Text': chunk_text,
        'TextEmbeddings': embedding,
        '@search.action': 'upload',
    }
    for chunk_id, session_no, meeting_no, speaker_name, chunk_text, embedding in zip(
        df_chunks['Id'],
        df_chunks['Session'],
        df_chunks['Meeting'],
        df_chunks['Speaker'],
        df_chunks['Text'],
        df_chunks['TextEmbeddings'],
    )
]

In [165]:
# View the first document in data to check its fields before uploading.
data[0]

{'id': 'S3M71_1',
 'Session': '3',
 'Meeting': '7',
 'Speaker': 'Chairman',
 'Text': 'Good morning. Your Excellencies Distinguished colleagues, please take your seats. Good morning to all. I would now like to continue our work under agenda item 6C, which is to make recommendations and possible norms, rules and principles of responsible behaviours relating to threats by states of space systems, including, as appropriate, how they would contribute to the negotiation of legally binding instruments, including on the prevention of an Armstrong space. This morning  we will begin our discussion on',
 'TextEmbeddings': [0.07009992748498917,
  -0.005061207339167595,
  0.03111899271607399,
  -0.014947964809834957,
  -0.04400147870182991,
  -0.04556109383702278,
  0.011527584865689278,
  -0.017631322145462036,
  0.04492302983999252,
  0.010062308050692081,
  0.05738275498151779,
  0.0013981525553390384,
  -0.04021257162094116,
  0.0550345741212368,
  0.03337835520505905,
  -0.04739956930279732,
 

## Upload data to Azure AI Search

In [16]:
# Create an Azure AI Search index client from the configured endpoint and key.
# NOTE(review): the key is passed as a plain string; the SDK documents
# credential objects (e.g. AzureKeyCredential) for `credential` - confirm
# this client can authenticate as written.
# NOTE(review): index_client is not referenced by the upload code visible in
# this notebook, which posts to the REST API with `requests` instead.
index_client = SearchIndexClient(endpoint=ai_search_url, credential=ai_search_key)

In [166]:

def insert_into_index(documents, timeout=300):
    """Upload a list of documents to the Azure AI Search index over REST.

    Args:
        documents: JSON-serializable list of document dictionaries; it is
            sent as the ``value`` array of the request body.
        timeout: Seconds to wait for the service before ``requests`` gives
            up. Without a timeout a stalled connection blocks indefinitely.

    Returns:
        ``"Success"`` when the service answers 200 or 201, otherwise
        ``"Failure: <response body>"``.

    Connection errors and timeouts raised by ``requests`` propagate to the
    caller.
    """
    url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}/docs/index?api-version=2023-11-01"
    headers = {
        "Content-Type": "application/json",
        "api-key": ai_search_key,
    }

    response = requests.post(
        url,
        headers=headers,
        data=json.dumps({"value": documents}),
        timeout=timeout,
    )

    if response.status_code in (200, 201):
        return "Success"
    return f"Failure: {response.text}"

def make_safe_id(row_id: str):
    """Return ``row_id`` made safe for use as an Azure AI Search document ID.

    Every character other than an ASCII letter, a digit, ``_`` or ``-`` is
    replaced by ``_``; the length of the ID is unchanged.
    """
    disallowed_character = re.compile("[^0-9a-zA-Z_-]")
    return disallowed_character.sub("_", row_id)

# Send the documents to the index in slices of at most BATCH_SIZE and report
# the outcome of every slice.
BATCH_SIZE = 1000
batch_starts = range(0, len(data), BATCH_SIZE)
for start in batch_starts:
    status = insert_into_index(data[start:start + BATCH_SIZE])
    print(f'upload status: {status}')


upload status: Success
