## This notebook uploads data into Azure AI Search
1. Read the CSV file into a DataFrame.
2. Split the CSV text into the wanted format and save it in a DataFrame.
3. Read the rows of the DataFrame into a list of dictionaries.
4. Upload the list into the AI Search index.

It is important to note that multiple CSV files with the same column headers and column count can be read and aggregated into one DataFrame, so that a single bulk upload can be done into Azure AI Search. However, this notebook uploads each CSV file into AI Search separately.


In [10]:
#import libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import json
import pandas as pd
from dotenv import dotenv_values
from azure.search.documents.indexes import SearchIndexClient
import requests
import re


In [11]:
# Load the Azure AI Search credentials from a .env file (point this at your own credentials file)
config = dotenv_values('/Users/pelumioluwaabiola/Desktop/Transcriptions/credential.env')
ai_search_location, ai_search_key, ai_search_url = (
    config[setting]
    for setting in ('ai_search_location', 'ai_search_key', 'ai_search_url')
)
# Target index name, search service name, and embedding length
ai_search_index, ai_search_name = 'oewg-speech-meeeting-index', 'aicpcigi'
embedding_length = 768

In [1]:
# Path of the CSV file to upload (change this path to upload your own CSV file)
csv_file = '/Users/pelumioluwaabiola/Desktop/Transcriptions/session2/Meeting2.csv'


In [4]:
# Load the transcript CSV into a DataFrame and preview its first rows
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S2M21,2,2,Chairman,"Ladies and gentlemen. Distinct colleagues, ple..."
1,S2M22,2,2,Unidir,"Thank you, Chair. Mr. Chair X-17 Distinguished..."
2,S2M23,2,2,Chairman,I thank Missus Oscar at Ortega. For her presen...
3,S2M24,2,2,Cambodia,"So much, Mr. Chair, I have the honor to delive..."
4,S2M25,2,2,Chairman,I thank the distinguished Ambassador of Cambod...


In [5]:
#define the text splitter
def chucking_text(text, chunk_size=512, chunk_overlap=128):
    """Split ``text`` into overlapping chunks.

    Args:
        text: The string to split. ``None``, float NaN (how pandas represents
            an empty CSV cell) and the empty string are treated as "no text".
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        A list of chunk strings; an empty list when there is no text.
    """
    # NaN is the only value that is not equal to itself, so this detects a
    # missing cell without needing an extra import.
    is_missing = text is None or (isinstance(text, float) and text != text)
    if is_missing or text == "":
        return []

    textsplitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return textsplitter.split_text(text)

In [6]:

# Expand every speech in `df` into one row per text chunk.
# Rows are collected in a plain list and the DataFrame is built once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, and concatenating onto an empty frame is deprecated in recent pandas.
chunk_columns = ['Id', 'Session', 'Meeting', 'Speaker', 'Text']
chunk_rows = []

for _, row in df.iterrows():
    id_ = row['Id']

    # Chunk the text of the current speech
    chunks = chucking_text(row['Text'])

    # The chunk number (starting at 1) is appended to the speech Id, so every
    # chunk Id is unique as long as the speech Ids are unique.
    for chunk_number, chunk in enumerate(chunks, start=1):
        chunk_rows.append({
            'Id': f'{id_}_{chunk_number}',
            'Session': row['Session'],
            'Meeting': row['Meeting'],
            'Speaker': row['Speaker'],
            'Text': chunk,
        })

# Passing `columns` keeps the expected headers even when no chunk was produced
df_chunks = pd.DataFrame(chunk_rows, columns=chunk_columns)

# Preview the resulting DataFrame
df_chunks.head()


Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S2M21_1,2,2,Chairman,"Ladies and gentlemen. Distinct colleagues, ple..."
1,S2M21_2,2,2,Chairman,second session of the OE WG. The advanced vers...
2,S2M22_1,2,2,Unidir,"Thank you, Chair. Mr. Chair X-17 Distinguished..."
3,S2M22_2,2,2,Unidir,today on behalf of Unity. The objective of thi...
4,S2M22_3,2,2,Unidir,"could be considered irresponsible, as outlined..."


In [7]:
# List every row whose Id occurs more than once (all occurrences are kept)
has_repeated_id = df_chunks['Id'].duplicated(keep=False)
duplicate_rows = df_chunks.loc[has_repeated_id]
duplicate_rows

Unnamed: 0,Id,Session,Meeting,Speaker,Text


In [8]:
#convert data to vector embeddings
# The embedding model is created on first use and then reused; constructing it
# inside every call would reload the model once per embedded text.
_EMBEDDING_MODEL = None


def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Uses the "sentence-transformers/all-mpnet-base-v2" model through
    ``HuggingFaceEmbeddings.embed_query``.

    Args:
        text: The text to embed.

    Returns:
        The embedding produced by ``embed_query`` for ``text``.
    """
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        _EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    return _EMBEDDING_MODEL.embed_query(text)


In [9]:
# Create a new 'TextEmbeddings' column in df_chunks holding the embedding of each row's text
df_chunks['TextEmbeddings'] = df_chunks['Text'].apply(generate_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
df_chunks.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings
0,S2M21_1,2,2,Chairman,"Ladies and gentlemen. Distinct colleagues, ple...","[0.038322482258081436, 0.028815878555178642, 0..."
1,S2M21_2,2,2,Chairman,second session of the OE WG. The advanced vers...,"[-0.057576630264520645, 0.0005984915187582374,..."
2,S2M22_1,2,2,Unidir,"Thank you, Chair. Mr. Chair X-17 Distinguished...","[0.044849034398794174, 0.024158556014299393, 0..."
3,S2M22_2,2,2,Unidir,today on behalf of Unity. The objective of thi...,"[0.06019016355276108, 0.02128327265381813, 0.0..."
4,S2M22_3,2,2,Unidir,"could be considered irresponsible, as outlined...","[0.04841930419206619, -0.015951255336403847, 0..."


In [14]:
# Turn every chunk row into one upload document (a plain dictionary).
# Session and Meeting are sent as strings; the embeddings are passed through unchanged.
data = [
    {
        'id': chunk_row['Id'],
        'Session': str(chunk_row['Session']),
        'Meeting': str(chunk_row['Meeting']),
        'Speaker': chunk_row['Speaker'],
        'Text': chunk_row['Text'],
        'TextEmbeddings': chunk_row['TextEmbeddings'],
        '@search.action': 'upload',
    }
    for _, chunk_row in df_chunks.iterrows()
]

In [15]:
# View the first element of data to check the document layout before uploading
data[0]

{'id': 'S2M21_1',
 'Session': '2',
 'Meeting': '2',
 'Speaker': 'Chairman',
 'Text': 'Ladies and gentlemen. Distinct colleagues, please take your seats. Excellencies, distinguished delegates, ladies and gentlemen. In order to set the scene ahead of our general exchange this afternoon, I would now like to invite. Mrs. MU Venus Karate Ortega of the United Nations Institute for Disarmament Research. Unitir. To present the working paper submitted by Union Deer to the second session of the OE WG. The advanced version of this working paper which bears the symbol a slash AC294/2022 slash Working',
 'TextEmbeddings': [0.038322482258081436,
  0.028815878555178642,
  0.038636110723018646,
  -0.021824078634381294,
  -0.0569051131606102,
  -0.02878367155790329,
  0.0574360154569149,
  0.012233627960085869,
  0.006107884459197521,
  -0.004917769227176905,
  0.018602050840854645,
  0.07317283004522324,
  0.013165030628442764,
  0.006912741810083389,
  0.046217139810323715,
  -0.02863839454948902,
  

## upload data to Azure AI Search

In [16]:
# Connect to Azure AI Search through the SDK client.
# NOTE(review): the azure-search-documents reference describes `credential` as an
# AzureKeyCredential or TokenCredential, while a plain key string is passed here —
# confirm before relying on index_client. The upload further down goes through the
# REST API with `requests` and does not use this client.
index_client = SearchIndexClient(endpoint=ai_search_url, credential=ai_search_key)

In [17]:

def insert_into_index(documents, timeout=60):
    """Upload a list of documents to the Azure AI Search index over REST.

    Args:
        documents: List of JSON-serialisable document dictionaries; they are
            sent as the "value" array of the request body.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        "Success" when the service answers with HTTP 200 or 201, otherwise
        "Failure: <details>" carrying the response body or the network error.
    """
    url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}/docs/index?api-version=2023-11-01"

    payload = json.dumps({"value": documents})
    headers = {
        "Content-Type": "application/json",
        "api-key": ai_search_key,
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Report network problems in the same status-string form as HTTP errors,
        # so a batch loop can print the failure and move on to the next batch.
        return f"Failure: {exc}"

    if response.status_code in (200, 201):
        return "Success"
    return f"Failure: {response.text}"

def make_safe_id(row_id: str):
    """Return ``row_id`` with every character outside 0-9, a-z, A-Z, '_' and '-' replaced by '_'.

    The result is meant to be used as an Azure AI Search document ID.
    """
    disallowed_character = re.compile("[^0-9a-zA-Z_-]")
    return disallowed_character.sub("_", row_id)

# Send the documents to the index in consecutive slices of at most BATCH_SIZE items
BATCH_SIZE = 1000
batch_start = 0
while batch_start < len(data):
    status = insert_into_index(data[batch_start:batch_start + BATCH_SIZE])
    print(f'upload status: {status}')
    batch_start += BATCH_SIZE


upload status: Success
