## This notebook uploads data into Azure AI Search
1. Read the CSV file into a dataframe.
2. Split the CSV text into the wanted format (chunks) and save the result in a dataframe.
3. Read the rows of the dataframe into a list of dictionaries.
4. Upload the list into the AI Search index.

Note that multiple CSV files with the same column headers (and the same number of columns) can be read and aggregated into one dataframe, so that a single bulk upload into Azure AI Search is possible. However, this notebook uploads each CSV into AI Search separately.


In [1]:
#import libraries
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import json
import pandas as pd
from dotenv import dotenv_values
from azure.search.documents.indexes import SearchIndexClient
import requests
import re


In [2]:
#import Azure AI Search credentials (replace with personal credentials)
# The values are read from a local .env file; the path below is an absolute
# path and has to be changed to point at your own credential file.
config = dotenv_values('/Users/pelumioluwaabiola/Desktop/Transcriptions/credential.env')
ai_search_location = config['ai_search_location'] 
# API key: sent as the "api-key" header by insert_into_index
ai_search_key = config['ai_search_key']
# Service endpoint: passed to SearchIndexClient
ai_search_url = config['ai_search_url']
# Index name and service name are hard-coded here (not read from the .env file);
# insert_into_index uses both to build the REST URL.
ai_search_index = 'oewg-speech-meeeting-index'
ai_search_name = 'aicpcigi'
# NOTE(review): presumably the dimension of the embedding vectors - confirm
# against the index definition and the embedding model in use.
embedding_length = 768

In [137]:
# Path of the CSV file to process (change it to your own CSV file).
# This is the file that gets read into `df` and split into chunks.
csv_file = '/Users/pelumioluwaabiola/Desktop/Transcriptions/csv/Meeting_10_Session_3.csv'


In [138]:
# Load the CSV into a dataframe; the chunking step reads the
# Id, Session, Meeting, Speaker and Text columns from it.
df = pd.read_csv(csv_file)
# Preview the first rows
df.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S3M101,3,10,Chairman,"Good afternoon, Your Excellencies. Distinguish..."
1,S3M102,3,10,United States,"Thank you, Mr. Chairman. Thank you for the opp..."
2,S3M103,3,10,Chairman,I thank the distinguished representative of th...
3,S3M104,3,10,United Kingdom,"Mr. Chair, thank you very much for giving me t..."
4,S3M105,3,10,Chairman,I thank the distinguished representative of th...


In [5]:
#define the text splitter
def chucking_text(text, chunk_size=512, chunk_overlap=128):
    """Split a text into overlapping chunks.

    Args:
        text: The text to split. Anything that is not a non-empty string
            (for example the NaN pandas uses for an empty CSV cell) yields
            an empty list instead of raising inside the splitter.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The list of chunk strings produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    # Guard against missing values: a row without text has nothing to chunk.
    if not isinstance(text, str) or not text:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.split_text(text)

In [139]:

# Column layout of the chunk-level DataFrame
chunk_columns = ['Id', 'Session', 'Meeting', 'Speaker', 'Text']

# Collect one record per chunk in a plain list and build the DataFrame once at
# the end. Concatenating inside the loop copies the accumulated frame on every
# iteration, which gets slow for long transcripts.
chunk_records = []

# Iterate through each row of the original DataFrame
for _, row in df.iterrows():
    # Chunk the text of this speech
    chunks = chucking_text(row['Text'])

    # One record per chunk: the chunk id is the row id plus a 1-based chunk
    # number, and the row's session/meeting/speaker are repeated on every chunk.
    chunk_records.extend(
        {
            'Id': f"{row['Id']}_{chunk_number}",
            'Session': row['Session'],
            'Meeting': row['Meeting'],
            'Speaker': row['Speaker'],
            'Text': chunk,
        }
        for chunk_number, chunk in enumerate(chunks, start=1)
    )

# Passing the columns explicitly keeps the layout even when there are no chunks;
# the result carries a fresh 0..n-1 index, so no reset_index is needed.
df_chunks = pd.DataFrame(chunk_records, columns=chunk_columns)

# Show the first rows of the resulting DataFrame
df_chunks.head()


Unnamed: 0,Id,Session,Meeting,Speaker,Text
0,S3M101_1,3,10,Chairman,"Good afternoon, Your Excellencies. Distinguish..."
1,S3M102_1,3,10,United States,"Thank you, Mr. Chairman. Thank you for the opp..."
2,S3M102_2,3,10,United States,"to space systems, including, as appropriate, h..."
3,S3M102_3,3,10,United States,and seeking legally binding treaties on these ...
4,S3M102_4,3,10,United States,should be in any way seen as fragmenting the U...


In [140]:
# Collect every row whose Id occurs more than once
# (keep=False flags all occurrences, not only the repeats)
duplicate_rows = df_chunks.loc[df_chunks['Id'].duplicated(keep=False)]
duplicate_rows

Unnamed: 0,Id,Session,Meeting,Speaker,Text


In [92]:
#convert data to vector embeddings
# Embedding models already constructed, keyed by model name. Constructing the
# model is the expensive part, so it is done once per model name instead of
# once per call.
_EMBEDDING_MODELS = {}


def generate_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Return the embedding vector for a text.

    Args:
        text: The text to embed.
        model_name: Name of the model handed to ``HuggingFaceEmbeddings``.

    Returns:
        The result of ``embed_query(text)`` on the cached model.
    """
    embedding_model = _EMBEDDING_MODELS.get(model_name)
    if embedding_model is None:
        # First use of this model name: build the model and remember it
        embedding_model = HuggingFaceEmbeddings(model_name=model_name)
        _EMBEDDING_MODELS[model_name] = embedding_model
    return embedding_model.embed_query(text)


In [141]:
# Create a new 'TextEmbeddings' column in df_chunks holding the embedding of each
# chunk's text; generate_embeddings is called once per row.
df_chunks['TextEmbeddings'] = df_chunks['Text'].apply(generate_embeddings)

In [142]:
df_chunks.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings
0,S3M101_1,3,10,Chairman,"Good afternoon, Your Excellencies. Distinguish...","[0.006036779377609491, 0.06959816068410873, 0...."
1,S3M102_1,3,10,United States,"Thank you, Mr. Chairman. Thank you for the opp...","[0.07456157356500626, -0.012868503108620644, 0..."
2,S3M102_2,3,10,United States,"to space systems, including, as appropriate, h...","[0.09709066152572632, -0.0029819004703313112, ..."
3,S3M102_3,3,10,United States,and seeking legally binding treaties on these ...,"[0.06501833349466324, 0.06349848955869675, 0.0..."
4,S3M102_4,3,10,United States,should be in any way seen as fragmenting the U...,"[0.0529852956533432, 0.021406706422567368, 0.0..."


In [143]:
# Save df_chunks as a CSV file. Change the file name to follow the pattern
# s<session number>m<meeting number>.csv
# NOTE(review): the directory is named "session2" while the file is "s3m10" -
# confirm this is the intended destination.
df_chunks.to_csv('/Users/pelumioluwaabiola/Desktop/Transcriptions/session2/s3m10.csv', index=False)

In [144]:
# Load the CSV that will be uploaded. This rebinds `df` (the dataframe read from
# csv_file earlier is no longer reachable under that name) and reads a different
# file from the one written by the to_csv step. The upload step below expects the
# columns Id, Session, Meeting, Speaker, Text, TextEmbeddings and cluster_label.
df = pd.read_csv('/Users/pelumioluwaabiola/Downloads/un_oewg_meeting_sentiment_analysis (1).csv')
df.head()

Unnamed: 0,Id,Session,Meeting,Speaker,Text,TextEmbeddings,cluster_label
0,S2M31_1,2,3,Chairman,"Dear colleagues, Excellencies, please take you...","[0.06515908241271973, -0.009579229168593884, 0...",not assigned
1,S2M31_2,2,3,Chairman,"we will focus. On discussions on topic 2, Curr...","[0.0666637122631073, 0.03770853951573372, 0.02...",not assigned
2,S2M31_3,2,3,Chairman,of the National Academy of Political and Strat...,"[-0.0026448278222233057, 0.117243193089962, 0....",not assigned
3,S2M32_1,2,3,The Secure World Foundation,"Thank you, Mr. Chair. My name is Victoria Samp...","[0.050753768533468246, 0.08790837973356247, 0....",0.0
4,S2M32_2,2,3,The Secure World Foundation,to develop and promote international cooperati...,"[0.07907143235206604, -0.009904933162033558, 0...",3.0


In [152]:
# Build one upload document (a dict) per dataframe row
data = [
    {
        'id': record['Id'],
        # session and meeting numbers are sent as strings
        'Session': str(record['Session']),
        'Meeting': str(record['Meeting']),
        'Speaker': record['Speaker'],
        'Text': record['Text'],
        'ClusterLabel': record['cluster_label'],
        # the embeddings come out of the CSV as text; parse them back into a list
        'TextEmbeddings': json.loads(record['TextEmbeddings']),
        '@search.action': 'upload',
    }
    for _, record in df.iterrows()
]

In [153]:
#view first element of data
data[0]

{'id': 'S2M31_1',
 'Session': '2',
 'Meeting': '3',
 'Speaker': 'Chairman',
 'Text': 'Dear colleagues, Excellencies, please take your seats. Good morning. We will now continue with Agenda Item 6B entitled Reducing Space Threats Through Norms, Rules, and Principles of Responsible Behaviors. To consider current and future threats by states to space systems and actions, activities and omissions that could be considered irresponsible. As for the indicative timetable, today we will focus. On discussions on topic 2, Current and future Earth to space Threats by States to space systems. We will now',
 'ClusterLabel': 'not assigned',
 'TextEmbeddings': [0.06515908241271973,
  -0.009579229168593884,
  0.014277279376983643,
  -0.006449003703892231,
  -0.04850471392273903,
  -0.033230796456336975,
  0.03900676220655441,
  -0.022101029753684998,
  0.05295374616980553,
  0.02985985390841961,
  0.03417133912444115,
  -0.005891353357583284,
  -0.03349222242832184,
  0.06548155844211578,
  0.0421979799

## upload data to Azure AI Search

In [150]:
#connect to AI search 
# NOTE(review): the raw key string is passed as `credential`; the Azure SDK
# presumably expects a credential object (e.g. AzureKeyCredential) - confirm.
# NOTE(review): index_client is not used by the upload code in the visible
# cells (insert_into_index calls the REST endpoint via requests).
index_client = SearchIndexClient(endpoint=ai_search_url, credential=ai_search_key)

In [154]:

def insert_into_index(documents, timeout=(10, 120)):
    """Uploads a list of 'documents' to Azure AI Search index.

    Args:
        documents: List of JSON-serialisable dicts; sent as the "value" array
            of the request body.
        timeout: Timeout handed to ``requests`` as (connect, read) seconds, so
            a stalled connection cannot block the upload forever.

    Returns:
        "Success" when the service answers 200 or 201, otherwise a string
        starting with "Failure: " followed by the response body or, when the
        request itself failed, the error message.
    """

    url = f"https://{ai_search_name}.search.windows.net/indexes/{ai_search_index}/docs/index?api-version=2023-11-01"

    payload = json.dumps({"value": documents})
    headers = {
        "Content-Type": "application/json",
        "api-key": ai_search_key,
    }

    try:
        response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report network problems the same way as HTTP errors so the caller's
        # batch loop keeps going and prints a status for every batch.
        return f"Failure: {exc}"

    if response.status_code in (200, 201):
        return "Success"
    return f"Failure: {response.text}"

# Anything other than an ASCII letter, digit, underscore or dash
_UNSAFE_ID_CHARS = re.compile("[^0-9a-zA-Z_-]")


def make_safe_id(row_id: str):
    """Return row_id with every disallowed character replaced by an underscore.

    Only ASCII letters, digits, "_" and "-" are kept, so the result can be
    used as an Azure AI search document ID.
    """
    return _UNSAFE_ID_CHARS.sub("_", row_id)

#upload data in batches of 1000
BATCH_SIZE = 1000
for i in range(0, len(data), BATCH_SIZE):
    # Sanitise each document key with make_safe_id before sending. Shallow
    # copies are uploaded so the dictionaries in `data` stay untouched.
    # Keys that contain only letters, digits, "_" and "-" are unchanged.
    batch = [
        {**document, 'id': make_safe_id(str(document['id']))}
        for document in data[i:i+BATCH_SIZE]
    ]
    status = insert_into_index(batch)
    print(f'upload status: {status}')


upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
upload status: Success
