In [None]:
# %pip install azure-storage-blob pandas requests python-dotenv


In [13]:
import argparse
import requests
import pandas as pd
import json
from azure.storage.blob import BlobServiceClient
from io import StringIO

# Step 1: Perform a full search to get all documents in the index
def search_index(base_url, index_name, api_key, timeout=30):
    """
    Queries an Azure Cognitive Search index to retrieve all documents.

    The search API returns results one page at a time. When a response carries
    a continuation payload (`@search.nextPageParameters`), it is posted back to
    fetch the next page, until no continuation is returned.

    Args:
        base_url (str): The base URL of the Azure Cognitive Search service.
        index_name (str): The name of the search index.
        api_key (str): The API key for authenticating requests to the search service.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        dict: A dictionary where the keys are `id` and the values are `content` for each document.
        Returns an empty dictionary if any request returns a non-200 status, so that a
        partially fetched index is never mistaken for the complete one.
    """
    search_url = f"{base_url}/indexes/{index_name}/docs/search?api-version=2024-07-01"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    query = {
        "search": "*",
        "select": "id, content"
    }

    documents = {}
    next_url, next_query = search_url, query

    while next_query is not None:
        # Make the search request for the current page
        response = requests.post(next_url, headers=headers, json=next_query, timeout=timeout)

        if response.status_code != 200:
            # Handle any errors
            print(f"Error during search: {response.status_code}, {response.text}")
            return {}

        search_results = response.json()
        page_docs = search_results["value"]
        for doc in page_docs:
            documents[doc["id"]] = doc["content"]

        if not page_docs:
            # Defensive stop: an empty page cannot make progress, so never loop on it.
            break

        # The continuation payload is absent on the last page, which ends the loop.
        next_query = search_results.get("@search.nextPageParameters")
        next_url = search_results.get("@odata.nextLink") or search_url

    return documents

# Step 2: Load CSV file from Azure Blob Storage
def load_csv_from_blob(connection_string, container_name, blob_name):
    """
    Downloads and reads a CSV file from Azure Blob Storage, and converts it to a dictionary.

    The CSV must contain `id` and `content` columns. The `id` column is read as
    text so the keys keep their exact spelling (e.g. leading zeros) instead of
    being converted to numbers. Rows with an empty `id` are skipped. If an
    `id` appears more than once, the last row wins.

    Args:
        connection_string (str): Connection string to the Azure Blob Storage account.
        container_name (str): The name of the Blob Storage container.
        blob_name (str): The name of the CSV file in the container.

    Returns:
        dict: A dictionary where the keys are `id` (as strings) and the values are
        `content` for each record in the CSV.

    Raises:
        KeyError: If the CSV has no `id` or `content` column.
    """
    # Create BlobServiceClient using the connection string
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    blob_client = container_client.get_blob_client(blob_name)

    # Download the blob's content as a string; "utf-8-sig" also drops a leading BOM if present
    download_stream = blob_client.download_blob()
    csv_data = download_stream.readall().decode("utf-8-sig")

    # Read `id` as text: with dtype inference "001" would become 1 (or 1.0 when the
    # column has blanks) and would no longer match the string ids used as index keys.
    df = pd.read_csv(StringIO(csv_data), dtype={"id": str})

    # A row without an id cannot identify any document, so it is ignored.
    df = df.dropna(subset=["id"])

    # Convert CSV into a dictionary with `id` as key
    return dict(zip(df["id"], df["content"]))

# Step 3: Delete records from the index that are not in the CSV
def delete_documents_from_index(base_url, index_name, api_key, doc_ids_to_delete, index_data,
                                batch_size=1000, timeout=30):
    """
    Deletes documents from the Azure Cognitive Search index based on provided `id`s.

    The ids are sent as `delete` actions to the index's `docs/index` endpoint,
    in batches of at most `batch_size` actions per request. Nothing is sent
    when there are no ids to delete.

    Args:
        base_url (str): The base URL of the Azure Cognitive Search service.
        index_name (str): The name of the search index.
        api_key (str): The API key for authenticating requests to the search service.
        doc_ids_to_delete (list): A list of `id`s that need to be deleted from the index.
        index_data (dict): Current data in the index, with `id` as key and `content` as value.
            Only used to log a short preview of each document being deleted.
        batch_size (int): Maximum number of delete actions per request.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        None

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    delete_url = f"{base_url}/indexes/{index_name}/docs/index?api-version=2024-07-01"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }

    doc_ids = list(doc_ids_to_delete)
    if not doc_ids:
        # Avoid posting an empty batch to the service.
        print("No documents to delete.")
        return

    # Log the documents to be deleted
    print("Deleting the following documents:")
    for doc_id in doc_ids:
        content = index_data.get(doc_id)
        # Content can be missing or null in the index; never let logging abort the deletion.
        preview = "Content not available" if content is None else str(content)[:30]
        print(f"ID: {doc_id}, {preview}...")

    deleted_count = 0
    for start in range(0, len(doc_ids), batch_size):
        batch = doc_ids[start:start + batch_size]

        # Prepare the data for deletion
        delete_data = {
            "value": [{"@search.action": "delete", "id": doc_id} for doc_id in batch]
        }

        # Send the delete request to Azure Cognitive Search
        response = requests.post(delete_url, headers=headers, json=delete_data, timeout=timeout)

        if response.status_code == 200:
            deleted_count += len(batch)
        else:
            # Any other status (including a partial-success response) is reported as an error.
            print(f"Error during deletion: {response.status_code}, {response.text}")

    if deleted_count:
        print(f"Successfully deleted {deleted_count} documents.")

In [14]:
from dotenv import load_dotenv
load_dotenv()
import os

# Azure search service settings, read from environment variables (None when a variable is unset)
base_url = os.environ.get("SEARCH_SERVICE_URL")
index_name = os.environ.get("INDEX_NAME")
api_key = os.environ.get("SEARCH_API_KEY")


In [15]:
# Query the search index and keep the result as an {id: content} mapping
index_data = search_index(base_url=base_url, index_name=index_name, api_key=api_key)

In [16]:
# Show every indexed document as "<id>: <content>"
for doc_id, content in index_data.items():
    print(f"{doc_id}: {content}")

3: Gastronomic Landscape Hotel -1
4: Sublime Palace Hotel
2: Old Century Hotel -1
1: Stay-Kay City Hotel -1


In [17]:
# Blob Storage settings, read from environment variables (None when a variable is unset);
# load_dotenv() is invoked again before they are read
load_dotenv()
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
container_name = os.environ.get("AZURE_STORAGE_CONTAINER_NAME")
blob_name = os.environ.get("AZURE_STORAGE_BLOB_NAME")

In [19]:
# Download the CSV from Azure Blob Storage and keep it as an {id: content} mapping
csv_data = load_csv_from_blob(
    connection_string=connection_string,
    container_name=container_name,
    blob_name=blob_name,
)

In [20]:
# Show every CSV record as "<id>: <content>"
for record_id, content in csv_data.items():
    print(f"{record_id}: {content}")

1: Stay-Kay City Hotel -1
2: Old Century Hotel -1
3: Gastronomic Landscape Hotel -1


In [21]:
# Print the ids held by the index, then the ids listed in the CSV, for a side-by-side check
print(list(index_data))
print(list(csv_data))

['3', '4', '2', '1']
['1', '2', '3']


In [22]:

# Find ids that are in the index but not in the CSV; these are the documents to delete
missing_ids = [indexed_id for indexed_id in index_data.keys() if indexed_id not in csv_data]


print(f"Missing IDs: {missing_ids}")


Missing IDs: ['4']


In [23]:

# If the index holds documents that are absent from the CSV, delete them from the index
if missing_ids:
    delete_documents_from_index(base_url, index_name, api_key, missing_ids, index_data)
else:
    # The check above is "index ids not in the CSV", so an empty result means
    # every index record is present in the CSV (not the other way round).
    print("No documents to delete. All records in the index are present in the CSV.")

Deleting the following documents:
ID: 4, Sublime Palace Hotel...
Successfully deleted 1 documents.
