In [1]:
from dotenv import load_dotenv,dotenv_values
import json
import os
from pathlib import Path

In [2]:
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)  
  

In [3]:
load_dotenv(Path("D:\Projects\Multi_lang hackathon\secrets.env"))  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
credential = AzureKeyCredential(key)

  load_dotenv(Path("D:\Projects\Multi_lang hackathon\secrets.env"))


# Create Vector Store

Retrievable: title, website, address, city, lati, longi
Searchable: title, primary topic, city, full_text, address
Filterable: none
sortable: none
facetable: none


In [5]:
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, 
                key=True, sortable=True, 
                filterable=True, facetable=True), 
    SearchableField(name="title", type=SearchFieldDataType.String,
                    searchable=True, retrievable= True), 
    SearchableField(name="primaryTopic", type=SearchFieldDataType.String,
                    searchable=True, retrievable= False,),
    SearchableField(name="address", type=SearchFieldDataType.String,
                    searchable=False, retrievable= True),
    SearchableField(name="city", type=SearchFieldDataType.String,
                    searchable=True, retrievable= True),
    SearchableField(name="website", type=SearchFieldDataType.String,
                    searchable=False, retrievable= True),
    SearchableField(name="latitude", type=SearchFieldDataType.String,
                    searchable=False, retrievable= True),
    SearchableField(name="longitude", type=SearchFieldDataType.String,
                    searchable=False, retrievable= True), 
    SearchableField(name="full_text", type=SearchFieldDataType.String,
                    searchable=True, retrievable= False),                                                                                    
    SearchField(name="embeddings", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, 
                vector_search_profile_name="myHnswProfile") 
]

In [6]:
# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
        )
    ]
)

In [7]:
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="title"), SemanticField(field_name="full_text"),],
        keywords_fields=[SemanticField(field_name="primaryTopic"), SemanticField(field_name="city")]
    )
)

In [8]:
index_name

'pari_orgs_index_0189'

In [9]:
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, 
                    semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 pari_orgs_index_0189 created


In [21]:
with open('socialmaps-data-exploration\docVectors_azure.json', 'r', encoding='utf-8-sig') as file:  
    documents = json.load(file)

  with open('socialmaps-data-exploration\docVectors_azure.json', 'r', encoding='utf-8-sig') as file:


In [11]:
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
 

In [23]:
for orgs in documents:
    orgs['id']= str(orgs['id'])
    orgs['latitude']= str(orgs['latitude'])
    orgs['longitude']= str(orgs['longitude'])

In [26]:
import pandas as pd
def fill_nan(dictionary):
    for key, value in dictionary.items():
        try:
            if pd.isnull(value):
                dictionary[key] = ''
        except:
            continue
    return dictionary

for doc in documents:
    doc=fill_nan(doc)

In [27]:
documents[78]

{'id': '78',
 'title': 'Ankommen in Spandau: Mobile Sozialberatung',
 'primaryTopic': 'Gehäuse',
 'address': 'mobil in ganz Spandau',
 'city': '',
 'website': 'https://giz.berlin/projects/MoS.htm',
 'latitude': '52.53800237397766',
 'longitude': '13.20488574054932',
 'full_text': 'Name der Organisation: Ankommen in Spandau: Mobile Sozialberatung\n                Primäre Domäne:  Senioren, Familien, Frauen, queer, Männer, Fluchthintergrund, Migrationshintergrund, obdachlos, spandau, Englisch, persönlich, telefonisch, Dienst, o ri gesintzu, Beratung, Gehäuse\n                Andere Domains: Gehäuse\n                Kurz: Ein Angebot für geflüchtete und zugewanderte Menschen, die von Wohnungslosigkeit betroffen sind.\n                Description: Die Mobile Sozialberatung richtet sich an geflüchtete und zugewanderte Menschen, die von Wohnungslosigkeit betroffen sind.\nDas Angebot ist unabhängig, kostenlos, vertraulich und erfolgt mobil in ganz Spandau.\n\nThemen der Beratung (u.a.):\n- Wo

In [28]:
for ind,orgs in enumerate(documents):
    try:
        result = search_client.upload_documents(orgs)
        print("uploaded", ind)
    except Exception as e:
        print(orgs)
        print(e)
        
print(f"Uploaded {len(documents)} documents")

uploaded 0
uploaded 1
uploaded 2
uploaded 3
uploaded 4
uploaded 5
uploaded 6
uploaded 7
uploaded 8
uploaded 9
uploaded 10
uploaded 11
uploaded 12
uploaded 13
uploaded 14
uploaded 15
uploaded 16
uploaded 17
uploaded 18
uploaded 19
uploaded 20
uploaded 21
uploaded 22
uploaded 23
uploaded 24
uploaded 25
uploaded 26
uploaded 27
uploaded 28
uploaded 29
uploaded 30
uploaded 31
uploaded 32
uploaded 33
uploaded 34
uploaded 35
uploaded 36
uploaded 37
uploaded 38
uploaded 39
uploaded 40
uploaded 41
uploaded 42
uploaded 43
uploaded 44
uploaded 45
uploaded 46
uploaded 47
uploaded 48
uploaded 49
uploaded 50
uploaded 51
uploaded 52
uploaded 53
uploaded 54
uploaded 55
uploaded 56
uploaded 57
uploaded 58
uploaded 59
uploaded 60
uploaded 61
uploaded 62
uploaded 63
uploaded 64
uploaded 65
uploaded 66
uploaded 67
uploaded 68
uploaded 69
uploaded 70
uploaded 71
uploaded 72
uploaded 73
uploaded 74
uploaded 75
uploaded 76
uploaded 77
uploaded 78
uploaded 79
uploaded 80
uploaded 81
uploaded 82
uploaded 83
up

# Hybrid Search

In [4]:
import openai

openai.api_type = "azure"
openai.api_key = os.getenv("embedding_key")
openai.api_base = os.getenv("embedding_url")
openai.api_version = "2023-05-15"

In [5]:

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], engine=model).data[0].embedding

In [6]:
query= "wo ich Arbeit finden kann?"
query_vector= get_embedding(query)


In [15]:
query= "Suchthilfeeinrichtungen in Berlin"
query_vector= get_embedding(query)

In [7]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
vector_query = VectorizedQuery(vector=query_vector, 
                               k_nearest_neighbors=3, 
                               fields="embeddings")

results = search_client.search(  
    search_text=query,  
    vector_queries=[vector_query],
    select=["title", "website", "full_text"],
    top=3
)  
  
for result in results:  
    print(result)

{'full_text': 'Name der Organisation: Zuverdienstwerkstatt Neukölln |\n                Primäre Domäne:  Hintergrund der Behinderung, Neukölln, barrierefreies Badezimmer, Geistige Behinderung 2, Dienst, Arbeit, o ri unisgg\n                Andere Domains: Arbeit\n                Kurz: UNIONHILFSWERK \n                Description: Die Zuverdienstwerkstatt Neukölln des UNIONHILFSWERK bietet Menschen mit psychischen Erkrankungen verschiedene Beschäftigungsmöglichkeiten. Das wirkt motivierend, wenn es - vorübergehend - nicht möglich ist, auf dem allgemeinen Arbeitsmarkt tätig zu sein. Vorerfahrungen sind in der Regel nicht erforderlich, der Zugang zur Beschäftigung ist niederschwellig.Unsere Angebote tragen dazu bei, das Selbstwertgefühl und die Ressourcen der Beschäftigten durch sinnvolle Betätigung zu stärken. Die regelmäßige Arbeit bietet ihnen eine Tagesstruktur und schafft Raum für soziale Kontakte. Verantwortung und Selbstständigkeit fördern die psychische Gesundheit und erhöhen die i

# Langchain integration of azure AI Search

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch

In [None]:
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002", chunk_size=1)
index_name: str = "langchain-vector-demo"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)