# Preparation

## Set user name identifier

To separate your collections from those of other users, you have to select a unique identifier. It will be appended to your collection names.
A good identifier could be the three-letter acronym of your name followed by the day of your birthday, for example **wjh17**. Replace `<your identifier>` with your chosen identifier and execute that cell.

In [None]:
# Unique suffix that is appended to this user's collection name; the placeholder must be replaced before continuing
user_identifier = "<your identifier>"

## Project token

Select the (empty) cell below and insert the project token (3-dot menu above). Execute that cell afterwards.


## Milvus connection

Copy the Milvus connectivity info (JSON format) and replace `<connection info in JSON format>` with the copied info in the next cell. Execute that cell afterwards.

In [None]:
# Template line for pasting your own connection info:
#connection=<connection info in JSON format>

# The connection information has already been provided. Please don't change the line below and execute the cell.
# "properties" -> "connection" is a list of {"name", "value"} entries; host and port are read from it in a later cell.
connection={"connection_name":"Milvus","properties":{"connection":[{"name":"host","value":"30f4a688-d22e-46ff-a1ff-b866c3e4d9d7.cvbhm81d0dmnvl5rjek0.lakehouse.appdomain.cloud"},{"name":"port","value":"30355"},{"name":"database","value":"default"},{"name":"service_id","value":"milvus40"},{"name":"ssl","value":"true"},{"name":"ssl_certificate","value":""},{"name":"engine_type","value":"milvus"}]}}

## API key

Create an API key in IAM, copy it and replace `<api_key>` with the copied API key in the cell below. Execute the cell below afterwards.

In [None]:
# IBM Cloud API key; used for the watsonx.ai credentials and as the Milvus password
api_key = "<api_key>"

# watsonx.ai endpoint
ibm_cloud_url = "https://us-south.ml.cloud.ibm.com"

# Look up host and port by property name instead of by list position, so that
# connection info whose properties are ordered differently still works.
_connection_properties = {
    entry["name"]: entry["value"]
    for entry in connection["properties"]["connection"]
}
host = _connection_properties["host"]
port = _connection_properties["port"]

# Derive the per-user collection name; it stays empty while the placeholder identifier is still in place
if user_identifier == "<your identifier>":
    print("please go back and set your user_identifier to continue")
    collection_name = ""
else:
    collection_name = "wiki_articles" + "_" + user_identifier
    print(f"you will use collection {collection_name}")

# Prepare content 

## Read data from Wikipedia

In [None]:
# Install the `wikipedia` package; `tail -n 1` keeps only the last line of pip's output
!pip install wikipedia | tail -n 1

In [None]:
import wikipedia

# Wikipedia articles to download. 'rights' is the access-rights label that is
# later stored with every chunk of the article.
articles = [
    {'title': 'Climate', 'pageid': 5999, 'rights': 'climate'},
    {'title': 'Climate change', 'pageid': 5042951, 'rights': 'climate'},
    {'title': 'Climate change mitigation', 'pageid': 2119179, 'rights': 'climate'},
    {'title': 'Climate change adaptation', 'pageid': 4607152, 'rights': 'climate'},
    {'title': 'Economics', 'pageid': 9223, 'rights': 'economics'},
    {'title': 'Keynesian economics', 'pageid': 17326, 'rights': 'economics'},
    {'title': 'Behavioral economics', 'pageid': 177698, 'rights': 'economics'},
    {'title': 'Agricultural economics', 'pageid': 733141, 'rights': 'economics'},
    {'title': 'Labour economics', 'pageid': 18178, 'rights': 'economics'},
]

# Download the text of each article and attach it to its entry as 'content'
for article in articles:
    title, pageid = article['title'], article['pageid']

    # Use the page id when one is given; otherwise fall back to a lookup by title
    wiki_entry = wikipedia.page(pageid=pageid) if pageid else wikipedia.page(title)

    article['content'] = wiki_entry.content
    print(f"Successfully fetched {title}")

print(f"Successfully fetched {len(articles)} articles ")

## Split data into chunks

In [None]:
def split_into_chunks(text, chunk_size):
    """Split *text* into pieces of at most *chunk_size* whitespace-separated words.

    Words inside each piece are re-joined with single spaces, so the original
    whitespace (newlines, repeated blanks) is not preserved.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces
    
def chunk_articles(articles, chunk_size):
    """Split every article into word chunks and return them as flat records.

    Each record is a dict with the keys 'id' (running number starting at 1
    across all articles), 'chunk' (escaped chunk text), 'title' and 'rights'
    (both copied from the article the chunk came from).
    """
    records = []

    for entry in articles:
        heading = entry['title']
        body = entry['content']
        access = entry['rights']

        for piece in split_into_chunks(body, chunk_size):
            records.append({
                'id': len(records) + 1,
                # Doubles single quotes and percent signs in the stored text.
                # NOTE(review): this looks like SQL-literal escaping - confirm it is still wanted.
                'chunk': piece.replace("'", "''").replace("%", "%%"),
                'title': heading,
                'rights': access,
            })

    return records
    
# Split all fetched articles into chunks of at most 255 words
chunks = chunk_articles(articles, 255)

print(f"Created {len(chunks)} chunks. Showing some examples:")
# Slice rather than index so that fewer than three chunks does not raise an IndexError
for i, example in enumerate(chunks[:3]):
    print(f"\nChunk #{i}:")
    print(f"  title: {example['title']}")
    print(f"  chunk: {example['chunk'][0:80]}...")
    print(f"  rights: {example['rights']}")

## Load vector embeddings from watsonx.ai

In [None]:
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import Embeddings
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes

# Embedding request parameters.
# TRUNCATE_INPUT_TOKENS caps how many tokens of each input text are embedded.
# A tiny value (such as 3) would make every chunk vector depend on its first
# few tokens only, so the cap is set to 512.
# NOTE(review): 512 is assumed to be the input limit of the slate-30m model - confirm.
embed_params = {
    EmbedParams.TRUNCATE_INPUT_TOKENS: 512,
    EmbedParams.RETURN_OPTIONS: {
        'input_text': True,
    },
}

# Embedding client used both for the article chunks and for the query texts
embedding = Embeddings(
    model_id=EmbeddingTypes.IBM_SLATE_30M_ENG,
    params=embed_params,
    credentials=Credentials(
        api_key=api_key,
        url=ibm_cloud_url,
    ),
    project_id=project.project_context.projectID,
)

In [None]:
# Split the chunk records into parallel columns: text, title and access rights
article_chunks, article_titles, article_rights = [], [], []
for item in chunks:
    article_chunks.append(item['chunk'])
    article_titles.append(item['title'])
    article_rights.append(item['rights'])

# Create the vector embeddings for all chunk texts in one call
article_vectors = embedding.embed_documents(texts=article_chunks)

# Connect to Milvus

In [None]:
from pymilvus import(
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema,
    utility
)

# Register the 'default' pymilvus connection (secure=True); the API key is passed as the password
connections.connect(
                alias = 'default',
                host = host,
                port = port,
                user = "ibmlhapikey",
                password = api_key,
                secure = True)

# First variant to implement access rights – filter after query

## Define collection with fields and schema

In [None]:
# Collection layout: auto-generated primary key, three text columns and the embedding vector

fields = [
    # primary key (auto_id, so it is not supplied on insert)
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True,
    ),
    # a chunk of an article text
    FieldSchema(name="article_text", dtype=DataType.VARCHAR, max_length=8000),
    # the title of the article the chunk originates from
    FieldSchema(name="article_title", dtype=DataType.VARCHAR, max_length=200),
    # the rights attached to the article
    FieldSchema(name="article_rights", dtype=DataType.VARCHAR, max_length=20),
    # the embedding vector
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
]

schema = CollectionSchema(fields, "wikipedia article collection schema")

wiki_collection = Collection(collection_name, schema)

# Vector index: IVF_FLAT with L2 distance and 10 clusters

index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 10},
}

wiki_collection.create_index(field_name="vector", index_params=index_params)

In [None]:
# List collections available in Milvus
print("Collections found in Milvus database: ", utility.list_collections())

# Get number of entities in collection. Should be 0 as we have not loaded any data
# (presumably only true if the collection did not already exist from an earlier run)
print("Number of entities in Collection: ", wiki_collection.num_entities)

## Store data in Milvus

In [None]:
# Prepare data suitable for loading

# Column-oriented payload: one list per schema field in schema order
# (article_text, article_title, article_rights, vector). The "id" field is
# not supplied because the schema declares it with auto_id=True.
data = [
    article_chunks,
    article_titles,
    article_rights,
    article_vectors
]

# Insert data into collection

wiki_collection.insert(data)
wiki_collection.flush()  # Ensures data persistence

# Load collection to allow queries

wiki_collection.load()

# Get number of entities in collection. Now we should see entities

print("Number of entities in Collection: ", wiki_collection.num_entities)

## Create functions to query milvus

In [None]:
# Query function for Milvus using just similarity search

def query_milvus(query, num_results=3):
    """Run a pure similarity search for *query* against ``wiki_collection``.

    Args:
        query: Question text; it is embedded with the global ``embedding`` client.
        num_results: Maximum number of hits to return.

    Returns:
        The search result of ``wiki_collection.search``; each hit carries the
        'article_text' and 'article_rights' output fields.
    """
    # Embed the question (a one-element batch) with the same client used for the chunks
    query_vectors = embedding.embed_documents(texts=[query])

    return wiki_collection.search(
        data=query_vectors,
        anns_field="vector",
        param={"metric_type": "L2", "params": {"nprobe": 5}},
        limit=num_results,
        output_fields=['article_text', 'article_rights'],
    )

In [None]:
# Function to extract relevant chunks from search results
# use verbose=True to see details for the chunks

def get_relevant_chunks(results, verbose=False):
    """Extract the chunk texts from the hits of the first query in *results*.

    Args:
        results: Search result as returned by the query functions; only the
            hit list of the first query (``results[0]``) is read.
        verbose: When true, print id, distance, a 70-character preview and the
            access rights of every hit.

    Returns:
        List of 'article_text' values in hit order; an empty list when
        *results* itself is empty.
    """
    # An empty result has no first hit list to index into
    if not results:
        return []

    relevant_chunks = []

    for i, res in enumerate(results[0]):
        text = res.entity.get('article_text')
        rights = res.entity.get('article_rights')
        relevant_chunks.append(text)
        if verbose:
            print(f"Chunk #{i}:")
            print(f"  id: {res.id}")
            print(f"  distance: {res.distance}")
            print(f"  chunk: {text[:70]}...")
            print(f"  rights: {rights}\n")

    return relevant_chunks

## Run a query against Milvus

In [None]:
# set the number of chunks wanted in the answer
num_results = 3

# Query against Milvus
question_text = 'what are ways to cope with climate change?'

# Execute query and receive chunks with details
results = query_milvus(question_text, num_results)
# NOTE(review): this rebinds the global `chunks` (until now the list of chunk dicts built by
# chunk_articles) to a list of plain strings; re-running the embedding cell afterwards would fail.
chunks = get_relevant_chunks(results, True)

In [None]:
# Query function for Milvus using similarity search and offset to skip results

def query_milvus(query, num_results=3, num_offset=0):
    """Run a similarity search for *query* and skip the first *num_offset* hits.

    Args:
        query: Question text; it is embedded with the global ``embedding`` client.
        num_results: Maximum number of hits to return.
        num_offset: Number of best-ranked hits to skip before collecting results.

    Returns:
        The search result of ``wiki_collection.search``; each hit carries the
        'article_text' and 'article_rights' output fields.
    """
    # Embed the question (a one-element batch) with the same client used for the chunks
    query_vectors = embedding.embed_documents(texts=[query])

    return wiki_collection.search(
        data=query_vectors,
        anns_field="vector",
        param={"metric_type": "L2", "params": {"nprobe": 5}},
        limit=num_results,
        offset=num_offset,  # <- skipping results
        output_fields=['article_text', 'article_rights'],
    )

In [None]:
# set the number of chunks wanted in the answer
num_results = 3

# Query against Milvus
question_text = 'what are ways to cope with climate change?'

# Execute query and receive chunks with details
# Three consecutive pages of num_results hits each: offsets 0, num_results and 2*num_results
for num_offset in range(0, 3*num_results, num_results):
    print(f"Query Milvus with offset {num_offset}")
    print( "==========================\n")
    results = query_milvus(question_text, num_results, num_offset)
    # NOTE(review): rebinds the global `chunks` to the texts of the current page
    chunks = get_relevant_chunks(results, True)

# Second variant to implement access rights - filter access rights in Milvus query

## Create an index for article_rights and load for efficient filtering

In [None]:

# Create scalar index on article_rights
# NOTE: this rebinds the global `index_params` that previously held the vector index settings
index_params = {
    "index_type": "INVERTED", # or "AUTOINDEX"
    "params": {}
}
wiki_collection.create_index(field_name="article_rights", index_params=index_params)

# Load collection with filtering enabled. With this setting, Milvus combines the scalar index with the vector index
# NOTE(review): the effect of the `_filterable` keyword cannot be confirmed from this notebook - verify against the pymilvus version in use
wiki_collection.load(_filterable=True)

## Create a query function with filter

In [None]:
# Query function for Milvus using similarity search in combination with scalar search (access rights)

def query_milvus_filter(query, num_results=3, user_access_rigths=None):
    """Run a similarity search restricted to chunks the user may access.

    Args:
        query: Question text; it is embedded with the global ``embedding`` client.
        num_results: Maximum number of hits to return.
        user_access_rigths: Iterable of rights labels (parameter name kept as
            spelled for callers). When empty or None, no filter is applied.

    Returns:
        The search result of ``wiki_collection.search``; each hit carries the
        'article_text' and 'article_rights' output fields.
    """
    # Embed the question (a one-element batch) with the same client used for the chunks
    query_vectors = embedding.embed_documents(texts=[query])

    # One equality clause per granted right, OR-ed together; no rights means an empty expression
    clauses = [
        f'article_rights=="{right}"'
        for right in (user_access_rigths or [])
    ]
    rights_filter = " or ".join(clauses)

    return wiki_collection.search(
        data=query_vectors,
        anns_field="vector",
        param={"metric_type": "L2", "params": {"nprobe": 5}},
        limit=num_results,
        output_fields=['article_text', 'article_rights'],
        expr=rights_filter,  # <- scalar search expression
    )

## Run a query with given access rights against Milvus

In [None]:
# set the number of chunks wanted in the answer
num_results = 3

# Access rights of user
user_access_rights = ['climate']

# Query against Milvus
question_text = 'what are ways to cope with climate change?'

# Execute query and receive chunks with details
results = query_milvus_filter(question_text, num_results, user_access_rights)
# NOTE(review): rebinds the global `chunks` to the list of returned chunk texts
chunks = get_relevant_chunks(results, True)

# Third variant to implement access rights – partition the collection

## Create a new collection with partitions

In [None]:
# The partitioned variant lives in its own collection, named after the first one
collection_name_partitioned = collection_name + '_partitioned'

# Collection layout: auto-generated primary key, three text columns and the embedding vector

fields = [
    # primary key (auto_id, so it is not supplied on insert)
    FieldSchema(
        name="id",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=True,
    ),
    # a chunk of an article text
    FieldSchema(name="article_text", dtype=DataType.VARCHAR, max_length=8000),
    # the title of the article the chunk originates from
    FieldSchema(name="article_title", dtype=DataType.VARCHAR, max_length=200),
    # the rights attached to the article
    FieldSchema(name="article_rights", dtype=DataType.VARCHAR, max_length=20),
    # the embedding vector
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
]

schema = CollectionSchema(fields, "wikipedia article collection schema")

wiki_collection_partitioned = Collection(collection_name_partitioned, schema)

# Vector index: IVF_FLAT with L2 distance and 10 clusters

index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 10},
}

wiki_collection_partitioned.create_index(field_name="vector", index_params=index_params)

# One partition per access-rights label; `out` only captures the return value

out = wiki_collection_partitioned.create_partition("climate")
out = wiki_collection_partitioned.create_partition("economics")

## Store data in Milvus partitions 

In [None]:
# Prepare data suitable for loading in partitions

# Each column is filtered by pairing it with article_rights and keeping the
# entries whose rights label matches the target partition.
article_chunks_climate = [x for x, y in zip(article_chunks, article_rights) if y == 'climate']
article_titles_climate = [x for x, y in zip(article_titles, article_rights) if y == 'climate']
article_rights_climate = [x for x, y in zip(article_rights, article_rights) if y == 'climate']
article_vectors_climate = [x for x, y in zip(article_vectors, article_rights) if y == 'climate']

# Column-oriented payload in schema order (text, title, rights, vector)
data_climate = [
    article_chunks_climate,
    article_titles_climate,
    article_rights_climate,
    article_vectors_climate
]

article_chunks_economics = [x for x, y in zip(article_chunks, article_rights) if y == 'economics']
article_titles_economics = [x for x, y in zip(article_titles, article_rights) if y == 'economics']
article_rights_economics = [x for x, y in zip(article_rights, article_rights) if y == 'economics']
article_vectors_economics = [x for x, y in zip(article_vectors, article_rights) if y == 'economics']

# Column-oriented payload in schema order (text, title, rights, vector)
data_economics = [
    article_chunks_economics,
    article_titles_economics,
    article_rights_economics,
    article_vectors_economics
]

# Insert data into collection partitions

wiki_collection_partitioned.insert(data_climate, partition_name="climate")
wiki_collection_partitioned.insert(data_economics, partition_name="economics")
wiki_collection_partitioned.flush() # Ensures data persistence

# Load collection to allow queries

wiki_collection_partitioned.load()

# Get number of entities in collection and partitions

print("Number of entities in collection: ", wiki_collection_partitioned.num_entities)
print("Number of entities in partition 'climate': ", wiki_collection_partitioned.partition(partition_name="climate").num_entities)
print("Number of entities in partition 'economics': ", wiki_collection_partitioned.partition(partition_name="economics").num_entities)

## Prepare functions for Milvus queries with partitions

In [None]:
# Query function for Milvus using similarity search in combination with partitions (access rights)

def query_milvus_partitions(query, num_results=3, partitions=None):
    """Run a similarity search against selected partitions of the partitioned collection.

    Args:
        query: Question text; it is embedded with the global ``embedding`` client.
        num_results: Maximum number of hits to return.
        partitions: Partition names passed to the search as ``partition_names``;
            None is passed through unchanged.

    Returns:
        The search result of ``wiki_collection_partitioned.search``; each hit
        carries the 'article_text' and 'article_rights' output fields.
    """
    # Embed the question (a one-element batch) with the same client used for the chunks
    query_vectors = embedding.embed_documents(texts=[query])

    return wiki_collection_partitioned.search(
        data=query_vectors,
        anns_field="vector",
        param={"metric_type": "L2", "params": {"nprobe": 5}},
        limit=num_results,
        output_fields=['article_text', 'article_rights'],
        partition_names=partitions,  # <- restriction on partitions
    )

## Run a query with given access rights against Milvus (with partitions)

In [None]:
# set the number of chunks wanted in the answer
num_results = 3

# Access rights of user; the labels are used directly as partition names
user_access_rights = ['climate']

# Query against Milvus
question_text = 'what are ways to cope with climate change?'

# Execute query and receive chunks with details
results = query_milvus_partitions(question_text, num_results, user_access_rights)
# NOTE(review): rebinds the global `chunks` to the list of returned chunk texts
chunks = get_relevant_chunks(results, True)

# Prepare functions to create answers (with LLM)

## Load LLM from watsonx.ai for answer generation

In [None]:
def load_model(model_id):
    """Load a watsonx.ai foundation model configured for greedy text generation.

    Args:
        model_id: watsonx.ai model identifier, for example
            'meta-llama/llama-2-70b-chat' or 'mistralai/mixtral-8x7b-instruct-v01'.

    Returns:
        The initialised ``Model`` instance, or ``None`` when loading failed
        (the error is logged and printed instead of being raised).
    """
    import logging

    from ibm_watsonx_ai.foundation_models import Model
    from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

    creds = {
        "url": ibm_cloud_url,
        "apikey": api_key
    }

    # Model Parameters
    params = {
        GenParams.DECODING_METHOD: "greedy",
        GenParams.MIN_NEW_TOKENS: 1,
        GenParams.MAX_NEW_TOKENS: 500,
        GenParams.TEMPERATURE: 0,
    }

    try:
        model = Model(model_id=model_id,
            params=params, credentials=creds,
            project_id=project.project_context.projectID
        )
    except Exception as e:
        # Best-effort loading: report the problem and return None rather than raising.
        # A module logger is obtained here because the notebook defines no global `logger`.
        logging.getLogger(__name__).error("load_model> error loading model: %s", e)
        print(f"load_model> error loading model: {str(e)}")
        return None

    print(f"Model {model_id} loaded")
    return model

# Load the Granite instruct model that is used later for answer generation
model = load_model(model_id='ibm/granite-3-2-8b-instruct')

## Define a simple prompt for LLM

In [None]:
def ask_llm(prompt, model):
    """Send *prompt* to *model* and return whatever ``model.generate_text`` produces."""
    return model.generate_text(prompt)

def set_prompt_template(new_template):
    """Set the global ``prompt_template`` and return it.

    Args:
        new_template: Template text using ``$context`` and ``$question``
            placeholders; an empty string selects the built-in default.

    Returns:
        The ``string.Template`` that is now stored in ``prompt_template``.
    """
    from string import Template

    global prompt_template

    # Built-in default: context first, then the instruction, then the question
    default_text = (
        "$context\n\nPlease answer a question using this text. "
        "If the question is unanswerable, say \"unanswerable\"."
        "\n\nQuestion: $question"
    )

    prompt_template = Template(default_text if new_template == '' else new_template)
    return prompt_template

# Initialise the global prompt_template with the built-in default (empty string selects it)
set_prompt_template('')

def make_prompt(context, question):
    """Fill the global ``prompt_template`` with the given chunks and question.

    Args:
        context: Iterable of text chunks; they are joined with blank lines.
        question: The question text to insert.

    Returns:
        The prompt string produced by ``prompt_template.substitute``.
    """
    joined_context = "\n\n".join(context)
    return prompt_template.substitute(
        {"context": joined_context, "question": question}
    )

# Generate answer with context from Milvus search 

In [None]:
# Question to answer; swap in the commented alternative to try an economics question
question_text = "what roles does co2 play in global warming?"
#question_text = "what are the drivers for a healthy economics?"
# Rights of the asking user; passed as partition names to the partitioned search below
access_rights = ["climate", "economics"] # None # "<your access rights>"
num_results = 3

# Retrieve the matching chunks (printed verbosely) and build the prompt from them
result = query_milvus_partitions(question_text, num_results, access_rights)
prompt = make_prompt(get_relevant_chunks(result, True), question_text)

# Let the LLM answer the question from the retrieved context
response = ask_llm(prompt, model)
print(response)

# Remove Collections in Milvus

In [None]:
# Clean up: drop both collections created by this notebook
wiki_collection.drop()
wiki_collection_partitioned.drop()