# Preparation

## Set user name identifier

In order to separate your collections from those of other users, you have to select a unique identifier. This identifier will be appended to your collection names.
A good identifier could be your three letter acronym for your name and the day of your birthday, so for example **wjh17**. Replace `<your identifier>` with your chosen identifier and execute that cell. 

In [None]:
# Per-user suffix for the Milvus collection name. The placeholder must be
# replaced: a later cell refuses to build a collection name while it is unchanged.
user_identifier = "<your identifier>"

## Project token

Select the (empty) cell below and insert the project token (3-dot menu above). Execute that cell afterwards.


## Milvus connection

Copy the Milvus connectivity info (JSON format) and replace `<connection info in JSON format>` with the copied info in the next cell. Execute that cell afterwards.

In [None]:
connection=<connection info in JSON format>

## API key

Create an API key in IAM, copy it and replace `<api_key>` with the copied API key in the cell below. Execute the cell below afterwards.

In [None]:
# IBM Cloud API key; used both as the Milvus password and as the watsonx.ai credential.
api_key = "<api_key>"

# watsonx.ai endpoint used for embeddings and text generation.
ibm_cloud_url = "https://us-south.ml.cloud.ibm.com"

# Host and port are read from the first two entries of the pasted connection info.
host, port = (
    connection["properties"]["connection"][idx]["value"] for idx in (0, 1)
)

# Derive the per-user collection name; an untouched placeholder leaves it empty
# and asks the user to go back.
if user_identifier != "<your identifier>":
    collection_name = "_".join(("wiki_articles", user_identifier))
    print(f"you will use collection {collection_name}")
else:
    collection_name = ""
    print("please go back and set your user_identifier to continue")

# Setting up Milvus

## Connect to Milvus

In [None]:
from pymilvus import(
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema,
    utility
)

# Register the "default" pymilvus connection. Authentication uses the fixed
# user name "ibmlhapikey" with the API key as password.
# secure=True presumably enables TLS - confirm against the pymilvus docs.
connections.connect(
    alias="default",
    host=host,
    port=port,
    user="ibmlhapikey",
    password=api_key,
    secure=True,
)

## Define collection with fields and schema

In [None]:
# Collection layout: an auto-generated INT64 primary key, three VARCHAR fields
# (chunk text, article title, access group) and a 384-dimensional float vector.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="article_text", dtype=DataType.VARCHAR, max_length=8000),
    FieldSchema(name="article_title", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="article_rights", dtype=DataType.VARCHAR, max_length=20),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384),
]
schema = CollectionSchema(fields, "wikipedia article collection schema")

# To recreate an existing collection, uncomment the drop() call below.
# NOTE: it only works when `wiki_collection` already exists from an earlier
# run of this cell, because the name is assigned on the line after it.
# wiki_collection.drop()
wiki_collection = Collection(collection_name, schema)

# Vector index on the embedding field (IVF_FLAT, L2 distance, 10 clusters).
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 10},
}
wiki_collection.create_index(field_name="vector", index_params=index_params)

# Scalar index on article_rights, the field used for filtering at query time.
index_params = {"index_type": "INVERTED", "params": {}}  # or "AUTOINDEX"
wiki_collection.create_index(field_name="article_rights", index_params=index_params)


In [None]:
# Display the collection names reported by the connected Milvus instance.
utility.list_collections()

In [None]:
# Re-open the collection by name and display how many entities it holds.
basic_collection = Collection(collection_name) 
basic_collection.num_entities 

# Prepare content 

## Read data from Wikipedia

In [None]:
!pip install wikipedia | tail -n 1

In [None]:
import wikipedia

# Articles to load. Each entry carries the page title, the Wikipedia page id
# and the access group ("rights") that is stored with every chunk.
articles = [
    {'title': 'Climate', 'pageid': 5999, 'rights': 'climate'},
    {'title': 'Climate change', 'pageid': 5042951, 'rights': 'climate'},
    {'title': 'Climate change mitigation', 'pageid': 2119179, 'rights': 'climate'},
    {'title': 'Climate change adaptation', 'pageid': 4607152, 'rights': 'climate'},
    {'title': 'Economics', 'pageid': 9223, 'rights': 'economics'},
    {'title': 'Keynesian economics', 'pageid': 17326, 'rights': 'economics'},
    {'title': 'Behavioral economics', 'pageid': 177698, 'rights': 'economics'},
    {'title': 'Agricultural economics', 'pageid': 733141, 'rights': 'economics'},
    {'title': 'Labour economics', 'pageid': 18178, 'rights': 'economics'},
]

# Download each page and attach its plain-text content to the article entry.
for article in articles:
    title = article['title']
    pageid = article['pageid']

    # Look the page up by id when one is given, otherwise by title.
    wiki_entry = wikipedia.page(pageid=pageid) if pageid else wikipedia.page(title)

    article['content'] = wiki_entry.content
    print(f"Successfully fetched {title}")

print(f"Successfully fetched {len(articles)} articles ")

## Split data into chunks

In [None]:
def split_into_chunks(text, chunk_size):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split. Words are re-joined with single spaces, so
            runs of whitespace (including newlines) are collapsed.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # Fail loudly: a negative size would otherwise silently yield no chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def chunk_articles(articles, chunk_size):
    """Split every article into word chunks and tag each chunk with its metadata.

    Args:
        articles: Iterable of dicts with the keys 'title', 'content' and 'rights'.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of dicts with the keys 'id' (running number starting at 1 across
        all articles), 'chunk' (the chunk text), 'title' and 'rights'.

    Raises:
        KeyError: If an article lacks one of the required keys.
        ValueError: If *chunk_size* is less than 1.
    """
    chunks = []

    for article in articles:
        title = article['title']
        content = article['content']
        rights = article['rights']

        for article_chunk in split_into_chunks(content, chunk_size):
            # The text is kept verbatim. It is handed to the embedding model and
            # to Milvus as data, never interpolated into a query string, so
            # doubling quotes or percent signs would only corrupt what is
            # embedded, stored and later shown to the LLM.
            chunks.append({
                'id': len(chunks) + 1,
                'chunk': article_chunk,
                'title': title,
                'rights': rights,
            })

    return chunks
    
# Split all fetched articles into chunks of at most 255 words each.
chunks = chunk_articles(articles, 255)
# chunks

## Load vector embeddings from watsonx.ai

In [None]:
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import Embeddings
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes

# Options sent with every embedding request.
# TRUNCATE_INPUT_TOKENS caps how many tokens of each input text are embedded.
# It has to cover a whole chunk (about 255 words) and a whole question: a tiny
# value such as 3 would embed only the first few tokens and make the similarity
# search meaningless. 512 is taken as the maximum input length of the slate 30m
# model - confirm against the watsonx.ai model documentation if the model changes.
embed_params = {
    EmbedParams.TRUNCATE_INPUT_TOKENS: 512,
    EmbedParams.RETURN_OPTIONS: {
        'input_text': True
    }
}

# Embedding client used for both the article chunks and the user questions.
# `project` is expected to come from the project-token cell executed earlier.
embedding = Embeddings(
    model_id=EmbeddingTypes.IBM_SLATE_30M_ENG,
    params=embed_params,
    credentials=Credentials(
        api_key=api_key,
        url=ibm_cloud_url),
    project_id=project.project_context.projectID
)

In [None]:
# Turn the list of chunk dicts into one column per stored field.
article_chunks = [entry['chunk'] for entry in chunks]
article_titles = [entry['title'] for entry in chunks]
article_rights = [entry['rights'] for entry in chunks]

# Request one embedding vector per chunk text from watsonx.ai.
article_vectors = embedding.embed_documents(texts=article_chunks)

## Store data in Milvus

In [None]:
# Column-wise insert payload. The lists follow the order of the schema fields
# without the auto-generated "id": article_text, article_title,
# article_rights, vector.
basic_collection = Collection(collection_name) 
data = [
    article_chunks,
    article_titles,
    article_rights,
    article_vectors
]

In [None]:
# Insert the prepared columns; `out` keeps the value returned by insert().
# NOTE(review): the primary key is auto-generated, so re-running this cell
# presumably stores the same chunks a second time - confirm before re-executing.
out = basic_collection.insert(data)
basic_collection.flush()  # Ensures data persistence

In [None]:
# Display the entity count again to check the result of the insert.
basic_collection = Collection(collection_name) 
basic_collection.num_entities 

## Load Milvus collection for search and filtering

In [None]:
# Load the collection so that it can be searched.
# NOTE(review): `_filterable` does not look like a documented Collection.load
# argument - confirm that it is accepted and actually needed.
basic_collection = Collection(collection_name)    
basic_collection.load(_filterable=True)

# Prepare functions for Milvus queries

In [None]:
# Query function
def query_milvus(query, num_results=3, filter_text=None):
    """Run a vector similarity search for *query* on the loaded collection.

    Args:
        query: Question text; embedded with the module-level `embedding` client.
        num_results: Maximum number of hits to return.
        filter_text: Optional `article_rights` value(s) to restrict the search
            to. Either a single string or an iterable of strings. ``None``, an
            empty string or an empty iterable disables filtering.

    Returns:
        Whatever `basic_collection.search` returns for the single query vector;
        the hits carry the `article_text` output field.
    """
    # Vectorize query
    query_embeddings = embedding.embed_documents(texts=[query])

    # Search with the same distance metric the vector index was built with.
    search_params = {
        "metric_type": "L2",
        "params": {"nprobe": 5}
    }

    # A bare string is ONE role. Iterating over it would build a filter from
    # its individual characters, which matches nothing.
    if isinstance(filter_text, str):
        roles = [filter_text] if filter_text else []
    else:
        roles = filter_text

    if roles:
        expr = " or ".join([f'article_rights=="{role}"' for role in roles])
    else:
        expr = ""

    results = basic_collection.search(
        data=query_embeddings,
        anns_field="vector",
        param=search_params,
        limit=num_results,
        output_fields=['article_text'],
        expr=expr
    )
    return results

In [None]:
def get_relevant_chunks(results, verbose=False):
    """Collect the `article_text` of every hit for the first query in *results*.

    Args:
        results: Search result indexed by query; only ``results[0]`` is read.
        verbose: When true, print position, id, distance and the first 70
            characters of each chunk.

    Returns:
        A list with the `article_text` value of each hit, in result order.
    """
    relevant_chunks = []

    for position, hit in enumerate(results[0]):
        text = hit.entity.get('article_text')
        relevant_chunks.append(text)

        if not verbose:
            continue

        print(f"Chunk #{position}:")
        print(f"  id: {hit.id}")
        print(f"  distance: {hit.distance}")
        print(f"  chunk: {text[:70]}...\n")

    return relevant_chunks

## Test query in Milvus

In [None]:
# Smoke test: fetch the 3 closest chunks for a sample question, restricted to
# chunks whose article_rights is "economics" or "climate", and print them.
num_results = 3

question_text = 'what are ways to cope with climate change?'
filter_text = ["economics", "climate"]

results = query_milvus(question_text, num_results, filter_text)

# verbose=True prints id, distance and a preview of every returned chunk.
relevant_chunks = get_relevant_chunks(results, True)

# Prepare functions to create answers (with LLM)

## Load LLM from watsonx.ai for answer generation

In [None]:
def load_model(model_id):
    """Create a watsonx.ai text-generation model client for *model_id*.

    The client is configured for greedy decoding with 1 to 500 new tokens and
    uses the module-level `ibm_cloud_url`, `api_key` and `project`.

    Args:
        model_id: watsonx.ai model identifier, for example
            'meta-llama/llama-2-70b-chat' or
            'mistralai/mixtral-8x7b-instruct-v01'.

    Returns:
        The created ``Model`` instance, or ``None`` when creating it failed
        (the error is logged and printed).
    """
    import logging

    from ibm_watsonx_ai.foundation_models import Model
    from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams

    creds = {
        "url": ibm_cloud_url,
        "apikey": api_key
    }

    # Model Parameters
    params = {
        GenParams.DECODING_METHOD: "greedy",
        GenParams.MIN_NEW_TOKENS: 1,
        GenParams.MAX_NEW_TOKENS: 500,
        GenParams.TEMPERATURE: 0,
    }

    try:
        model = Model(model_id=model_id,
            params=params, credentials=creds,
            project_id=project.project_context.projectID
        )
    except Exception as e:
        # Deliberately best-effort: report the problem and return None rather
        # than abort the notebook. The logger is obtained here because no
        # module-level `logger` exists; referencing one would raise NameError
        # and hide the real error.
        logging.getLogger(__name__).error("load_model> error loading model: %s", e)
        print(f"load_model> error loading model: {str(e)}")
        return None

    print(f"Model {model_id} loaded")
    return model

# Load the Granite instruct model that is used for answer generation below.
model = load_model(model_id='ibm/granite-3-2-8b-instruct')

In [None]:
def ask_llm(prompt, model):
    """Send *prompt* to *model* and return the result of ``model.generate_text``."""
    return model.generate_text(prompt)

## Define a simple prompt for LLM

In [None]:
def set_prompt_template(new_template):
    """Set the module-level `prompt_template` and return it.

    Args:
        new_template: Template text in ``string.Template`` syntax. An empty
            string selects the built-in default, which uses the placeholders
            ``$context`` and ``$question``.

    Returns:
        The ``string.Template`` now stored in the global `prompt_template`.
    """
    from string import Template

    global prompt_template

    default_text = (
        "$context\n\nPlease answer a question using this text. "
        "If the question is unanswerable, say \"unanswerable\"."
        "\n\nQuestion: $question"
    )
    prompt_template = Template(default_text if new_template == '' else new_template)

    return prompt_template

set_prompt_template('')

In [None]:
def make_prompt(context, question):
    """Fill the global `prompt_template` with the retrieved context and the question.

    Args:
        context: Iterable of text chunks; they are joined with blank lines.
        question: The question text for the ``$question`` placeholder.

    Returns:
        The prompt string produced by ``prompt_template.substitute``.
    """
    joined_context = "\n\n".join(context)
    return prompt_template.substitute(context=joined_context, question=question)

# Generate answer with context from Milvus search 

In [None]:
# Build the LLM prompt from the chunks and the question of the test query above.
prompt = make_prompt(relevant_chunks, question_text)

In [None]:
# Send the prompt to the loaded model and show its answer.
response = ask_llm(prompt, model)
print(response)

In [None]:
# Show the full prompt (context + question) that was sent, for inspection.
print(prompt)

# Test environment for experimentation

In [None]:
# Playground: edit the question, the filter and the number of results, then re-run.
# The second assignment overrides the first; comment out the one you do not want.
question_text = "what roles does co2 play in global warming?"
question_text = "what are the drivers for a healthy economics?"
filter_text = ["climate", "economics"] # None # "<your filter>"
num_results = 5

# Retrieve matching chunks (printed because verbose=True) and build the prompt.
result = query_milvus(question_text, num_results, filter_text)
prompt = make_prompt(get_relevant_chunks(result, True), question_text)

# Generate and show the answer.
response = ask_llm(prompt, model)
print(response)