In [1]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import Document

# LLM
# from llama_index.llms import Anthropic

# Embeddings
from llama_index.embeddings import HuggingFaceEmbedding

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)

# Rerankers
from llama_index.indices.query.schema import QueryBundle, QueryType
from llama_index.schema import NodeWithScore
# from llama_index.indices.postprocessor.cohere_rerank import CohereRerank
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.finetuning.embeddings.common import EmbeddingQAFinetuneDataset

# Evaluator
from llama_index.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.evaluation import RetrieverEvaluator

from typing import List
import pandas as pd

import nest_asyncio

nest_asyncio.apply()

In [10]:
# Define all embeddings and rerankers
EMBEDDINGS = {
    "bge-large": HuggingFaceEmbedding(model_name='BAAI/bge-large-en'), # You can use mean pooling by addin pooling='mean' parameter
    # "JinaAI-Small": HuggingFaceEmbedding(model_name='jinaai/jina-embeddings-v2-small-en', pooling='mean', trust_remote_code=True), # gated repo
    # "JinaAI-Base": HuggingFaceEmbedding(model_name='jinaai/jina-embeddings-v2-base-en', pooling='mean', trust_remote_code=True), # gated repo
    "uae-large": HuggingFaceEmbedding(model_name='WhereIsAI/UAE-Large-V1'),
    
}

RERANKERS = {
    "WithoutReranker": "None",
    "bge-reranker-base": SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=5),
    "bge-reranker-large": SentenceTransformerRerank(model="BAAI/bge-reranker-large", top_n=5),
    "jina-colbert-v1-en": SentenceTransformerRerank(model='jinaai/jina-colbert-v1-en', top_n=5)
}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-colbert-v1-en and are newly initialized: ['bert.encoder.layer.11.output.LayerNorm.weight', 'bert.encoder.layer.8.output.dense.bias', 'bert.encoder.layer.7.output.dense.weight', 'bert.encoder.layer.8.output.LayerNorm.weight', 'bert.encoder.layer.3.output.dense.weight', 'bert.encoder.layer.3.intermediate.dense.bias', 'bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.1.intermediate.dense.bias', 'bert.encoder.layer.9.intermediate.dense.bias', 'bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.8.output.LayerNorm.bias', 'bert.encoder.layer.5.intermediate.dense.bias', 'bert.encoder.layer.5.output.dense.bias', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder

### ms-marco-200 dataset

In [11]:
from datasets import load_dataset

dataset = load_dataset("./data/in/") # ms-marco-200-rows.csv
dataset['train'][0]

{'question': 'walgreens store sales average',
 'contexts': "['The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.'\n 'The average revenue in 2011 of a Starbuck Store was $1,078,000, up  from $1,011,000 in 2010.    The average ticket (total purchase) at domestic Starbuck stores in  No … vember 2007 was reported at $6.36.    In 2008, the average ticket was flat (0.0% change).'\n 'In fiscal 2014, Walgreens opened a total of 184 new locations and acquired 84 locations, for a net decrease of 273 after relocations and closings. How big are your stores? The average size for a typical Walgreens is about 14,500 square feet and the sales floor average

In [12]:
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)

In [13]:
import ast

corpus = []
filtered_queries = []
cnt = 0
for train_row in dataset["train"]:
    context = ast.literal_eval(train_row["contexts"])[0]
    # ground_truth = ast.literal_eval(train_row["ground_truths"])[0]
    if len(context.strip()) == 0:
        # cnt += 1
        continue
    current_document = Document(text=context)
    # print(current_document)
    # break
    print(node_parser.get_nodes_from_documents([current_document]))
    if len(node_parser.get_nodes_from_documents([current_document])) == 1:
        corpus.append(context)
        filtered_queries.append(train_row["question"])
        cnt += 1

corpus[:2], filtered_queries[:2], cnt

[TextNode(id_='ee389dbd-0ffd-4fb3-90f8-627addb804c5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='09c6a8c8-8e79-4ab7-a9f6-b1b7ac36c637', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='5ce2f810221dfbb0fcbc88cab973f4a306ceac4174bc25c4334e7c597a984ee8')}, text='The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.The average revenue in 2011 of a Starbuck Store was $1,078,000, up  from $1,011,000 in 2010.    The average ticket (total purchase) at domestic Starbuck stores in  No … vember 2007 was reported at $6.36.    I

(['The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.The average revenue in 2011 of a Starbuck Store was $1,078,000, up  from $1,011,000 in 2010.    The average ticket (total purchase) at domestic Starbuck stores in  No … vember 2007 was reported at $6.36.    In 2008, the average ticket was flat (0.0% change).In fiscal 2014, Walgreens opened a total of 184 new locations and acquired 84 locations, for a net decrease of 273 after relocations and closings. How big are your stores? The average size for a typical Walgreens is about 14,500 square feet and the sales floor averages about 11,000 square feet. How do we select locations for new stores

In [14]:
len(corpus), len(filtered_queries), len(dataset['train'])

(43, 43, 200)

In [15]:
documents = [Document(text=c) for c in corpus]
nodes = node_parser.get_nodes_from_documents(documents)
for idx, node in enumerate(nodes):
    node.id_ = f"corpus_{idx}"

In [16]:
nodes[0]

TextNode(id_='corpus_0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='5829623f-64de-4435-ac25-70c6862b714c', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='5ce2f810221dfbb0fcbc88cab973f4a306ceac4174bc25c4334e7c597a984ee8'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9bc2c93d-1500-4eda-b18e-088f8e529bee', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='db3c17dffc0ea1936ce6b04d046dc5462c67899aea8d32b0a0b2a2aa278522be')}, text='The average Walgreens salary ranges from approximately $15,000 per year for Customer Service Associate / Cashier to $179,900 per year for District Manager. Average Walgreens hourly pay ranges from approximately $7.35 per hour for Laboratory Technician to $68.90 per hour for Pharmacy Manager. Salary information comes from 7,810 data points collected directly from employees, users, and jobs on Indeed.The average revenue in

In [17]:
queries_dict = {f"query_{index}":filtered_queries[index] for index in range(cnt)}
corpus_dict = {f"corpus_{index}":corpus[index] for index in range(cnt)}
relevant_docs_dict = {f"query_{index}":[f"corpus_{index}"] for index in range(cnt)}

In [18]:
qa_dataset_ms_marco = EmbeddingQAFinetuneDataset(
    queries=queries_dict,
    corpus=corpus_dict,
    relevant_docs=relevant_docs_dict
)

In [19]:
qa_dataset_ms_marco

EmbeddingQAFinetuneDataset(queries={'query_0': 'walgreens store sales average', 'query_1': 'cost to frame basement', 'query_2': 'why is albumin normally absent in urine', 'query_3': 'What Does Lop means', 'query_4': 'how much do tuna fishers first mates make', 'query_5': 'what is treatment for jaundice', 'query_6': 'what is a purified phytochemical', 'query_7': 'how long can you keep chicken salad in your fridge', 'query_8': 'how long does it take to boil beets', 'query_9': 'hakan what does oil do', 'query_10': 'lakes around michigan', 'query_11': 'what is an example of a psychological need', 'query_12': 'where do roseate spoonbills live', 'query_13': 'meaning of the name darcy', 'query_14': 'how tall is a redwood tree', 'query_15': 'where is popocatepetl located', 'query_16': 'which phase of mitosis does dna condense into chromosomes', 'query_17': 'how to keep house plants from becoming leggy', 'query_18': 'unit charge definition', 'query_19': 'how long is a city block in chicago', 'q

In [21]:
def display_results(embedding_name, reranker_name, eval_results):
    """Display results from evaluate."""

    metric_dicts = []
    for eval_result in eval_results:
        metric_dict = eval_result.metric_vals_dict
        metric_dicts.append(metric_dict)

    full_df = pd.DataFrame(metric_dicts)

    hit_rate = full_df["hit_rate"].mean()
    mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"Embedding": [embedding_name], "Reranker": [reranker_name], "hit_rate": [hit_rate], "mrr": [mrr]}
    )

    return metric_df

In [25]:
results_df = pd.DataFrame()

# Loop over embeddings
for embed_name, embed_model in EMBEDDINGS.items():

    print(f"Running Evaluation for Embedding Model: {embed_name}")

    service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
    vector_index = VectorStoreIndex(nodes, service_context=service_context)

    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, service_context=service_context)

    # Loop over rerankers
    for rerank_name, reranker in RERANKERS.items():

        print(f"Running Evaluation for Embedding Model: {embed_name} and Reranker: {rerank_name}")

        # Define Retriever
        class CustomRetriever(BaseRetriever):
            """Custom retriever that performs both Vector search and Knowledge Graph search"""

            def __init__(
                self,
                vector_retriever: VectorIndexRetriever,
            ) -> None:
                """Init params."""

                self._vector_retriever = vector_retriever

            def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Retrieve nodes given query."""

                retrieved_nodes = self._vector_retriever.retrieve(query_bundle)

                if reranker != 'None':
                    retrieved_nodes = reranker.postprocess_nodes(retrieved_nodes, query_bundle)
                else:
                    retrieved_nodes = retrieved_nodes[:5]

                return retrieved_nodes

            async def _aretrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
                """Asynchronously retrieve nodes given query.

                Implemented by the user.

                """
                return self._retrieve(query_bundle)

            async def aretrieve(self, str_or_query_bundle: QueryType) -> List[NodeWithScore]:
                if isinstance(str_or_query_bundle, str):
                    str_or_query_bundle = QueryBundle(str_or_query_bundle)
                return await self._aretrieve(str_or_query_bundle)

        custom_retriever = CustomRetriever(vector_retriever)

        retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"], retriever=custom_retriever
        )
        eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset_ms_marco)

        current_df = display_results(embed_name, rerank_name, eval_results)
        results_df = pd.concat([results_df, current_df], ignore_index=True)

Running Evaluation for Embedding Model: bge-large
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: bge-large and Reranker: WithoutReranker
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: bge-large and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: bge-large and Reranker: jina-colbert-v1-en
Running Evaluation for Embedding Model: uae-large
LLM is explicitly disabled. Using MockLLM.
Running Evaluation for Embedding Model: uae-large and Reranker: WithoutReranker
Running Evaluation for Embedding Model: uae-large and Reranker: bge-reranker-base
Running Evaluation for Embedding Model: uae-large and Reranker: bge-reranker-large
Running Evaluation for Embedding Model: uae-large and Reranker: jina-colbert-v1-en


In [26]:
print(results_df)

   Embedding            Reranker  hit_rate       mrr
0  bge-large     WithoutReranker       1.0  1.000000
1  bge-large   bge-reranker-base       1.0  1.000000
2  bge-large  bge-reranker-large       1.0  1.000000
3  bge-large  jina-colbert-v1-en       1.0  0.499612
4  uae-large     WithoutReranker       1.0  1.000000
5  uae-large   bge-reranker-base       1.0  1.000000
6  uae-large  bge-reranker-large       1.0  1.000000
7  uae-large  jina-colbert-v1-en       1.0  0.500388


#### 

### with contextual compression

In [20]:
from typing import Any, Optional
from uuid import UUID

from dotenv import load_dotenv

from langchain.callbacks.base import BaseCallbackHandler

from genai import Client, Credentials
from genai.extensions.langchain import LangChainInterface
from genai.text.generation import (
    DecodingMethod,
    ModerationHAP,
    ModerationParameters,
    TextGenerationParameters,
)

import warnings
warnings.filterwarnings("ignore")

# make sure you have a .env file under genai root with
# GENAI_KEY=<your-genai-key>
# GENAI_API=<genai-api-endpoint> (optional) DEFAULT_API = "https://bam-api.res.ibm.com"
load_dotenv()


def heading(text: str) -> str:
    """Helper function for centering text."""
    return "\n" + f" {text} ".center(80, "=") + "\n"


print(heading("Generate text with langchain"))


class Callback(BaseCallbackHandler):
    def on_llm_new_token(
        self,
        token: str,
        *,
        run_id: UUID,
        parent_run_id: Optional[UUID] = None,
        **kwargs: Any,
    ) -> Any:
        print(f"Token received: {token}")


def bam_model(model_id='mistralai/mistral-7b-instruct-v0-2', decoding_method='greedy', max_new_tokens=200, 
              min_new_tokens=1, temperature=0.5, top_k=50, top_p=1, repetition_penalty=1):

    if decoding_method == 'greedy':
        decoding_method = DecodingMethod.GREEDY
    else:
        decoding_method = DecodingMethod.SAMPLE

    llm = LangChainInterface(
        # model_id="ibm/granite-13b-chat-v2",
        model_id=model_id,
        client=Client(credentials=Credentials.from_env()),
        parameters=TextGenerationParameters(
            decoding_method=decoding_method,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty
        ),
        moderations=ModerationParameters(
            # Threshold is set to very low level to flag everything (testing purposes)
            # or set to True to enable HAP with default settings
            hap=ModerationHAP(input=True, output=True, threshold=0.01)
        ),
    )

    return llm





  from genai.text.generation import (
  from genai.text.generation import (
  from genai.text.generation import (
  from genai.text.generation import (


- LLMChainExtractor

In [22]:
embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-large-en')
reranker = SentenceTransformerRerank(model="BAAI/bge-reranker-large", top_n=5)

In [23]:
service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)
vector_index = VectorStoreIndex(nodes, service_context=service_context)

vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, service_context=service_context)

LLM is explicitly disabled. Using MockLLM.


In [24]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

llm = bam_model()
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vector_retriever
)

ValidationError: 1 validation error for ContextualCompressionRetriever
base_retriever
  value is not a valid dict (type=type_error.dict)

In [None]:
compressed_docs = compression_retriever.get_relevant_documents(
    "What Does Lop means"
)
compressed_docs