# Cross-encoder re-ranking

## Installation

In [17]:
%pip install -q -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


## Connecting to the vector database

In [18]:
import chromadb
from helper_utils import load_chroma, word_wrap, project_embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import numpy as np

In [19]:
chroma_client = chromadb.PersistentClient(
    path="data/chroma_db/",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)
# Load the existing collection by its name
collection_name = 'microsoft_annual_report_2022'
chroma_collection = chroma_client.get_or_create_collection(name=collection_name)

# Count the number of items in the collection
count = chroma_collection.count()
print(f"Number of items in the collection '{collection_name}': {count}")

Number of items in the collection 'microsoft_annual_report_2022': 349


## Setup the embedding function

In [20]:
# Access the underlying SentenceTransformer model (Defaults)
embedding_function = SentenceTransformerEmbeddingFunction()
model = embedding_function.models
print(model)

{'all-MiniLM-L6-v2': SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)}


## Helper to print the retrieved results

In [21]:
def print_results_and_documents(results, retrieved_documents, word_wrap):
    """
    Prints keys and values from the results dictionary and documents with word wrapping.

    Args:
        results (dict): A dictionary where keys are strings and values are either strings or lists.
        retrieved_documents (list): A list of documents to be printed.
        word_wrap (function): A function to apply word wrapping to the documents.

    Returns:
        None
    """
    # Iterate through the dictionary and print each key with its associated value
    for key, value in results.items():
        print(f"{key}:")

        # Check if the value is a list and print its elements
        if isinstance(value, list):
            for i, item in enumerate(value):
                print(f"  Item {i+1}: {item}")
        else:
            # Directly print the value if it's not a list
            print(f"  {value}")

        print()  # Add a newline for better readability

    # Iterate through the list of documents and print each one with word wrapping
    #for document in retrieved_documents:
    #    print(word_wrap(document))
    #    print('\n')

## Retrieving the results

In [22]:
query = "What has been the investment in research and development?"

results = chroma_collection.query(query_texts=query,
                                   n_results=10, 
                                   include=['documents', 'embeddings', "distances"])

retrieved_documents = results['documents'][0]
print_results_and_documents(results, retrieved_documents, word_wrap)

#for document in results['documents'][0]:
#    print(word_wrap(document))
#    print('')

ids:
  Item 1: ['150', '111', '110', '3', '63', '109', '225', '162', '108', '183']

distances:
  Item 1: [0.5315508246421814, 0.5435552597045898, 0.5674221515655518, 0.5745933055877686, 0.5864145159721375, 0.5975274443626404, 0.6010360717773438, 0.632536768913269, 0.6414855122566223, 0.66075599193573]

metadatas:
  None

embeddings:
  Item 1: [[0.03270440548658371, -0.01129379216581583, 0.021939339116215706, 0.01729484833776951, 0.06328116357326508, -0.008777561597526073, 0.03167689964175224, 0.04501157999038696, 0.03704705089330673, 0.06764397025108337, -0.05058981105685234, -0.02825894206762314, 0.0025165919214487076, -0.03224783018231392, -0.03239731118083, -0.005727757699787617, 0.008137194439768791, -0.0759151428937912, -0.05086909979581833, -0.02621528133749962, 0.0008599798893555999, -0.03174488991498947, 0.04896758496761322, -0.016084911301732063, 0.05717905983328819, -0.02037852071225643, 0.008073949255049229, -0.022060677409172058, 0.004389331676065922, 0.06001037359237671, -

Note:   
What were doing is asking for more results (10) so instead of just getting nearest neighbors we are getting a long tail.

## Setting up a cross_encoder

Sentence encoders are made up of two kinds of models (bi-encoders and cross-encoders)   
  
bi-encoders give us similaries (L2 and Cosine)   (e,g, all-MiniLM-L6-v2)   
>  bi-encoders do a nearest neighbor with the query against all the documents  
>  The query and the document (vector database) embeddings are callcuated independantly      

cross-encoders give us a score via a classifier  (e.g, ms-marco-MiniLM-L-6-v2)        
>  cross-encoders compare the query to each document and return a score   
>  The query and chunck embeddings are calculated at the same time    
>  The default is cosine simialrity   

<https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2>

In [23]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

## Calculating the scores

In [24]:
pairs = [[query, doc] for doc in retrieved_documents]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
0.9869351
2.6445756
-0.2680304
-10.731591
-7.706607
-5.6469984
-4.297037
-8.666395
-7.038429
-4.6393795


## Re-ranking the results based on the scores

In [25]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

New Ordering:
2
1
3
7
10
6
9
5
8
4


Note:   
  Documents 2 ans 1 are swapped   
  Documents 7 and 6 are not in the top 5   
  Documents 5 and 4 are 8 and 9 position   
  Now if we process the top 5 we get a long tail with more relavant information.



# Re-ranking with Query Expansion

In [26]:
original_query = "What were the most important factors that contributed to increases in revenue?"
generated_queries = [
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?"
]

Note: The combined queries will ask six questions and get six 10x chuncks each (60 documents)

In [27]:
queries = [original_query] + generated_queries

results = chroma_collection.query(query_texts=queries,
                                   n_results=10, 
                                   include=['documents', 'embeddings', "distances"])

retrieved_documents = results['documents'][0]
print_results_and_documents(results, retrieved_documents, word_wrap)


ids:
  Item 1: ['143', '166', '152', '209', '148', '149', '147', '151', '141', '283']
  Item 2: ['143', '320', '152', '147', '148', '209', '145', '319', '144', '293']
  Item 3: ['145', '127', '209', '149', '148', '321', '139', '293', '141', '188']
  Item 4: ['151', '145', '148', '149', '127', '143', '141', '147', '293', '331']
  Item 5: ['149', '148', '145', '143', '166', '151', '141', '147', '293', '319']
  Item 6: ['143', '152', '145', '149', '148', '127', '194', '262', '209', '146']

distances:
  Item 1: [0.4731292724609375, 0.5255155563354492, 0.5317227840423584, 0.5491454601287842, 0.5512256622314453, 0.5530146360397339, 0.5624922513961792, 0.5652060508728027, 0.5659996867179871, 0.5682674646377563]
  Item 2: [0.5023680329322815, 0.5264345264464799, 0.5311307907104492, 0.5381581783294678, 0.5400921106338501, 0.5418443083763123, 0.5418799519538879, 0.5443226337907896, 0.5446497201919556, 0.544989824295044]
  Item 3: [0.43821030855178833, 0.4548625349998474, 0.4607875347137451, 0.46

## Re-ranking the long tail

In [28]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)
print(len(unique_documents)," Unique Documents")

48  Unique Documents


In [29]:
pairs = []
for doc in unique_documents:
    pairs.append([original_query, doc])

In [30]:
scores = cross_encoder.predict(pairs)


In [31]:
print("Scores:")
for score in scores:
    print(score)

Scores:
-8.111134
-8.859443
-7.4291644
-6.9356394
-8.152822
-9.072263
-8.321934
-7.452135
-8.7719755
-8.125717
-7.29047
-8.520399
-8.321728
-8.000093
-7.371804
-9.152418
-7.09365
-8.204197
-8.700953
-8.777537
-9.139927
-8.603168
-8.266382
-9.0392685
-8.733776
-8.415419
-8.039277
-8.212904
-8.442551
-8.759289
-8.897444
-8.211146
-8.74255
-8.928579
-8.078913
-7.9583635
-8.831294
-8.394537
-5.9334054
-7.6008215
-8.338874
-8.305859
-6.7395096
-7.7317786
-8.02363
-8.560882
-8.618932
-9.103207


In [32]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

New Ordering:
38
42
3
16
10
14
2
7
39
43
35
13
44
26
34
0
9
4
17
31
27
22
41
12
6
40
37
25
28
11
45
21
46
18
24
32
29
8
19
36
1
30
33
23
5
47
20
15
