## Lab 4 - Cross-encoder re-ranking

In [4]:
# from helper_utils import load_chroma, word_wrap, project_embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import numpy as np
import chromadb
import tqdm
import numpy as np
from openai import OpenAI
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [5]:
reader = PdfReader("documents/microsoft_annual_report_2022.pdf")

# Extract the text of every page, trim surrounding whitespace, and keep only
# the pages that still contain something afterwards (filter(None, ...) drops
# the empty strings).
pdf_texts = list(
    filter(None, (page.extract_text().strip() for page in reader.pages))
)

# Sanity check: show the first non-empty page.
print(pdf_texts[0])

1 Dear shareholders, colleagues, customers, and partners:  
We are living through a period of historic economic, societal, and geopolitical change. The world in 2022 looks nothing like 
the world in 2019. As I write this, inflation is at a 40 -year high, supply chains are stretched, and the war in Ukraine is 
ongoing. At the same time, we are entering a technological era with the potential to power awesome advancements 
across every sector of our economy and society. As the world’s largest software company, this places us at a historic 
intersection of opportunity and responsibility to the world around us.  
Our mission to empower every person and every organization on the planet to achieve more has never been more 
urgent or more necessary. For all the uncertainty in the world, one thing is clear: People and organizations in every 
industry are increasingly looking to digital technology to overcome today’s challenges and emerge stronger. And no 
company is better positioned to help th

In [9]:
# Build the vector index for the report: split the page texts into chunks,
# embed them, and store them in an in-memory Chroma collection.
# Expects `pdf_texts` (list of page strings) from the PDF-loading cell.

# Step 1: split on progressively finer separators into pieces of at most
# 1000 characters.
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=0
)
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

print(character_split_texts[10])
print(f"\nTotal chunks: {len(character_split_texts)}")

# Step 2: re-split every character chunk by token count (256 tokens per chunk,
# no overlap) using SentenceTransformersTokenTextSplitter.
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

print(token_split_texts[10])
print(f"\nTotal chunks: {len(token_split_texts)}")

# Step 3: embedding function (library default model), shown on one sample chunk.
embedding_function = SentenceTransformerEmbeddingFunction()
print(embedding_function([token_split_texts[10]]))

# Step 4: create the Chroma client and collection.
# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable: create_collection raises if the collection already exists in
# the current kernel's in-memory client.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection(
    name="microsoft_annual_report_2022_cross_encoder1",
    embedding_function=embedding_function,
)

# One string ID per chunk: "0", "1", ...
ids = [str(i) for i in range(len(token_split_texts))]

# Step 5: write the chunks in batches.
# upsert (instead of add) makes a re-run overwrite the same IDs rather than
# colliding with the records written by a previous run.
batch_size = 166  # Chosen to be under the maximum allowed batch size
for start in range(0, len(token_split_texts), batch_size):
    stop = start + batch_size
    chroma_collection.upsert(
        ids=ids[start:stop],
        documents=token_split_texts[start:stop],
    )

# Verify the number of documents in the collection
print(chroma_collection.count())

increased, due in large part to significant global datacenter expansions and the growth in Xbox sales and usage. Despite 
these increases, we remain dedicated to achieving a net -zero future. We recognize that progress won’t always be linear, 
and the rate at which we can implement emissions reductions is dependent on many factors that can fluctuate over time.  
On the path to becoming water positive, we invested in 21 water replenishment projects that are expected to generate 
over 1.3  million cubic meters of volumetric benefits in nine water basins around the world. Progress toward our zero waste 
commitment included diverting more than 15,200 metric tons of solid waste otherwise headed to landfills and incinerators, 
as well as launching new Circular Centers to increase reuse and reduce e -waste at our datacenters.  
We contracted to protect over 17,000 acres of land (50% more than the land we use to operate), thus achieving our

Total chunks: 347
increased, due in large part to si

# Re-ranking the long tail

In [10]:
# Retrieve a deliberately long candidate list (10 results) for one question;
# the cross-encoder cells that follow re-order these candidates.
query = "What has been the investment in research and development?"
results = chroma_collection.query(
    query_texts=query,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Keep the first (index 0) list of documents from the result.
retrieved_documents = results['documents'][0]

In [11]:
# Load the pretrained cross-encoder used to re-score (query, document) pairs.
# On first use the checkpoint files are downloaded (see the progress bars in
# the recorded output of this cell), so this cell needs network access once.
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json: 100%|██████████| 794/794 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:31<00:00, 2.86MB/s]
tokenizer_config.json: 100%|██████████| 316/316 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.08MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 


In [12]:
# Pair the query with each retrieved chunk; the cross-encoder produces one
# score per [query, document] pair, in the same order as `retrieved_documents`.
pairs = []
for candidate in retrieved_documents:
    pairs.append([query, candidate])

scores = cross_encoder.predict(pairs)

print("Scores:")
for pair_score in scores:
    print(pair_score)

Scores:
0.9869337
2.644577
-0.2680306
-5.6469984
-4.297035
-10.933233
-8.666395
-7.0384293
-7.3246956
-4.639378


In [13]:
print("New Ordering:")
# argsort sorts ascending, so reverse it to list the highest score first.
# Adding 1 reports each document by its 1-based position in the original
# retrieval order.
for position in np.argsort(scores)[::-1] + 1:
    print(position)

New Ordering:
2
1
3
5
10
4
8
9
7
6


# Re-ranking with Query Expansion

In [14]:
# The question we actually want answered; it is also the query the
# cross-encoder scores every candidate against later on.
original_query = (
    "What were the most important factors that contributed to increases in revenue?"
)

# Related questions used to widen retrieval (query expansion).
# NOTE(review): presumably produced by an LLM in an earlier lab and pasted
# here as literals - confirm the provenance.
generated_queries = list((
    "What were the major drivers of revenue growth?",
    "Were there any new product launches that contributed to the increase in revenue?",
    "Did any changes in pricing or promotions impact the revenue growth?",
    "What were the key market trends that facilitated the increase in revenue?",
    "Did any acquisitions or partnerships contribute to the revenue growth?",
))

In [15]:
# Search with the original question and all of its expansions in one call.
queries = [original_query, *generated_queries]

results = chroma_collection.query(
    query_texts=queries,
    n_results=10,
    include=['documents', 'embeddings'],
)

# Nested result: a list of document lists (flattened by the dedupe cell).
retrieved_documents = results['documents']

In [16]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [17]:
# Every unique candidate is paired with the ORIGINAL question (not with the
# expanded queries), so candidates are ranked against what was actually asked.
pairs = [[original_query, doc] for doc in unique_documents]

In [18]:
# Score every [original_query, document] pair with the cross-encoder.
# This overwrites the `scores` produced for the single-query example.
scores = cross_encoder.predict(pairs)


In [19]:
# One score per line, in the same order as `pairs`.
print("Scores:")
for pair_score in scores:
    print(pair_score)

Scores:
-4.6518927
-6.9020905
-7.490654
-3.7948647
-4.3417673
-9.80788
-11.0792675
-4.818485
-10.000139
-9.357723
-5.274749
-7.9171767
-8.505106
-7.754099
-10.0839405
-10.711212
-1.1369958
-9.918428
-3.7681546
-10.148884
-10.042842


In [20]:
print("New Ordering:")
# Indices of `scores` from highest to lowest (argsort is ascending, hence the
# reversal). Unlike the single-query example earlier in this notebook, which
# prints 1-based positions, these are ZERO-based indices.
descending_indices = np.argsort(scores)[::-1]
for index in descending_indices:
    print(index)

New Ordering:
16
18
3
4
0
7
10
1
2
13
11
12
9
5
17
8
20
14
19
15
6
