In [1]:
%pip install python-dotenv
%pip install litellm
%pip install -qU trulens_eval pydantic fastapi kaleido python-multipart uvicorn cohere openai tiktoken "llama-index"
%pip install transformers
%pip install sentence-transformers
%pip install pinecone-client
%pip install datasets
%pip install accelerate
%pip install einops
%pip install langchain
%pip install xformers
%pip install bitsandbytes
%pip install matplotlib seaborn tqdm
%pip install chromadb
%pip install evaluate
%pip install rouge_score
%pip install bert_score



In [2]:
import os
from dotenv import load_dotenv

# load environment variables from .env file
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [3]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

## Data Preparation

In [4]:
import json

file_path = "pubmed_intelligence.json"
with open(file_path, "r", encoding="utf-8") as file:
    docs = json.load(file)

len(docs)

58730

In [12]:
# docs = docs[:1000]

# print(docs[0].keys())
# print(len(docs))

dict_keys(['PMID', 'Title', 'Abstract', 'Keywords', 'Authors', 'PubDateEDAT', 'ArticleDate', 'Journal'])
1000


In [5]:
from transformers import LlamaTokenizer
hf_auth = os.environ.get('HF_AUTH')
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf",use_auth_token=hf_auth)



In [9]:
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/tokenizer.model',
 './tokenizer/added_tokens.json')

In [6]:
def token_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)

In [7]:
token_counts = [token_len(doc['Abstract']) for doc in docs]
min_tokens=min(token_counts)
avg_tokens=int(sum(token_counts) / len(token_counts))
max_tokens=max(token_counts)

print(f"""Min: {min_tokens}
Avg: {avg_tokens}
Max: {max_tokens}""")

Min: 1
Avg: 365
Max: 18575


In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    length_function=token_len,
    separators=['\n\n', '\n', ' ', '']
)

In [9]:
chunks = text_splitter.split_text(docs[100]['Abstract'])
print(len(chunks))
print(token_len(chunks[0]))

3
109


In [10]:
from tqdm.auto import tqdm

documents = []

for doc in tqdm(docs):
    uid = doc['PMID']
    chunks = text_splitter.split_text(doc['Abstract'])
    for i, chunk in enumerate(chunks):
        documents.append({
            'id': f'{uid}-{i}',
            'text': chunk,
            'source': doc
        })
len(documents)

  0%|          | 0/58730 [00:00<?, ?it/s]

258608

In [11]:
import pandas as pd
data = pd.DataFrame(documents)
data.head()

Unnamed: 0,id,text,source
0,26665339-0,"In the past, the data hospitals gleaned from o...","{'PMID': '26665339', 'Title': 'Using periopera..."
1,26665702-0,Traditional management skills are still crucia...,"{'PMID': '26665702', 'Title': 'THE NEW CEO.', ..."
2,26667848-0,PURPOSE: Increased evidence of subnormal neuro...,"{'PMID': '26667848', 'Title': 'Neuropsychologi..."
3,26667848-1,"in learning. Additionally, we enrolled drug-na...","{'PMID': '26667848', 'Title': 'Neuropsychologi..."
4,26667848-2,"fifteen healthy children (mean age: 9.2years, ...","{'PMID': '26667848', 'Title': 'Neuropsychologi..."


## Document Embedding Pipeline


In [12]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
import pinecone
from tqdm import tqdm

In [21]:
device_used = cuda.get_device_name(0) if cuda.is_available() else "CPU"
print("Device used:", device_used)

Device used: NVIDIA A100-SXM4-40GB


In [13]:
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
device = 'cuda:0'

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embed the example documents using the model you created and check the output.
The output should be a list of lists, containing the embeddings.

In [14]:
example_docs = [
    "An example document",
    "A second document as an example"
]
embeddings = embed_model.embed_documents(example_docs)
print("number of docs:",len(embeddings))
print("dimension of docs:",len(embeddings[0]))

number of docs: 2
dimension of docs: 384


Now we use the embedding pipeline created above to store the embeddings in a Pinecone vector index. First, lets setup the Pinecone environment, collect your API key and environment name from the environment variables, and initiate Pinecone with them.

In [18]:
import pinecone

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")

pinecone = pinecone.Pinecone(api_key=pinecone_api_key)

Initialize the index `rag-assignment` inside Pinecone. Use the cosine similarity as similarity metric. Keep in mind that if you run this multiple times on a free tier, where only one index is allowed, you need to remove the index created to make room for a new one (Pinecone index gets archived automatically after 14 days of inactivity).

In [20]:
index_name = 'rag-assignment'

# for index in pinecone.list_indexes():
#     if index['name']==index_name:
#         pinecone.delete_index(index['name'])

pinecone.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec={'pod': {'environment': pinecone_environment,
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1
                  }
          },
)

Lets take a look at the index you created. As of now the index should be empty but have the correct embedding dimension.

In [21]:
index_name = 'rag-assignment'
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [24]:
batch_size = 32

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    batch = data.iloc[i:i_end]
    ids = [f"{x['id']}" for i, x in batch.iterrows()]
    texts = [x['text'] for i, x in batch.iterrows()]
    embeds = embed_model.embed_documents(texts)

    metadata = [
        {'source':x['text']}
        for i, x in batch.iterrows()
    ]
    # metadata = [
    #     { 'pmid': x['source']['PMID'],
    #       'title': x['source']['Title'],
    #       'abstract': x['source']['Abstract'],
    #       'authors': x['source']['Authors'],
    #       'artical_date': x['source']['ArticleDate'],
    #       'journal': x['source']['Journal']}
    #     for i, x in batch.iterrows()
    # ]
    index.upsert(vectors=zip(ids, embeds, metadata))



 39%|███▉      | 3171/8082 [14:47<22:54,  3.57it/s]


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '133', 'x-pinecone-request-latency-ms': '64', 'x-pinecone-request-id': '2393092346762024455', 'date': 'Sun, 03 Mar 2024 00:49:50 GMT', 'x-envoy-upstream-service-time': '33', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":9,"message":"Starter index record limit reached. Current number: 100960, records in request: 32, limit: 100000","details":[]}


Now if we look at the index statistics we should have vectors of dimension `384`.

In [25]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 1.0,
 'namespaces': {'': {'vector_count': 101472}},
 'total_vector_count': 101472}

## Text Generation Pipeline


In [26]:
from torch import cuda, bfloat16
import os
import transformers
model_id = 'meta-llama/Llama-2-13b-chat-hf'

In [27]:
bitsAndBites_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

In [28]:

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=os.environ.get('HF_AUTH')
)



In [29]:

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bitsAndBites_config,
    device_map='auto',
    token=os.environ.get('HF_AUTH')
)
model.eval()
print(f"Model loaded ")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded 


In [24]:
model.save_pretrained('./model')

You can even check the memory footprint of your model using the `get_memory_footprint` method.


In [30]:
model.get_memory_footprint()

7100747776

In [31]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,
    task='text-generation',
    temperature=0.01,
    max_new_tokens=512,
    repetition_penalty=1.1
)

In [33]:
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=generate_text)

In [34]:
query = 'What is the focus regarding university dropout rates, particularly in the field of Health Sciences?'
llm(prompt=query)

  warn_deprecated(


"\nA: University dropout rates are a significant concern in higher education, and this is especially true in the field of Health Sciences. The focus on dropout rates in Health Sciences is due to several reasons:\n\n1. High stakes: Health Sciences programs are often highly competitive and have high admission standards. Students who enroll in these programs may feel pressure to succeed and may be more likely to drop out if they encounter difficulties or struggle with the coursework.\n2. Cost: Health Sciences programs can be expensive, and students who drop out may leave with a significant amount of debt. This can be a financial burden that can impact their future career choices and opportunities.\n3. Limited seats: Many Health Sciences programs have limited seats, which means that there may be more qualified applicants than available spots. This can create a high-pressure environment for students who are accepted into these programs, and may contribute to higher dropout rates.\n4. Comple

## Question Answering Chain


In [40]:
query = 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?'

In [49]:
# facet search
author = 'Hamrouni K'
start_date = '2013/01/01'
end_date = '2023/01/01'

# filtered_docs = [doc['Abstract'] for doc in docs if (author in doc['Authors']) and (doc['ArticleDate']>start_date and doc['ArticleDate']<end_date)]
filtered_docs = [doc['Abstract'] for doc in docs if (doc['ArticleDate']>start_date and doc['ArticleDate']<end_date)]
print(len(filtered_docs))

43093


In [50]:
from langchain.retrievers import BM25Retriever
bm25_retriever = BM25Retriever.from_texts(filtered_docs)

In [51]:
bm25_retriever.get_relevant_documents(query)

[Document(page_content='Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The proposed enhancement system is based mainly on the analysis of the breast texture. First of all, a filtering step with morphological operators and soft thresholding is achieved. Then, we remove from the filtered breast region, all the details that may interfere with the eventual masses, including pectoral muscle and galactophorous tree. The pixels belonging to this tree will be interpolated and replaced by the average of the neighborhood. In the characterization process, measurement of the

In [38]:
from langchain.vectorstores import Chroma
### your code ###
vectordb = Chroma.from_texts(texts=list(data['text']), embedding=embed_model, persist_directory="chroma_db")

In [41]:
vectordb.similarity_search(
    query,  # the search query
    k=5,  # returns top 3 most relevant chunks of text
)

[Document(page_content='Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The'),
 Document(page_content='for imaging the breast. In the first half of the 20th century, the diagnosis was in practice only clinical, with consequent diagnostic delay and an unfavorable prognosis in the short term. The rise of organized mammography screening has led to a remarkable reduction in mortality through the early detection of breast malignancies. This historical review aims to offer a complete panorama of the development of mammography and breast'),
 Document(page_content='PURPOS

In [46]:
from langchain.vectorstores import Pinecone
vectorstore = Pinecone(index, embed_model.embed_query, 'source')

In [47]:
vectorstore.similarity_search(
    query,  # the search query
    k=5,  # returns top 3 most relevant chunks of text
)

[Document(page_content='Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The'),
 Document(page_content="criteria, accounting for the publication date and exclusive use of mammography images, and included only literature in English. After extracting data, results were compared and discussed. This review included 33 studies and identified four recurring categories of studies: the differentiation of benign and malignant masses, the localisation of masses, cancer-containing and cancer-free breast tissue differentiation and breast classification based on breast density.

In [52]:
from langchain.retrievers import EnsembleRetriever
# faiss_retriever = vectorstore.as_retriever(search_kwargs={"k":5,})
faiss_retriever = vectordb.as_retriever(search_kwargs={"k":5,})

# combine lexicographical search and semantic search
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever,faiss_retriever],
                                       weights=[0.5,0.5])


In [53]:
from langchain.chains import RetrievalQA

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    verbose=True,
    retriever=ensemble_retriever,
    chain_type_kwargs={
        "verbose": True },
)

In [54]:
rag_pipeline(query)

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Female breast cancer is the second most common cancer in the world. Several efforts in artificial intelligence have been made to help improving the diagnostic accuracy at earlier stages. However, the identification of breast abnormalities, like masses, on mammographic images is not a trivial task, especially for dense breasts. In this paper we describe our novel mass detection process that includes three successive steps of enhancement, characterization and classification. The proposed enhancement system is based mainly on the analysis of the breast texture. First of all, a filtering step with morphological operators and soft thresholding is achie

{'query': 'What are the three successive steps involved in the novel mass detection process described for identifying breast abnormalities on mammographic images, according to the paper?',
 'result': ' According to the paper, the three successive steps involved in the novel mass detection process are enhancement, characterization, and classification. The proposed enhancement system is based mainly on the analysis of the breast texture, followed by characterization using measurement of the Gaussian density in the wavelet domain, and finally, a comparative classification mechanism based on the Bayesian regularization back-propagation networks and ANFIS techniques.'}

## **Task 2: Advanced RAG Techniques and Evaluation (4 + 5 = 9 points)**

Now that you have successfully implemented your first RAG system, we dive into more advanced techniques and learn how to evaluate your methods using metrics you learned during the lecture. We focus on evaluation with an already annotated dataset. To this end, we load a small subset of [NarrativeQA](https://huggingface.co/datasets/narrativeqa), which is an English-language dataset of stories and corresponding questions designed to test reading comprehension, especially on long documents. We only load 30 samples from the data, as you will see in the upcoming sections, answer generation takes quite some time. In actual setting, it is advised to use a much larger set to obtain statistically significant results.

In [None]:
from datasets import load_dataset
dataset = load_dataset("satyaalmasian/narrativeqa_subset",split="train[:30]")
len(dataset)

Downloading readme:   0%|          | 0.00/997 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/317 [00:00<?, ? examples/s]

30

Since we already used our free index in Pinecone for the previous task, we use Chroma, an open-source vector database, instead. As opposed to Pinecone, Chroma creates a collection on your machine.

In [None]:
from langchain.docstore.document import Document
documents=[ doc["text"] for doc in dataset["document"]]
questions=[quest for quest in dataset["question"]]
answers=[ans for ans in dataset["answers"]]
documents=list(set(documents))

In [None]:
docs= [Document(page_content=doc, metadata={"source": "local"}) for doc in documents]

The number of documents is smaller  than the number of questions and answers and each document is used as a reference for multiple questions:

In [None]:
print(len(docs))
print(len(questions))

2
30


##Subtask 2.1: Build Contextual Compression in LangChain

Let's split our documents using the TextSplitter from Task 1 and embed them inside the Chroma database with the embedding model of the previous task.

In [None]:
### your code ###
all_splits = text_splitter.split_documents(docs)
### your code ###

In [None]:
from langchain.vectorstores import Chroma
### your code ###
vectordb = Chroma.from_documents(documents=all_splits, embedding=embed_model, persist_directory="chroma_db")
retriever =vectordb.as_retriever(search_kwargs={"k": 5})
### your code ###

In [None]:
print("Fist question in the set:",questions[2]['text'])
r_docs = retriever.get_relevant_documents(questions[2]['text'])
r_docs

Fist question in the set: Why do more students tune into Mark's show?


[Document(page_content="Reporter #2 - Are you on drugs?\n\nPaige - Arrrgh. Talk Hard. Arrrrrgh.\n\nMark - I've got a lot of homework I'm gonna take off alright.\n\nMarla - Mark I know why your really going home. It's because you wanna listen to that \nshow tonight don't you?\n\n<Play Peter Murphy>\n\n<Nora goes to Marks house where she finds him burning his Happy Harry Hardon \nletters>\n\nNora - Hi! What are you doing? You having fun?\n\nMark - Yeah.\n\nNora - Hey, look I took some of these off the wall for you. I mistakingly thought you \nmight want them.\n\nMark - Thanks.\n\nNora - So I guess you're not going on tonight.\n\nMark - Brilliant.\n\nNora - Is this all just a game to you. You know you can't just shout fire in a theatre and \nwalk out. You have a responsibility for the people who believe in you. What is this? \nC'mon say something, say anything. Open your mouth and say get the hell out of here \nbitch.\n\nMark - I can't.\n\nNora - You can't what?\n\nMark - I can't talk.\n\

First, make a simple RAG pipeline that works on top of the Chroma retriever. This retriever should be similar to the previous task. However, since we want to use it for a large number of questions, remove the `verbose` parameters.

In [None]:
from langchain.chains import RetrievalQA
### your code ###
rag_simple= RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)
### your code ###

We look at an example question and compare the answer by RAG to the gold answer from the dataset. Note that the answers can contain multiple lines.

In [None]:
rag_simple(questions[2]['text']) #ignore the warning



{'query': "Why do more students tune into Mark's show?",
 'result': ' Mark has become a symbol of rebellion against the strict rules and expectations of the school. His show provides an outlet for students to express their frustrations and desires, and he has gained a reputation as someone who is willing to challenge the status quo. As a result, many students tune in to his show as a way to connect with someone who understands their feelings and desires.'}

In [None]:
answers[2]

[{'text': 'Mark talks about what goes on at school and in the community.',
  'tokens': ['Mark',
   'talks',
   'about',
   'what',
   'goes',
   'on',
   'at',
   'school',
   'and',
   'in',
   'the',
   'community',
   '.']},
 {'text': 'Because he has a thing to say about what is happening at his school and the community.',
  'tokens': ['Because',
   'he',
   'has',
   'a',
   'thing',
   'to',
   'say',
   'about',
   'what',
   'is',
   'happening',
   'at',
   'his',
   'school',
   'and',
   'the',
   'community',
   '.']}]

Apply the `rag_simple` pipeline to all the question in your corpus and accumulate the answers. **It should take around 10 minutes on a T4 GPU on Colab**.

In [None]:
simple_answers=[]
### your code ###
for quest in tqdm(questions):
  simple_answers.append(rag_simple(quest['text'])['result'])
### your code ###

  warn_deprecated(
100%|██████████| 30/30 [12:03<00:00, 24.10s/it]


Libraries such as LangChain and [Llamaindex](https://www.llamaindex.ai/) provide a variety of retrieval strategies for building a RAG system. In this subtask, you will use one of these variations called **contextual compression**. This method aims to extract only the relevant information from documents, reducing the need for expensive language model calls and improving response quality. Contextual compression consists of two parts:


1.  **Base retriever:** retrieves the initial set of documents based on the query. This is similar to the retriever from the previous task.
2.   **Document compressor:** processes these documents to extract the relevant content. We use `LLMChainExtractor`, which will iterate over the initially returned documents and extract from each only the content that is relevant to the query.


In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor,LLMChainFilter
from langchain.llms import OpenAI

### your code ###
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
                                                       base_retriever=retriever)
### your code ###

Let's take a look at an example of compression retriever works.

In [None]:
print("Fist question in the set:",questions[2]['text'])
compressed_docs = compression_retriever.get_relevant_documents(questions[2]['text'])
compressed_docs



Fist question in the set: Why do more students tune into Mark's show?




[Document(page_content='* "Why do more students tune into Mark\'s show?"\n* "Mark\'s show"\n* "students"', metadata={'source': 'local'}),
 Document(page_content='* "Why do more students tune into Mark\'s show?"\n* "Mark\'s show"\n* "students"', metadata={'source': 'local'}),
 Document(page_content='* "Why do more students tune into Mark\'s show?"\n* "Mark\'s show"\n* "students"', metadata={'source': 'local'}),
 Document(page_content='* Nora got expelled\n* Nora has been cutting lessons\n* Creswood is mentioned as a staff member', metadata={'source': 'local'}),
 Document(page_content='* Nora got expelled\n* Nora has been cutting lessons\n* Creswood is mentioned as a staff member', metadata={'source': 'local'})]

Look at the output and try out several different questions by yourself. Does the compressed output make sense?

Compare this to the previous **simple** approach. Which one, in your opinion, is better?

Finally, we use the new retriever with the Llama2 model from the previous task to create the context compressor RAG pipeline. The code below should be similiar to what you did in the previous task. Once again, make sure to turn off the `verbose` argument.

In [None]:
### your code ###
from langchain.chains import RetrievalQA

rag_compressor = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever
)
### your code ###


In [None]:
rag_compressor(questions[2]['text'])



{'query': "Why do more students tune into Mark's show?",
 'result': " Because Mark's show is really popular and entertaining!"}

Now we can use the pipeline to generate answers for all the questions in our dataset. **It should take around 20 minutes on a T4 GPU on Colab.**

In [None]:
compressor_answers=[]
### your code ###
for quest in tqdm(questions):
  compressor_answers.append(rag_compressor(quest['text'])['result'])
### your code ###


100%|██████████| 30/30 [25:14<00:00, 50.47s/it]


✅ Point distribution ✅
- 0.5 point if the text is correctly split.
- 1 point for initializing Chroma db as a retreiever and feeding the documents.
- 0.5 point for simple RAG pipline.
- 0.25 point for generating answers with simple RAG.
- 1 point for the correct compressor retriever.
- 0.5 point for compressor RAG pipline.
- 0.25 point for generating answers with compressor RAG.



#### ${\color{red}{Comments\ 2.1}}$

${\color{red}{⚠️Comments\ begin⚠️}}$


```
cross-feedback comment section
```


${\color{red}{⚠️Comments\ end⚠️}}$

##Subtask 2.2. Evaluate

Since we have access to ground truth answers, we can use various evaluation metrics from the literature. In this task, we explore three metrics:


1.   **BLEU:** BLEU score stands for Bilingual Evaluation Understudy and is a precision-based metric developed
for evaluating machine translation. BLEU scores a candidate by computing the
number of n-grams in the candidate that also appear
in a reference. The n can vary, in this task we compute for n=4.
2.   **ROUGE:** ROUGE score stands for Recall-Oriented Understudy for Gisting Evaluation and is an F-measure metric designed for
evaluating translation and summarization. There are a number of variants of ROUGE.
3. **BERTScore:** BERTScore first obtains BERT representation of each word in the candidate and reference by feeding the candidate
and reference through a BERT model separately.
An alignment is then computed between candidate
and reference words by computing pairwise cosine
similarity. This alignment is then aggregated in to
precision and recall scores before being aggregated
into a (modified) F1 score that is weighted using
inverse-document-frequency values.

Luckily, Hugging Face has an implementation for all these metrics. Use the `evaluate` library to load the metrics.

Use the loaded metrics to compare the RAG pipelines from the previous subtask.

In [None]:
import evaluate
### your code ###
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")
### your code ###

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

As seen in the previous subtask, the answers can contain multiple lines. To be able to compare the output of our systems to the gold answers, merge the multiple answers into a single string.

In [None]:
answers_merged=[]
### your code ###
for answer in answers:
  multi_part=[]
  for ans in answer:
    multi_part.append(ans['text'])
  answers_merged.append(' '.join(multi_part))
### your code ###
print(len(answers_merged))

30


Compute the BLUE score for the simple RAG and compressor RAG.

In [None]:
### your code ###
bleu_simple = bleu.compute(predictions=simple_answers, references=answers_merged)
bleu_compressor = bleu.compute(predictions=compressor_answers, references=answers_merged)
### your code ###
print("Simple system:")
print(bleu_simple)
print("Compressor:")
print(bleu_compressor)

Simple system:
{'bleu': 0.0, 'precisions': [0.11001410437235543, 0.010309278350515464, 0.0015408320493066256, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.39527027027027, 'translation_length': 709, 'reference_length': 296}
Compressor:
{'bleu': 0.0, 'precisions': [0.1066066066066066, 0.007861635220125786, 0.0016501650165016502, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 2.25, 'translation_length': 666, 'reference_length': 296}


What does the elements below in the output of the BLEU impelementation in Hugging Face mean? (do not copy and paste the documentation but write the implications behind each element!).



1.   **precisions:** `your answer`
2.   **brevity_penalty:** `your answer`
3.   **translation_length:** `your answer`
4.   **reference_length:** `your answer`
5.   **length_ratio:** `your answer`




**Answer:**


1.   **precisions:** precision of n-grams, which is calculated as the number of n-grams that appear in both the machine-generated translation and the reference translations divided by the total number of n-grams in the machine-generated translation.
2.   **brevity_penalty:** is a penalty term that adjusts the score for translations that are shorter than the reference translations. It is calculated as min(1, (reference_length / translation_length)). It essentially penalizes generated translations that are too short compared to the closest reference length with an exponential decay.
3.   **translation_length:**   is the total number of words in the machine-generated translation.
4.   **reference_length:**  is the total number of words in the reference translations.
5. **length_ratio:** ratio of the 3 and 4.

In [None]:
### your code ###
rouge_simple = rouge.compute(predictions=simple_answers,references=answers_merged)
rouge_compressor = rouge.compute(predictions=compressor_answers,references=answers_merged)
### your code ###
print("Simple system:")
print(rouge_simple)
print("Compressor:")
print(rouge_compressor)

Simple system:
{'rouge1': 0.12296939231362755, 'rouge2': 0.018555984555984558, 'rougeL': 0.11096363586174168, 'rougeLsum': 0.11029701643933229}
Compressor:
{'rouge1': 0.12001440874773897, 'rouge2': 0.02168461243058017, 'rougeL': 0.10570344091836636, 'rougeLsum': 0.10562855990595829}


What is the difference in variants of ROUGE (ROUGE-N, ROUGE-L, ROUGE-SUM)?

`your answer`


**Answer:**

ROUGE measures the similarity between the machine-generated summary and the reference summaries using overlapping n-grams, word sequences that appear in both the machine-generated summary and the reference summaries. The most common n-grams used are unigrams, bigrams, and trigrams. ROUGE score calculates the recall of n-grams in the machine-generated summary by comparing them to the reference summaries.

**ROUGE-N:** ROUGE-N measures the overlap of n-grams (contiguous sequences of n words) between the candidate text and the reference text. It computes the precision, recall, and F1-score based on the n-gram overlap. For example, ROUGE-1 (unigram) measures the overlap of single words, ROUGE-2 (bigram) measures the overlap of two-word sequences, and so on. ROUGE-N is often used to evaluate the grammatical correctness and fluency of generated text.

**ROUGE-L:** ROUGE-L measures the longest common subsequence (LCS) between the candidate text and the reference text. It computes the precision, recall, and F1-score based on the length of the LCS. ROUGE-L is often used to evaluate the semantic similarity and content coverage of generated text, as it considers the common subsequence regardless of word order.

**ROUGE-S:** ROUGE-S measures the skip-bigram (bi-gram with at most one intervening word) overlap between the candidate text and the reference text. It computes the precision, recall, and F1-score based on the skip-bigram overlap. ROUGE-S is often used to evaluate the coherence and local cohesion of generated text, as it captures the semantic similarity between adjacent words.



In [None]:
import numpy as np
### your code ###
bertscore_simple = bertscore.compute(predictions=simple_answers, references=answers_merged, lang="en")
bertscore_compressor = bertscore.compute(predictions=compressor_answers, references=answers_merged, lang="en")
bertscore_simple_averaged={}
bertscore_compressor_averaged={}
for key in bertscore_simple.keys():
  if key!='hashcode':
    bertscore_simple_averaged[key]=np.mean(bertscore_simple[key])
    bertscore_compressor_averaged[key]=np.mean(bertscore_compressor[key])

### your code ###
print("Simple system:")
print(bertscore_simple_averaged)
print("Compressor:")
print(bertscore_compressor_averaged)

Simple system:
{'precision': 0.8435829440752666, 'recall': 0.8557040333747864, 'f1': 0.8494029025236766}
Compressor:
{'precision': 0.8397547423839569, 'recall': 0.8533495982487996, 'f1': 0.8463238557179769}


Which model works better?

✅ Point distribution ✅
- 0.5 point for loading the metrics.
- 0.5 point for parsing the answers.
- 0.5 point computation of BLEU.
- 0.25 *5 = 1.25 points for meaning of each part of BLEU score.
- 0.5 point computation of ROUGE.
- 0.25 *3= 0.75 point for variants for ROUGE.
- 1 point computation of BERTScore.



#### ${\color{red}{Comments\ 2.2}}$

${\color{red}{⚠️Comments\ begin⚠️}}$


```
cross-feedback comment section
```


${\color{red}{⚠️Comments\ end⚠️}}$