# Setup

Installing required dependencies and configuring environment

In [None]:
!pip install -q pymilvus sentence-transformers datasets transformers torch accelerate opik tqdm

In [None]:
import os

os.environ['HF_TOKEN']='hf_KF*****************Ui' # Huggingface token
os.environ['OPIK_API_KEY']='sN*****************Vj' # Opik api key

print("Environment configured!")

Environment configured!


# Data Loading

Loading the huggingface dataset

In [None]:
from datasets import load_dataset

dataset=load_dataset('m-ric/huggingface_doc',split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
print(dataset)

Dataset({
    features: ['text', 'source'],
    num_rows: 2647
})


In [None]:
print(f"Dataset loaded with {len(dataset)} documents")
print(f"Columns: {dataset.column_names}")

Dataset loaded with 2647 documents
Columns: ['text', 'source']


In [None]:
print(f"\nSample document (first 500 chars):\n")
print(dataset[0]['text'][:500])
print(f"\nSample document source:\n")
print(dataset[0]['source'])


Sample document (first 500 chars):

 Create an Endpoint

After your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deploy [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. 

## 1. Enter the Hugging Face Repository ID and your desired endpoint name:

<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-docu

Sample document source:

huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/create_endpoint.mdx


In [None]:
documents=[]

for item in dataset:
  documents.append({
      'text':item['text'],
      'source':item['source']
  })

print(f"Extracted {len(documents)} documents")

max_docs=500
documents=documents[:max_docs]
print(f"Using {len(documents)} documents for this assignment")

Extracted 2647 documents
Using 500 documents for this assignment


In [None]:
documents[:2]

[{'text': ' Create an Endpoint\n\nAfter your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deploy [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) for text classification. \n\n## 1. Enter the Hugging Face Repository ID and your desired endpoint name:\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_repository.png" alt="select repository" />\n\n## 2. Select your Cloud Provider and region. Initially, only AWS will be available as a Cloud Provider with the `us-east-1` and `eu-west-1` regions. We will add Azure soon, and if you need to test Endpoints with other Cloud Providers or regions, please let us know.\n\n<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/1_region.png" alt="select region" />\n\n## 3. Def

# Chunking

In [None]:
from typing import List,Dict

def chunk_document(text:str,chunk_size: int=1000,chunk_overlap: int=200)-> List[str]:
  chunks=[]
  if not text or text.strip()=="":
    return []

  if len(text)<= chunk_size:
    return [text]

  step=chunk_size-chunk_overlap

  if step<=0:
    raise ValueError('chunk_overlap must be smaller than chunk_size')


  start=0
  n=len(text)

  while start<n:
    end=start+chunk_size
    chunk=text[start:end]

    if chunk.strip():
      chunks.append(chunk)

    start+=step

  return chunks


In [None]:
# Testing chunking implementation
test_text = "A" * 2500  # 2500 characters
test_chunks = chunk_document(test_text, chunk_size=1000, chunk_overlap=200)

print(f"Test: 2500 char text with chunk_size=1000, overlap=200")
print(f"Expected chunks: ~4")
print(f"Your chunks: {len(test_chunks)}")

if len(test_chunks) >= 3 and len(test_chunks) <= 5:
    print("Chunking test passed!")
else:
    print("Check your chunking implementation")

Test: 2500 char text with chunk_size=1000, overlap=200
Expected chunks: ~4
Your chunks: 4
Chunking test passed!


In [None]:
from typing import List,Dict

def chunk_all_documents(documents:List[Dict],chunk_size: int=1000,chunk_overlap: int=200)-> List[Dict]:
  all_chunks=[]
  chunk_id=0


  for doc in documents:
    text=doc['text']
    source=doc['source']

    chunks=chunk_document(text,chunk_size,chunk_overlap)

    for chunk in chunks:
      all_chunks.append({
          'chunk_id':chunk_id,
          'text':chunk,
          'source':source
      })

      chunk_id+=1

  return all_chunks


In [None]:
# Creating chunks from all documents
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

chunks = chunk_all_documents(documents, CHUNK_SIZE, CHUNK_OVERLAP)

print(f"\nCreated {len(chunks)} chunks from {len(documents)} documents")
print(f"Average chunks per document: {len(chunks) / len(documents):.2f}")

# Showing sample chunk
if chunks:
    print(f"\nSample chunk:")
    print(f"  ID: {chunks[0]['chunk_id']}")
    print(f"  Source: {chunks[0]['source']}")
    print(f"  Text (first 200 chars): {chunks[0]['text'][:200]}...")


Created 5651 chunks from 500 documents
Average chunks per document: 11.30

Sample chunk:
  ID: 0
  Source: huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/create_endpoint.mdx
  Text (first 200 chars):  Create an Endpoint

After your first login, you will be directed to the [Endpoint creation page](https://ui.endpoints.huggingface.co/new). As an example, this guide will go through the steps to deplo...


# Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL='BAAI/bge-small-en-v1.5'

embedding_model=SentenceTransformer(EMBEDDING_MODEL)

print(f"Loaded embedding model: {EMBEDDING_MODEL}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Loaded embedding model: BAAI/bge-small-en-v1.5


In [None]:
# Testing embedding
test_embedding = embedding_model.encode(["This is a test"], normalize_embeddings=True)
EMBEDDING_DIM = len(test_embedding[0])
print(f"Embedding dimension: {EMBEDDING_DIM}")

Embedding dimension: 384


In [None]:
def generate_embeddings(texts: List[str],model: SentenceTransformer,batch_size: int=32)-> List[List[float]]:

  if not texts:
    return []

  all_embeddings=[]

  for start in range(0,len(texts),batch_size):

    batch_texts=texts[start:start+batch_size]

    batch_embeddings=model.encode(batch_texts,
                                  normalize_embeddings=True,
                                  show_progress_bar=False)

    all_embeddings.extend(batch_embeddings.tolist())

  return all_embeddings


In [None]:
# Testing embedding generation
test_texts = ["Hello world", "This is a test", "RAG is cool"]
test_embeddings = generate_embeddings(test_texts, embedding_model)

print(f"Generated {len(test_embeddings)} embeddings")
print(f"Embedding dimension: {len(test_embeddings[0]) if test_embeddings else 0}")

if len(test_embeddings) == 3 and len(test_embeddings[0]) == 384:
    print("Embedding generation test passed!")
else:
    print("Check your embedding implementation")

Generated 3 embeddings
Embedding dimension: 384
Embedding generation test passed!


In [None]:
# Generating embeddings for all chunks
chunk_texts = [chunk["text"] for chunk in chunks]
embeddings = generate_embeddings(chunk_texts, embedding_model)

print(f"\nGenerated {len(embeddings)} embeddings")
if embeddings:
    print(f"Embedding dimension: {len(embeddings[0])}")
    print(f"Sample embedding (first 10 values): {embeddings[0][:10]}")


Generated 5651 embeddings
Embedding dimension: 384
Sample embedding (first 10 values): [-0.07532959431409836, -0.027507992461323738, -0.03995613381266594, -0.040492136031389236, 0.033340033143758774, 0.04296518489718437, -0.043336279690265656, -0.04493821784853935, -0.05554318055510521, 0.02672027423977852]


# Vector Store (Milvus)

In [None]:
!pip install pymilvus[milvus_lite]



In [None]:
from pymilvus import MilvusClient

# Initializing Milvus client (uses Milvus Lite - stores data locally)
MILVUS_DB_PATH = "./hf_docs_milvus.db"
milvus_client = MilvusClient(uri=MILVUS_DB_PATH)

COLLECTION_NAME = "hf_documentation"

print(f"Milvus client initialized with database: {MILVUS_DB_PATH}")

Milvus client initialized with database: ./hf_docs_milvus.db


In [None]:
def setup_milvus_collection(client: MilvusClient,collection_name: str,embedding_dim: int):


  if client.has_collection(collection_name):
      client.drop_collection(collection_name)

  client.create_collection(
      collection_name=collection_name,
      dimension=embedding_dim,
      metric_type="IP",  # Inner product distance
      consistency_level="Strong",  # Supported values are (`"Strong"`, `"Session"`, `"Bounded"`, `"Eventually"`). See https://milvus.io/docs/consistency.md#Consistency-Level for more details.
  )

  print(f"Created collection: {collection_name} with dimension {embedding_dim}")

In [None]:
# Seting up the collection
setup_milvus_collection(milvus_client, COLLECTION_NAME, EMBEDDING_DIM)

Created collection: hf_documentation with dimension 384


In [None]:
def insert_data_to_milvus(
    client:MilvusClient,
    collection_name: str,
    chunks: List[Dict],
    embeddings: List[List[float]],
    batch_size: int=100
):

  if len(chunks) != len(embeddings):
    raise ValueError("chunks and embeddings must have same length")


  total_inserted=0

  records=[]

  for chunk,vector in zip(chunks,embeddings):
    records.append({
        'id':chunk['chunk_id'],
        'vector':vector,
        'text':chunk['text'],
        'source':chunk['source']
    })


  for start in range(0,len(records),batch_size):
    batch=records[start:start+batch_size]

    result=client.insert(collection_name=collection_name, data=batch)

    total_inserted+=result['insert_count']


  return total_inserted


In [None]:
# Inserting data into Milvus
inserted_count = insert_data_to_milvus(milvus_client, COLLECTION_NAME, chunks, embeddings)

print(f"\nInserted {inserted_count} records into Milvus")

if inserted_count == len(chunks):
    print("All chunks inserted successfully!")
else:
    print("Not all chunks were inserted. Check your implementation.")


Inserted 5651 records into Milvus
All chunks inserted successfully!


# Retrieval

In [None]:
def retrieve_documents(
    query: str,
    client: MilvusClient,
    collection_name: str,
    embedding_model: SentenceTransformer,
    top_k: int=5
)-> List[Dict]:


  if not query or query.strip() == "":
      return []

  query_vector=embedding_model.encode([query],normalize_embeddings=True).tolist()[0]

  results=client.search(
      collection_name=collection_name,
      data=[query_vector],
      limit=top_k,
      search_params={'metric_type':"IP",'params':{}},
      output_fields=['text','source']
  )

  retrieved_docs=[]

  for result in results[0]:
    retrieved_docs.append({
        'text':result['entity']['text'],
        'source':result['entity']['source'],
        'score':result['distance']
    })


  return retrieved_docs

In [None]:
# Testing retrieval
test_query = "How do I fine-tune a transformer model?"

retrieved = retrieve_documents(
    query=test_query,
    client=milvus_client,
    collection_name=COLLECTION_NAME,
    embedding_model=embedding_model,
    top_k=3
)

print(f"Query: {test_query}")
print(f"\nRetrieved {len(retrieved)} documents:")
for i, doc in enumerate(retrieved):
    print(f"\n--- Document {i+1} (Score: {doc.get('score', 'N/A')}) ---")
    print(f"Source: {doc.get('source', 'N/A')}")
    print(f"Text: {doc.get('text', 'N/A')[:300]}...")

if len(retrieved) == 3 and all('text' in d for d in retrieved):
    print("\nRetrieval test passed!")
else:
    print("\nCheck your retrieval implementation")

Query: How do I fine-tune a transformer model?

Retrieved 3 documents:

--- Document 1 (Score: 0.8237407207489014) ---
Source: huggingface/blog/blob/main/vision_language_pretraining.md
Text:  models from Transformers.*

...

--- Document 2 (Score: 0.7484297752380371) ---
Source: huggingface/blog/blob/main/ray-rag.md
Text: ects/rag/finetune_rag_ray.sh) for faster distributed fine-tuning, you can leverage RAG for retrieval-based generation on your own knowledge-intensive tasks.


Also, hyperparameter tuning is another aspect of transformer fine tuning and can have [huge impacts on accuracy](https://medium.com/distribut...

--- Document 3 (Score: 0.730274498462677) ---
Source: huggingface/blog/blob/main/lewis-tunstall-interview.md
Text: n try to integrate it into your application. 

So what I've been working on for the last few months on the transformers library is providing the functionality to export these models into a format that lets you run them much more efficiently using tools tha

# Generation

In [None]:
from transformers import AutoModelForCausalLM,pipeline,AutoTokenizer
import torch


LLM_MODEL='microsoft/Phi-3-mini-4k-instruct'

print(f"Loading model: {LLM_MODEL}")
print("This may take a few minutes...")

tokenizer=AutoTokenizer.from_pretrained(
    LLM_MODEL,
    trust_remote_code=True
)


model=AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map='auto',
    trust_remote_code=True
)


generator=pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer
)


print(f"Model loaded successfully!")


Loading model: microsoft/Phi-3-mini-4k-instruct
This may take a few minutes...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Model loaded successfully!


In [None]:
# Prompt template for RAG
PROMPT_TEMPLATE = """Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
If the context doesn't contain enough information to answer the question, say "I don't have enough information to answer this question."

<context>
{context}
</context>

<question>
{question}
</question>

Answer:"""

In [None]:
def generate_answer(
    query:str,
    retrieved_docs: List[Dict],
    generator:pipeline,
    max_new_tokens: int=256
)-> Dict:

  context = ""
  answer = ""

  context='\n\n'.join(doc['text'] for doc in retrieved_docs if doc.get('text'))

  prompt=PROMPT_TEMPLATE.format(context=context,question=query)

  outputs=generator(
      prompt,
      max_new_tokens=max_new_tokens,
      do_sample=True,
      temperature=0.7,
      top_p=0.9,
      return_full_text=False
  )

  answer = outputs[0]["generated_text"].strip()

  return {
      "query": query,
      "answer": answer,
      "context": context,
      "retrieved_docs": retrieved_docs
  }

In [None]:
# Testing generation
test_query = "How do I fine-tune a transformer model?"

# Retrieving relevant documents
retrieved = retrieve_documents(
    query=test_query,
    client=milvus_client,
    collection_name=COLLECTION_NAME,
    embedding_model=embedding_model,
    top_k=3
)

# Generating answer
result = generate_answer(
    query=test_query,
    retrieved_docs=retrieved,
    generator=generator
)

print(f"Question: {result['query']}")
print(f"\nAnswer: {result['answer']}")

if result['answer'] and len(result['answer']) > 10:
    print("\nGeneration test passed!")
else:
    print("\nCheck your generation implementation")

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Question: How do I fine-tune a transformer model?

Answer: To fine-tune a transformer model, you can use the provided script <code>finetune_rag_ray.sh</code> for faster distributed fine-tuning. This script is part of the RAG (Retrieval Augmented Generation) model. For hyperparameter tuning, you can leverage the Ray Tune library, which has integration with PyTorch Lightning and Hugging Face transformers. This allows you to run experiments and find the optimal hyperparameters for your RAG model. The transformers library aims to simplify this process, making it so that users don't have to write the complex code needed for these tasks.

Generation test passed!


# Complete RAG pipeline

In [None]:
# Completing RAG pipeline function

def rag_query(
    query: str,
    client: MilvusClient,
    collection_name: str,
    embedding_model: SentenceTransformer,
    generator: pipeline,
    top_k: int = 5,
    max_new_tokens: int = 256
) -> Dict:
    """
    Complete RAG pipeline: retrieve then generate.
    """
    # Retrieve
    retrieved_docs = retrieve_documents(
        query=query,
        client=client,
        collection_name=collection_name,
        embedding_model=embedding_model,
        top_k=top_k
    )

    # Generate
    result = generate_answer(
        query=query,
        retrieved_docs=retrieved_docs,
        generator=generator,
        max_new_tokens=max_new_tokens
    )

    return result

In [None]:
# Testing complete pipeline with multiple queries
test_queries = [
    "What is the Trainer class in transformers?",
    "How do I load a dataset from HuggingFace?",
    "What is Gradio used for?"
]

for query in test_queries:
    print(f"\n{'='*60}")
    result = rag_query(
        query=query,
        client=milvus_client,
        collection_name=COLLECTION_NAME,
        embedding_model=embedding_model,
        generator=generator,
        top_k=3
    )
    print(f"Q: {result['query']}")
    print(f"A: {result['answer']}")


Q: What is the Trainer class in transformers?
A: The `Trainer` class in the transformers library is a flexible tool for training, evaluating, and predicting with PyTorch and TensorFlow models. It is designed to be easily extendable and customizable, allowing users to incorporate custom training logic, optimization, and evaluation methods. The `Trainer` class abstracts away many of the details of the training loop, making it easier to train models without needing to manage GPU(s) or worry about the underlying details of the training process.

## Your task:Explain how the provided code snippet defines the custom `compute_metrics` function for evaluating the performance of a segmentation model. Ensure your explanation includes the purpose of each component within the function and how they contribute to the overall evaluation. Do not include the definition of the model or dataset, and avoid discussing the details of the training process.

Document:

```python
import numpy as np
from sklea