In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell executes, so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import time
from pathlib import Path
from typing import List, Tuple, Any

import numpy as np
import torch
import torch.nn.functional as F
from optimum.onnxruntime import AutoOptimizationConfig, ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [None]:
# Choose the Hugging Face model to convert. Alternative model ids are kept
# commented out so they can be swapped in by hand.
# model_id = "sentence-transformers/all-MiniLM-L6-v2"
# model_id = "thenlper/gte-base"
# model_id = "intfloat/multilingual-e5-large"
model_id = "BAAI/bge-base-en"

# Output directory name: "fast-" followed by the part of the id after the "/".
save_dir = "fast-" + model_id.split("/")[1]
print(save_dir)

In [None]:
# Baseline model and tokenizer loaded through transformers; hf_embed() reads these
# two globals when it is benchmarked against the ONNX pipeline.
hf_model = AutoModel.from_pretrained(model_id)
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Multilingual benchmark inputs, one short query per language/script.
# NOTE(review): the "query: " / "passage: " prefix convention (every text starts with
# one of them; "query: " for non-retrieval tasks) appears to be carried over from the
# E5 model family (see the commented-out "intfloat/multilingual-e5-large" option).
# The model currently selected is "BAAI/bge-base-en" -- confirm that the prefixes and
# the non-English inputs are intended for it.
input_texts = [
    "query: how much protein should a female eat",
    "query: 南瓜的家常做法",
    "query: भारत का राष्ट्रीय खेल कौन-सा है?", # Hindi text
    "query: భారత్ దేశంలో రాష్ట్రపతి ఎవరు?", # Telugu text
    "query: இந்தியாவின் தேசிய கோப்பை எது?", # Tamil text
    "query: ಭಾರತದಲ್ಲಿ ರಾಷ್ಟ್ರಪತಿ ಯಾರು?", # Kannada text
    "query: ഇന്ത്യയുടെ രാഷ്ട്രീയ ഗാനം എന്താണ്?", # Malayalam text
]

# English-only benchmark inputs (no "query: " prefix), timed separately from the
# multilingual set above.
english_texts = [
    "India: Where the Taj Mahal meets spicy curry.",
    "Machine Learning: Turning data into knowledge, one algorithm at a time.",
    "Python: The language that makes programming a piece of cake.",
    "fastembed: Accelerating embeddings for lightning-fast similarity search.",
    "Qdrant: The ultimate tool for high-dimensional indexing and search."
]

In [None]:

def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dimension 1, ignoring masked-out positions.

    Positions whose attention-mask value is zero are zeroed before summing, and the
    sum is divided by the per-row total of the mask.

    Assumes ``last_hidden_states`` is (batch, seq, hidden) and ``attention_mask`` is
    (batch, seq) -- TODO confirm against the tokenizer/model output.
    """
    is_padding = ~attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(is_padding, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1)[..., None]
    return summed / token_counts

def hf_embed(model_id: str, inputs: List[str]):
    """Embed ``inputs`` with the baseline Hugging Face model and return similarity scores.

    The texts are tokenized with the global ``hf_tokenizer`` (padded, truncated to 512
    tokens), run through the global ``hf_model``, mean-pooled with ``average_pool`` and
    L2-normalized. The returned tensor holds the dot products of the first two
    embeddings with the remaining ones, scaled by 100.

    Args:
        model_id: Unused by the body; the model comes from the ``hf_model`` global.
            Kept so existing callers that pass it keep working.
        inputs: Texts to embed. The score slice assumes at least three texts -- TODO
            confirm this is what callers want from a pure timing run.

    Returns:
        The scaled similarity scores between ``embeddings[:2]`` and ``embeddings[2:]``.
    """
    # Tokenize the input texts
    batch_dict = hf_tokenizer(inputs, max_length=512, padding=True, truncation=True, return_tensors="pt")

    # Inference only: disable autograd so the baseline timing does not include
    # gradient bookkeeping and no graph is kept alive between runs.
    with torch.no_grad():
        outputs = hf_model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])

        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        scores = (embeddings[:2] @ embeddings[2:].T) * 100
    return scores

In [None]:
# Load the tokenizer and export the model to the ONNX format (export=True).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True)

# Start from a clean output directory: create it if needed, then delete every entry in it.
# Note that save_dir is rebound from a str to a Path here; later cells use the Path.
# NOTE(review): Path.unlink() removes files and symlinks only, so this loop would
# presumably fail if save_dir ever contained a subdirectory -- confirm that cannot happen.
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)
for p in save_dir.iterdir():
    p.unlink()

# Load the optimization configuration detailing the optimization we wish to apply
optimization_config = AutoOptimizationConfig.O3()
optimizer = ORTOptimizer.from_pretrained(model)

# Write the optimizer output into save_dir, then rebind `model` to whatever is loaded
# back from that directory (presumably the optimized model -- TODO confirm).
optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config, use_external_data_format=True)
model = ORTModelForFeatureExtraction.from_pretrained(save_dir)

# Store the tokenizer files alongside the model in save_dir.
# NOTE(review): this happens after `model` was reloaded from save_dir above.
tokenizer.save_pretrained(save_dir)
# model.save_pretrained(save_dir)
# model.push_to_hub("new_path_for_directory", repository_id="my-onnx-repo", use_auth_token=True)

In [None]:
# Feature-extraction pipeline backed by ONNX Runtime for the model loaded from save_dir.
# The tokenizer is passed explicitly: `model` was reloaded from save_dir before the
# tokenizer files were written there, so relying on the pipeline to discover a
# tokenizer from the model on its own is fragile.
# NOTE(review): the name says "quant", but no quantization step is visible in this
# notebook -- only graph optimization.
onnx_quant_embed = pipeline("feature-extraction", model=model, tokenizer=tokenizer, accelerator="ort")


def measure_pipeline_time(pipeline, input_texts: List[str], num_runs: int = 10, **kwargs: Any) -> Tuple[float, float, float]:
    """Time repeated calls of ``pipeline`` on ``input_texts``.

    Args:
        pipeline: Callable invoked as ``pipeline(inputs=input_texts, **kwargs)``. Inside
            this function the parameter name shadows the ``pipeline`` factory imported
            at the top of the notebook.
        input_texts: Texts handed to the callable on every run.
        num_runs: Number of timed invocations; must be at least 1.
        **kwargs: Extra keyword arguments forwarded unchanged to ``pipeline``.

    Returns:
        ``(mean_time, std_dev, chars_per_second)``: mean and standard deviation of the
        per-run duration in seconds, and the total character count of ``input_texts``
        divided by the mean duration.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (no timings to average).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be >= 1, got {num_runs}")

    total_chars = sum(len(text) for text in input_texts)
    times = []
    for _ in range(num_runs):
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        pipeline(inputs=input_texts, **kwargs)
        times.append(time.perf_counter() - start_time)

    mean_time = float(np.mean(times))
    std_dev = float(np.std(times))
    # Guard against a zero mean (timer resolution) instead of dividing by zero.
    chars_per_second = total_chars / mean_time if mean_time > 0 else float("inf")
    return mean_time, std_dev, chars_per_second


# Ours (ONNX Runtime pipeline)

In [None]:
# Throughput of the ONNX Runtime pipeline on both text sets; only the third value
# returned by measure_pipeline_time (characters per second) is reported.
*_, chars_per_sec = measure_pipeline_time(onnx_quant_embed, input_texts)
print(f"Multilingual Speed: {chars_per_sec:.2f} chars/sec")

*_, chars_per_sec = measure_pipeline_time(onnx_quant_embed, english_texts)
print(f"English Speed: {chars_per_sec:.2f} chars/sec")

# Original (Hugging Face baseline)

In [None]:
# Throughput of the baseline hf_embed function on the same text sets. model_id is
# forwarded to hf_embed through measure_pipeline_time's **kwargs.
*_, chars_per_sec = measure_pipeline_time(hf_embed, input_texts, model_id=model_id)
print(f"Multilingual Speed: {chars_per_sec:.2f} chars/sec")

*_, chars_per_sec = measure_pipeline_time(hf_embed, english_texts, model_id=model_id)
print(f"English Speed: {chars_per_sec:.2f} chars/sec")

# Compress & Upload

## Compress

In [None]:
import os
from pathlib import Path
import tarfile


def compress(directory_path):
    """Pack ``directory_path`` into ``<directory name>.tar.gz`` in the working directory.

    The directory is stored in the archive under its own base name. If an archive with
    the target name already exists, nothing is written: a message is printed and the
    existing file's name is returned as-is.

    Args:
        directory_path: Path (str or Path) of an existing directory.

    Returns:
        The archive file name, relative to the current working directory.

    Raises:
        AssertionError: If ``directory_path`` does not exist.
    """
    source_dir = Path(directory_path)
    assert source_dir.exists(), f"{source_dir} does not exist"

    archive_name = f"{source_dir.name}.tar.gz"
    if not Path(archive_name).exists():
        with tarfile.open(archive_name, "w:gz") as archive:
            archive.add(source_dir, arcname=os.path.basename(source_dir))
    else:
        print("We've an output file already? Manually delete that first")
    return archive_name


# Archive the output directory; the returned archive name is what the upload cell sends.
compressed_file_name = compress(save_dir)

## Upload to Qdrant Google Cloud Storage

In [None]:
from google.cloud import storage


def upload(bucket_name, source_file_path, project="main"):
    """Upload a local file to a Google Cloud Storage bucket.

    The object is named after the file's base name, so any directory components of
    ``source_file_path`` are dropped in the bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_path: Path of the local file to upload.
        project: Project passed to ``storage.Client``; defaults to ``"main"``, the
            value that was previously hard-coded.

    Raises:
        FileNotFoundError: If ``source_file_path`` is not an existing file. This is
            checked before any storage client is created.
    """
    if not os.path.isfile(source_file_path):
        raise FileNotFoundError(f"{source_file_path} does not exist or is not a file")

    storage_client = storage.Client(project=project)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(os.path.basename(source_file_path))

    blob.upload_from_filename(source_file_path)

    print(f"File {source_file_path} uploaded to {bucket_name}.")


# Send the archive produced by compress() to the "qdrant-fastembed" bucket.
upload("qdrant-fastembed", compressed_file_name)

In [None]:
# Remove the directory and compressed file
# Both commands run in the shell with save_dir interpolated. The second path matches
# the archive written by compress() as long as save_dir has no parent directory
# components, since compress() names the archive after the directory's base name.
!rm -rvf {save_dir}
!rm -vf {save_dir}.tar.gz