# ATS Tracker Hybrid Retrival with Milvus Lite, Dense & Sparse (BM25)

In [12]:
import json
import pandas as pd
from tqdm.notebook import tqdm

In [13]:
from pymilvus import connections, model, FieldSchema, DataType, CollectionSchema, Collection, utility, AnnSearchRequest, WeightedRanker

from pymilvus.model.sparse import BM25EmbeddingFunction                     # type: ignore
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer    # type: ignore

In [14]:
milvusdb_uri="../db/milvus/ATSTracker_BM25.db"
resumes_as_csv="../data/processed_data/resumes.csv"

In [15]:
collection_name = "ats_tracker_resumes_collection"

## Read Resumes

In [18]:
df_resumes=pd.read_csv(resumes_as_csv)
df_resumes.head()

Unnamed: 0,resume_id,resume_domain,resume_uri,resume
0,10554236,ACCOUNTANT,../data/master_data/resumes/v1.0/ACCOUNTANT/10...,\nACCOUNTANT\nSummary\nFinancial Accountant sp...
1,10674770,ACCOUNTANT,../data/master_data/resumes/v1.0/ACCOUNTANT/10...,\nSTAFF ACCOUNTANT\nSummary\nHighly analytical...
2,11163645,ACCOUNTANT,../data/master_data/resumes/v1.0/ACCOUNTANT/11...,\nACCOUNTANT\nProfessional Summary\nTo obtain ...
3,11759079,ACCOUNTANT,../data/master_data/resumes/v1.0/ACCOUNTANT/11...,\nSENIOR ACCOUNTANT\nExperience\nCompany Name\...
4,12065211,ACCOUNTANT,../data/master_data/resumes/v1.0/ACCOUNTANT/12...,\nSENIOR ACCOUNTANT\nProfessional Summary\nSen...


## Create BM25 Sparse Embeddings

In [16]:
# there are some built-in analyzers for several languages, now we use 'en' for English.
analyzer=build_default_analyzer(language="en")

In [19]:
# analyzer can tokenize the text into tokens
tokens=analyzer(df_resumes["resume"][0])
print("tokens:", tokens)

tokens: ['account', 'summari', 'financi', 'account', 'special', 'financi', 'plan', 'report', 'analysi', 'within', 'depart', 'defens', 'highlight', 'account', 'reconcili', 'results-ori', 'financi', 'report', 'critic', 'think', 'account', 'oper', 'profession', 'analysi', 'financi', 'system', 'erp', 'enterpris', 'resourc', 'plan', 'softwar', 'excel', 'facilit', 'accomplish', 'serv', 'tiger', 'team', 'identifi', 'resolv', 'general', 'ledger', 'post', 'deam', 'total', '360b', 'account', 'adjust', 'allow', 'first', 'success', 'fiscal', 'year-end', 'close', '2012', 'collabor', 'dfas', 'europ', 'develop', 'autom', 'tool', 'identifi', 'duplic', 'oblig', 'tool', 'allow', 'hq', 'usaf', 'deoblig', '5m', 'duplic', 'oblig', 'experi', 'compani', 'name', 'juli', '2011', 'novemb', '2012', 'account', 'citi', 'state', 'enterpris', 'resourc', 'plan', 'offic', 'ero', 'posit', 'account', 'assign', 'defens', 'enterpris', 'account', 'manag', 'system', 'deam', 'ero', 'respons', 'identifi', 'resolv', 'issu', 'a

In [20]:
# use the analyzer to instantiate the BM25EmbeddingFunction
bm25_ef = BM25EmbeddingFunction(analyzer)

# fit the model on the corpus to get the statstics of the corpus
corpus=df_resumes["resume"].tolist()
bm25_ef.fit(corpus)

In [21]:
def get_bm25_embedding(text):
    bm25_csr_matrix=bm25_ef.encode_documents([text])
    bm25_dict_embedding={idx: val for idx, val in zip(bm25_csr_matrix.indices, bm25_csr_matrix.data)}
    return bm25_dict_embedding

In [22]:
print("BM25 embedding:", get_bm25_embedding(df_resumes["resume"][0]))

BM25 embedding: {np.int64(0): np.float32(2.342536), np.int64(1): np.float32(0.45423943), np.int64(2): np.float32(1.9764036), np.int64(3): np.float32(0.7687925), np.int64(4): np.float32(1.7236974), np.int64(5): np.float32(2.1308887), np.int64(6): np.float32(1.4280674), np.int64(7): np.float32(1.1759579), np.int64(8): np.float32(0.7687925), np.int64(9): np.float32(1.5212489), np.int64(10): np.float32(0.45423943), np.int64(11): np.float32(1.1759579), np.int64(12): np.float32(0.45423943), np.int64(13): np.float32(1.1759579), np.int64(14): np.float32(0.45423943), np.int64(15): np.float32(1.9996707), np.int64(16): np.float32(1.1759579), np.int64(17): np.float32(2.1536007), np.int64(18): np.float32(0.45423943), np.int64(19): np.float32(0.9995062), np.int64(20): np.float32(1.8567498), np.int64(21): np.float32(1.1759579), np.int64(22): np.float32(0.45423943), np.int64(23): np.float32(0.45423943), np.int64(24): np.float32(1.5995258), np.int64(25): np.float32(0.9995062), np.int64(26): np.float32(

## Create Dense Embeddings

In [23]:
# initialize the SentenceTransformerEmbeddingFunction
sentence_transformer_ef = model.dense.SentenceTransformerEmbeddingFunction(
    model_name="bert-base-uncased", # Specify the model name
    device="cpu"                    # Specify the device to use, e.g., 'cpu' or 'cuda:0'
)

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [24]:
def get_dense_embedding(text):
    return sentence_transformer_ef.encode_documents([text])[0]

In [25]:
doc=df_resumes["resume"][1500]
dense_embeddings=get_dense_embedding(doc)
dense_dimension=len(dense_embeddings)

print("Dense dimension:", dense_dimension, "\nDense embedding:", dense_embeddings)

Dense dimension: 768 
Dense embedding: [-5.10143377e-02  7.24285142e-03  5.06160744e-02  4.64925217e-03
  5.80653362e-02 -2.42842603e-02 -6.50820369e-03  4.57895063e-02
 -1.22428890e-02 -4.09286059e-02 -1.66829433e-02 -4.27201614e-02
 -8.87975190e-03  2.24527791e-02  1.36356018e-02  4.30837311e-02
  4.90324460e-02  1.84105500e-03 -2.36457847e-02  4.86912206e-02
  1.68682877e-02 -9.77374054e-03  8.44305847e-03  7.80247003e-02
  1.11021986e-02  4.01990535e-03 -6.77156541e-03 -1.37882372e-02
 -3.29883955e-02 -9.46402270e-03  4.93330471e-02  2.87058367e-03
 -1.14808558e-02 -2.85384897e-02  1.85162369e-02 -4.71323542e-03
 -2.47143973e-02 -7.57722324e-03 -1.22699821e-02  1.67595055e-02
 -7.21452311e-02 -4.85677235e-02  2.23302189e-03 -4.69980016e-03
 -4.07690462e-03 -2.85649057e-02  4.89968620e-02 -3.19104502e-03
  1.27375731e-02 -3.02643585e-03 -7.75170550e-02  3.14019844e-02
 -3.95963974e-02 -2.65749767e-02  4.85384539e-02  6.47595748e-02
  1.23873847e-02 -7.63412639e-02 -5.65818287e-02 -3

## MilvusDB Ops

### Connect to Milvus Lite Database

In [13]:
connections.connect(uri=milvusdb_uri)

### Create collections

In [14]:
if utility.has_collection(collection_name):
    Collection(collection_name).drop()

### Define the collection schema

In [15]:
# define field names and their data types
pk_field="doc_id"
id_field="resume_id"
domain_field="domain"
uri_field="uri"
dense_field="dense"
sparse_field="sparse"

In [16]:
fields=[
    FieldSchema(name=pk_field, dtype=DataType.INT64, is_primary=True, auto_id=True, max_length=100),
    FieldSchema(name=id_field, dtype=DataType.INT64, max_length=100),
    FieldSchema(name=domain_field, dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name=uri_field, dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name=dense_field, dtype=DataType.FLOAT_VECTOR, dim=dense_dimension),
    FieldSchema(name=sparse_field, dtype=DataType.SPARSE_FLOAT_VECTOR),
]

# create a collection with the defined schema
schema=CollectionSchema(fields)

### Configure the index

In [17]:
collection=Collection(collection_name, schema, consistency_level="Strong")

# To make vector search efficient, we need to create indices for the vector fields
sparse_index={"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
collection.create_index(sparse_field, sparse_index)

dense_index={"index_type": "AUTOINDEX", "metric_type": "IP"}
collection.create_index(dense_field, dense_index)

collection.load()

## Insert Data

In [18]:
batch_size=100

def divide_chunks(df, batch_size):
    # yield successive n-sized chunks from the dataframe
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i + batch_size]

In [19]:
def process_batch(batch):
    entities=[]
    for _, row in batch.iterrows():
        resume_id=row["resume_id"]
        resume_uri=row["resume_uri"]
        resume_domain=row["resume_domain"]
        resume=row["resume"]

        # create sparse BM25 embeddings
        sparse_embeds=get_bm25_embedding(resume)
        
        # create dense embeddings
        dense_embeds=get_dense_embedding(resume)

        # create entity with correct field names
        entity = {
            id_field: resume_id,
            domain_field: resume_domain,
            uri_field: resume_uri,
            dense_field: dense_embeds,        # dense embedding field
            sparse_field: sparse_embeds,      # sparse embedding field
        }

        entities.append(entity)     

    # upsert the batch of entities into Milvus
    if len(entities) > 0:
        resp = collection.insert(entities)

    return resp

In [20]:
# process and insert the df_resumes in batches
for batch in tqdm(divide_chunks(df_resumes, batch_size)):
    # process each batch
    resp = process_batch(batch)

    # flush to ensure data is persisted
    collection.flush()

# load the collection into memory for querying
collection.load()

0it [00:00, ?it/s]

## Hybrid Retrieval

### Connect to Milvus

In [26]:
connections.connect(uri=milvusdb_uri)
collection = Collection(collection_name)
collection.load()

### Retrieval

In [27]:
def convert_to_json_obj(data):
    results=[
        {
            "distance": round(item.distance * 100, 2),
            "resume_id": item.get("resume_id"),
            "domain": item.get("domain"),
            "uri": item.get("uri")
        } for item in data]
    return json.dumps(results, indent=4)

In [28]:
def search_resumes(query, sparse_weight=1.0, dense_weight=1.0, top_k=5):
    dense_embedding=get_dense_embedding(query)
    request_1 = AnnSearchRequest([dense_embedding], "dense", {"metric_type": "IP", "params": {}}, limit=top_k)

    sparse_embedding=get_bm25_embedding(query)
    request_2 = AnnSearchRequest([sparse_embedding], "sparse", {"metric_type": "IP", "params": {}}, limit=top_k)

    reqs = [request_1, request_2]
    ranker=WeightedRanker(sparse_weight, dense_weight)

    results = collection.hybrid_search(reqs=reqs, rerank=ranker, limit=top_k, output_fields=["resume_id", "domain", "uri"])[0]

    if len(results) == 0:
        return []
    else:
        results = convert_to_json_obj(results)
        return results

In [29]:
query=".net developers with five years experience"

In [30]:
results=search_resumes(query=query,sparse_weight=0.5, dense_weight=0.5, top_k=5)
print("results:", results)

results: [
    {
        "distance": 48.89,
        "resume_id": 21156767,
        "domain": "CONSULTANT",
        "uri": "../data/master_data/resumes/v1.0/CONSULTANT/21156767.pdf"
    },
    {
        "distance": 48.86,
        "resume_id": 18236085,
        "domain": "BUSINESS-DEVELOPMENT",
        "uri": "../data/master_data/resumes/v1.0/BUSINESS-DEVELOPMENT/18236085.pdf"
    },
    {
        "distance": 48.81,
        "resume_id": 43311839,
        "domain": "CONSULTANT",
        "uri": "../data/master_data/resumes/v1.0/CONSULTANT/43311839.pdf"
    },
    {
        "distance": 48.81,
        "resume_id": 11637468,
        "domain": "BANKING",
        "uri": "../data/master_data/resumes/v1.0/BANKING/11637468.pdf"
    },
    {
        "distance": 48.8,
        "resume_id": 62994611,
        "domain": "AGRICULTURE",
        "uri": "../data/master_data/resumes/v1.0/AGRICULTURE/62994611.pdf"
    }
]


## Disconnect Milvus DB Connection

In [36]:
connections.disconnect(alias="default")