Register models on Azure

In [14]:
from packages.azureml_functions import get_ws
from sentence_transformers import SentenceTransformer, CrossEncoder

ws = get_ws()
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [2]:
bi_encoder.save('models/bi_encoder')
cross_encoder.save('models/cross_encoder')

In [3]:
from azureml.core import Model

Model.register(workspace=ws, model_name="untrained_bi_encoder", model_path='models/bi_encoder')
Model.register(workspace=ws, model_name="untrained_cross_encoder", model_path='models/cross_encoder')


Registering model untrained_bi_encoder
Registering model untrained_cross_encoder


Model(workspace=Workspace.create(name='SemanticSearch_TRAIN', subscription_id='9da3a5d6-6bf3-4b2c-8219-88caf39f718d', resource_group='Semantic_Search'), name=untrained_cross_encoder, id=untrained_cross_encoder:1, version=1, tags={}, properties={})

Upload a example dataset of context embeddings

In [30]:
import json
import numpy as np
import pandas as pd
from datasets import Dataset
import lorem 
import os

corpus_size = 1000000
embeddings_size = 384

step_size = 1
with open('mock_data.jsonl', 'w') as f:
    for size in range(0, 1000000, step_size):
        df = {}
        fake_embeddings = np.random.randn(step_size, embeddings_size)
        fake_embeddings = np.float32(fake_embeddings)
        df['embedding'] = fake_embeddings.tolist()
        df['_id'] = ["file/path_"+str(size + i) for i in range(step_size)]
        df['text'] = [lorem.paragraph() for i in range(step_size)]

        f.write(json.dumps(df) + "\n")

In [2]:
df.columns

Index(['embedding', '_id', 'text'], dtype='object')

In [1]:
# embeddings_ds = Dataset.from_pandas(df)
from datasets import load_dataset
embeddings_ds = load_dataset("json", data_files=["mock_data.jsonl"])
embeddings_ds = embeddings_ds['train']
embeddings_ds.save_to_disk("mock_dataset")
print(embeddings_ds)

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-f549c43b6f62b38c


Downloading and preparing dataset json/default to /home/vscode/.cache/huggingface/datasets/json/default-f549c43b6f62b38c/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 2686.93it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 106.96it/s]
                                  

Dataset json downloaded and prepared to /home/vscode/.cache/huggingface/datasets/json/default-f549c43b6f62b38c/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00,  2.92it/s]


Dataset({
    features: ['embedding', '_id', 'text'],
    num_rows: 1000000
})


In [7]:
from datasets import load_from_disk

def make_numpy(example):
    example['embedding'] = np.array(example['embedding'], dtype=np.float32)
    return example

embeddings_ds = load_from_disk('mock_dataset')
embeddings_ds = embeddings_ds.map(make_numpy)


100%|██████████| 1000000/1000000 [03:47<00:00, 4392.66ex/s]


In [None]:
type(embeddings_ds["embedding"][0])

In [8]:
from math import sqrt
import numpy as np
from datasets import load_from_disk

if 'corpus_size' not in locals():
    corpus_size = 1000000

if embeddings_ds is None:
    print('load dataset from file')
    embeddings_ds = load_from_disk('mock_dataset')

string_factory = f"IVF{int(4*sqrt(corpus_size))},Flat"
# string_factory = f"IVF65536_HNSW32{int(4*sqrt(corpus_size))},Flat"
embeddings_ds.add_faiss_index(column='embedding', string_factory=string_factory, train_size=corpus_size)

ValueError: did not recognize array type

In [6]:
embeddings_ds.save_faiss_index('embedding', 'mock_dataset/embedding_index.faiss')

Upload data and index

In [10]:
from glob import glob
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError

def write_to_blob(workspace, local_path, upload_path):
    datastore = workspace.get_default_datastore()
    
    account_name = datastore.account_name
    account_key = datastore.account_key

    connection_string = f'DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net'

    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(datastore.container_name)

    upload_file = open(local_path, 'rb')
    try:
        print(f"Writing result from {local_path} to {datastore.name}")
        container_client.upload_blob(name=upload_path, data=upload_file)
    except ResourceExistsError:
        print('file name already used')


files = glob("mock_dataset/*")
for f in files:
    write_to_blob(workspace=ws, local_path=f, upload_path=f)


Writing result from mock_dataset/state.json to workspaceblobstore
Writing result from mock_dataset/dataset_info.json to workspaceblobstore
Writing result from mock_dataset/embedding_index.faiss to workspaceblobstore
Writing result from mock_dataset/dataset.arrow to workspaceblobstore


Load in dataset with faiss index

In [13]:
from datasets import load_from_disk
import os
from packages.azureml_functions import get_ws
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError

ws = get_ws()
datastore = ws.get_default_datastore()
account_name = datastore.account_name
account_key = datastore.account_key
connection_string = f'DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net'

blob_service_client = BlobServiceClient.from_connection_string(connection_string)

container_client = blob_service_client.get_container_client('azureml-blobstore-04f263a3-f5c3-4204-9686-0f886c746b64')
blob_gen = container_client.list_blobs(name_starts_with="corpus_ds_with_embedding/corpus_2021-2021_improved/")

for b in blob_gen:
    # step 1: Download the file to local storage
    file_name = b.name.rsplit('/',1)[-1]
    print("Download:", file_name)

    location = "corpus_embeddings"
    os.makedirs(location, exist_ok=True)
    target_path = os.path.join(location, file_name)
    with open(target_path, 'wb') as f:
        f.write(container_client.download_blob(b.name).readall())  


ds = load_from_disk('corpus_embeddings/')
ds.load_faiss_index('embedding', "corpus_embeddings/IVF1922_Flat.faiss")

print(ds)
print(ds.get_index('embedding').faiss_index)

Download: IVF1922_Flat.faiss
Download: dataset.arrow
Download: dataset_info.json
Download: state.json
Dataset({
    features: ['_id', 'doc_id', 'title', 'paragraph_id', 'text', 'chunked', 'embeddings'],
    num_rows: 230885
})
<faiss.swigfaiss.IndexIVFFlat; proxy of <Swig Object of type 'faiss::IndexIVFFlat *' at 0x7f7d8c46e030> >


In [15]:
query = ['When do I retire?']

query_embedding = bi_encoder.encode(query, convert_to_tensor=True)

scores, samples = ds.get_nearest_examples('embedding', query_embedding.cpu().detach().numpy(), k=5)

In [38]:
print(samples.keys())

cross_input = list(map(lambda x: query + [x], samples['text']))
print('_'*100)
output = cross_encoder.predict(cross_input, convert_to_numpy=True)

print(scores)
print(output)
indices = output.argsort()[::-1]

def select(indexs, items):
    return [items[i] for i in indexs]

res_dict = {k: select(indices, v) for k,v in samples.items()}

print('_'*100)
print(indices)
print(samples['_id'])
print(res_dict['_id'])

dict_keys(['_id', 'doc_id', 'title', 'paragraph_id', 'text', 'chunked', 'embeddings'])
____________________________________________________________________________________________________
[0.85465294 0.90902317 0.91939294 0.9937544  0.9987192 ]
[ 0.34544706  3.4341395  -2.6393273  -4.3905954  -1.6300539 ]
____________________________________________________________________________________________________
[1 0 4 2 3]
[167484, 167377, 167777, 67919, 167802]
[167377, 167484, 167802, 167777, 67919]
