In [5]:
import torch
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

# Embedding model configuration.
# `local_model_path` is handed to HuggingFaceEmbeddings as `cache_folder`,
# i.e. the local directory used for the model files.
local_model_path = "./huggingFaceModels/paraphrase-multilingual-MiniLM-L12-v2"

# Alternative models (kept for reference). Switching models may change the
# embedding dimension, which has to match the Qdrant collection's vector size.
#model_name = "mixedbread-ai/mxbai-embed-large-v1"
#model_name='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing while loading the model.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}

# Embeddings are returned as produced by the model (no L2 normalisation).
encode_kwargs = {'normalize_embeddings': False}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    cache_folder=local_model_path
)



In [6]:
import json


# Load the glossary: a JSON object mapping each English biology parameter name
# to the list of its French names / abbreviations.
# The file contains accented characters, so the encoding is pinned to UTF-8
# rather than relying on the platform default (which is not UTF-8 everywhere).
with open('newFrenchList.json', 'r', encoding='utf-8') as file:
    docs = json.load(file)

print(docs)

{'White Blood Cell (WBC) count': ['numération des leucocytes', 'compte des globules blancs', 'GB'], 'Red Blood Cell (RBC) count': ['numération des globules rouges', 'compte des érythrocytes', 'GR'], 'Hemoglobin (Hb)': ['hémoglobine', 'globine', 'HB'], 'Hematocrit (Hct)': ['hématocrite', 'taux de cellules sanguines', 'HT'], 'Mean Corpuscular Volume (MCV)': ['volume corpusculaire moyen', 'VCM'], 'Mean Corpuscular Hemoglobin (MCH)': ['hémoglobine corpusculaire moyenne', 'HCM'], 'Mean Corpuscular Hemoglobin Concentration (MCHC)': ['concentration en hémoglobine corpusculaire moyenne', 'CHCM'], 'Platelet count': ['numération plaquettaire', 'compte des thrombocytes', 'Plaquettes'], 'Mean Platelet Volume (MPV)': ['volume plaquettaire moyen', 'VPM'], 'Red Cell Distribution Width (RDW)': ['largeur de distribution des globules rouges', 'RDW'], 'Neutrophils': ['neutrophiles', 'polynucléaires neutrophiles', 'PN'], 'Lymphocytes %': ['lymphocytes %', 'pourcentage lymphocytes', 'cellules lymphoïdes'],

In [7]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
import asyncio

url = "100.108.14.46"
client = QdrantClient(url)
collection_exists = client.collection_exists(
    collection_name="biologyParametersStore",
   
    )
if not collection_exists :
    client.create_collection(
        collection_name="biologyParametersStore",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
        )
    
vectors = []


    
for key in docs:
    for parameter in docs[key]:
        # Embed the 'content' property
        vector = await hf.aembed_query(parameter)
        
        # Add other properties as metadata in the payload
        payload = {
            "parameterName": key,
            'frenchNames' : parameter
        }
        vectors.append({'payload': payload, 'vector': vector})
        # Insert vector and metadata into Qdrant
print(vectors[0])

client.upsert(
    collection_name="biologyParametersStore",
    points=[
        PointStruct(id=idx,payload=vec['payload'], vector=vec['vector'])
        for idx, vec in enumerate(vectors)
    ]
)



{'payload': {'parameterName': 'White Blood Cell (WBC) count', 'frenchNames': 'numération des leucocytes'}, 'vector': [-0.22009702026844025, 0.10764839500188828, -0.2657281458377838, -0.05216720700263977, -0.06085629388689995, -0.310315877199173, 0.13154655694961548, 0.27743688225746155, 0.1815439909696579, -0.07791664451360703, 0.14397795498371124, -0.32529789209365845, 0.114191435277462, 0.016504857689142227, -0.3059043884277344, -0.10376081615686417, -0.3693679869174957, 0.151881605386734, -0.4423394799232483, -0.08655091375112534, 0.23028771579265594, 0.09714654088020325, 0.10321376472711563, 0.2560408115386963, 0.09172146767377853, -0.17568762600421906, -0.11752015352249146, -0.17567700147628784, -0.0716068297624588, -0.2830509543418884, 0.1250058114528656, 0.023569706827402115, 0.02750490978360176, 0.010335625149309635, -0.03592571243643761, -0.04944963753223419, -0.007683273404836655, 0.14887836575508118, 0.06082117557525635, -0.06241949647665024, 0.20305080711841583, 0.150640398

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [8]:

query_vector = await hf.aembed_query('T3')

results = client.search(
    collection_name="biologyParametersStore",
    query_vector=query_vector,
    limit=1,
    score_threshold=0.75
)
formatted_results = [
    {
        "id": result.id,
        "score": result.score,
        "payload": result.payload,
    }
    for result in results
]
print(formatted_results)

[{'id': 110, 'score': 0.8318274, 'payload': {'parameterName': 'Free Triiodothyronine (Free T3)', 'frenchNames': 'T3 libre'}}]
