##### `Import Library`

In [21]:
import os
from dotenv import load_dotenv
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Vector DB
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Batch, Filter, MatchValue, FieldCondition 

## Loading Data

In [10]:
data_path = os.path.join(os.getcwd(), 'dataset-semantic', 'articles_new.csv')
df = pd.read_csv(data_path)
df['class'] = ['class-a', 'class-b'] * 250
df

Unnamed: 0,title,id,class
0,Mental Note Vol. 24,3054,class-a
1,Your Brain On Coronavirus,3055,class-b
2,Mind Your Nose,3056,class-a
3,The 4 Purposes of Dreams,3057,class-b
4,Surviving a Rod Through the Head,3058,class-a
...,...,...,...
495,Is It Worth to Invest In Mobile E-commerce App...,3549,class-b
496,Let go of these things for a happier 2021,3550,class-a
497,Not Everyone Will like Your Writing,3551,class-b
498,Is Technology Neutral?,3552,class-a


In [11]:
model_hugging = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2', device='cpu')
model_hugging

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [12]:
vect_length = len(model_hugging.encode(df['title'][0]))
print('Length of Embedding model: {}'.format(vect_length))

Length of Embedding model: 384


In [13]:
_ = load_dotenv(override=True)
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL')

### Qdrant In Code

In [14]:
# connect to Qdrant
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

# Collecting configration
collection_config = VectorParams(
    size=vect_length,
    distance=Distance.COSINE, 
    on_disk=True
)


## Create a collection
try:
    collec_name = 'course'
    client.create_collection(collection_name=collec_name, vectors_config=collection_config)
    print('Collection Craeted Successfuly')
except:
    print(f'Collection {collec_name} Already Exist.')

Collection course Already Exist.


In [15]:
## Check Status of Collection
collection_status = client.get_collection(collection_name=collec_name).status
collection_count_vectors = client.get_collection(collection_name=collec_name).vectors_count

print(f'Status is: {collection_status}')
print(f'Vectors Count is: {collection_count_vectors}')

Status is: green
Vectors Count is: None


### Upserting to Qdrant

In [16]:
## Function for upserting data to Qdrant
def upsert_to_qdrant(df, batch_size=32):

    ## A list for failed_ids
    failed_ids = []

    for batch_start in tqdm(range(0, len(df), batch_size)):

        try:
            ## Prepare batches
            batch_end = min(batch_start+batch_size, len(df))
            titles_batch = df['title'][batch_start: batch_end].tolist()
            ids_batch = df['id'][batch_start: batch_end].tolist()     ## No need to be converted to string (Qdrant need integer)
            
            ## Payload
            payload_batch = [{'class': cls} for cls in df['class'][batch_start: batch_end].tolist()]

            ## Get Embeddings using HuggingFace model
            embeds_batch = model_hugging.encode(titles_batch).tolist()

            ## Prepare to Qdrant
            to_upsert = Batch(ids=ids_batch, vectors=embeds_batch, payloads=payload_batch)

            ## Upsert to Qdrant
            client.upsert(collection_name=collec_name, wait=True, points=to_upsert)


        except Exception as e:
            print(f'Error in upserting: {e}')
            failed_ids.append(ids_batch)

    return failed_ids


## Apply the function
failed_ids = upsert_to_qdrant(df=df, batch_size=32)

100%|██████████| 16/16 [00:29<00:00,  1.87s/it]


In [19]:
## Check Status of Collection
collection_status = client.get_collection(collection_name=collec_name).status
collection_count_vectors = client.get_collection(collection_name=collec_name).points_count

print(f'Status is: {collection_status}')
print(f'Vectors Count is: {collection_count_vectors}')

Status is: green
Vectors Count is: 500


### Query In Qdrant

In [23]:
query_text = 'Neutral Technology'

# Embedding using model
query_embeds = model_hugging.encode(query_text).tolist()

# Search In Pinecone
result = client.search(collection_name=collec_name,
                       query_vector=query_embeds,
                       limit=5, score_threshold=0.25,
                       query_filter=Filter(must=[FieldCondition(key='class', match=MatchValue(value='class-a'))]))
result

[ScoredPoint(id=3552, version=27, score=0.77367496, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3368, version=21, score=0.36533967, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3524, version=26, score=0.27010286, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=3246, version=18, score=0.2518342, payload={'class': 'class-a'}, vector=None, shard_key=None, order_value=None)]

### Delete In Qdrant