In [1]:
import glob
import sys
import os
import json
import pickle
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

# Project root. Defaults to the original author's checkout; set the
# WIKI_CHEAT_ROOT environment variable to run the notebook from another location.
root_path = os.environ.get('WIKI_CHEAT_ROOT', '/home/ec2-user/sarang/wiki_cheat')

# Put the project root first on the import path and make it the working
# directory, so relative paths used later (e.g. 'data/...') resolve against it.
sys.path.insert(0, os.path.abspath(root_path))
os.chdir(root_path)

In [None]:
import logging

import openai
# find_dotenv / load_dotenv are provided by the python-dotenv package; they were
# used below without being imported, which raises NameError on a fresh kernel.
from dotenv import find_dotenv, load_dotenv

### Load the environment file
env_file_path = find_dotenv()
logging.info(f'env_file_path: {env_file_path}')
load_dotenv(env_file_path)

### API keys and tokens needed for interaction with external APIs
openai.api_key = os.getenv('OPENAI_API_KEY')
huggingface_token = os.getenv('HF_TOKEN')

# Load Data dump

In [2]:
%%time
# Relative path: resolved against the current working directory.
data_path = 'data/wikipedia-22-12-simple-cohere-small.pkl'
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# dumps that come from a trusted source.
with open(data_path, 'rb') as fp:
    wiki_data = pickle.load(fp)

CPU times: user 364 ms, sys: 151 ms, total: 516 ms
Wall time: 515 ms


In [3]:
# Display the last record to inspect which fields each entry carries.
wiki_data[-1]

{'id': 485858,
 'title': '1970s in American television',
 'text': 'Many prime-time programs of the 1970s were helmed by independent producers, often in association with a major Hollywood studio. A particularly successful independent producer at the dawn of the decade was Quinn Martin. Martin produced crime shows such as "The Streets of San Francisco", "Cannon", and "Dan August". The latter show was a one-season cop series, starring an up-and-coming actor Burt Reynolds, that would find greater popularity after its star had hit it big at the cinema. Martin\'s success would fade during the last half of the decade. By the end of 1980, the former mega-producer would be left without a single prime-time network series on the air. As for other producers during the period, Norman Lear produced the socially relevant "All in the Family" as well as more innocuous fare such as "One Day at a Time". Jack Webb, a holdover from previous decades, oversaw "Adam-12" and "Emergency!. Glen A. Larson produce

# Load the encoder model and compute the embeddings. This step takes ~13 mins for 500k passages on an A10 node with a batch size of 512

In [3]:
# Alternative checkpoint (Hugging Face hub id), kept for reference:
# model_path = 'sentence-transformers/all-mpnet-base-v2'
# Active checkpoint: a local directory under train_embedder/models
# (presumably a fine-tuned variant of the model above — TODO confirm).
model_path = 'train_embedder/models/sentence-transformers-all-mpnet-base-v2-2024-01-27_20-14-10'
# device='cuda' places the model on the GPU; this cell needs a CUDA-capable machine.
encoder_model = SentenceTransformer(model_path, device='cuda')

In [4]:
# Collect the 'text' field of every record, preserving the order of wiki_data.
wiki_data_text = [
    passage['text']
    for passage in wiki_data
]

In [5]:
%%time
# Embed every passage text in batches of 512 and convert the result to plain
# Python lists via .tolist().
vector = encoder_model.encode(
    wiki_data_text,
    batch_size=512,
    show_progress_bar=True,
).tolist()

Batches:   0%|          | 0/949 [00:00<?, ?it/s]

CPU times: user 16min, sys: 18.4 s, total: 16min 18s
Wall time: 13min 27s


In [6]:
# The upload cells pair vector[idx] with wiki_data[idx], so there must be exactly
# one embedding per record; fail loudly here rather than upload misaligned data.
assert len(vector) == len(wiki_data), (
    f'expected {len(wiki_data)} embeddings, got {len(vector)}'
)
len(vector)

485859

# Use Qdrant to store the embeddings. Create collection and upload records

#### On a small free-tier machine with 1 GB of RAM, the on_disk parameter had to be set to True and memmap_threshold was set to 20000.
#### Have a look at this for more details - https://qdrant.tech/documentation/concepts/storage/#configuring-memmap-storage 

In [None]:
# Read the Qdrant endpoint and API key from the environment (QDRANT_URL /
# QDRANT_API_KEY) instead of pasting them into the notebook, in the same way the
# other API keys in this notebook are obtained.
qdrant_client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_API_KEY'),
)

In [39]:
# Create the collection with on-disk vector storage and a memmap threshold.
qdrant_client.create_collection(
    collection_name="wiki_all-mpnet-base-v2",
    vectors_config=models.VectorParams(
        # Take the vector size from the loaded encoder rather than hard-coding 768,
        # so the collection always matches the model that produced the embeddings.
        size=encoder_model.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
        on_disk=True,
    ),
    optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000),
)

True

In [22]:
# WARNING: recreate_collection replaces any existing collection of this name,
# so all previously uploaded points are lost. Run this only to start from scratch.
# The storage settings (on_disk / memmap_threshold) are repeated here so that a
# rebuilt collection still fits on a small-memory instance.
qdrant_client.recreate_collection(
    collection_name="wiki_all-mpnet-base-v2",
    vectors_config=models.VectorParams(
        size=encoder_model.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
        on_disk=True,
    ),
    optimizers_config=models.OptimizersConfigDiff(memmap_threshold=20000),
)

True

In [40]:
%%time
# Upload one record per passage: the positional index is the point id, the
# matching embedding is the vector, and the full source record is the payload.
# A generator is passed instead of a list so that the Record objects are built
# lazily rather than all being held in memory at once.
qdrant_client.upload_records(
    collection_name="wiki_all-mpnet-base-v2",
    records=(
        models.Record(
            id=idx, vector=vector[idx], payload=doc
        )
        for idx, doc in enumerate(wiki_data)
    ),
)



CPU times: user 3min 46s, sys: 12.3 s, total: 3min 58s
Wall time: 27min 37s


In [None]:
# Smoke-test the collection: embed a sample query with the same encoder used for
# the passages and print the three closest hits.
# (The encoder is bound to `encoder_model`; the previous `model` name was undefined.)
hits = qdrant_client.search(
    collection_name="wiki_all-mpnet-base-v2",
    query_vector=encoder_model.encode("alien invasion").tolist(),
    limit=3,
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

# Use Pinecone serverless instead. It provides $100 of free credit, which should be good enough for this application

### Upload takes ~19 mins with a batch size of 100 for ~500k datapoints

In [7]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
# pool_threads=30 is forwarded to the client (presumably the size of the thread
# pool used for parallel requests — TODO confirm against the Pinecone docs).
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'), pool_threads=30)

In [8]:
# Serverless deployment target for the index.
serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")

# Cosine-similarity index whose dimension is taken from the loaded encoder.
pc.create_index(
    name="wiki-all-mpnet-base-v2-trained",
    dimension=encoder_model.get_sentence_embedding_dimension(),
    metric="cosine",
    spec=serverless_spec,
)

In [9]:
# One upsert entry per record: the record's own id (as a string), the embedding
# at the same position in `vector`, and the whole record as metadata.
upsert_data = [
    {
        "id": str(record['id']),
        "values": vector[position],
        "metadata": record,
    }
    for position, record in enumerate(wiki_data)
]

In [10]:
%%time
import random
import itertools
from pinecone import Pinecone

def chunks(iterable, batch_size=100):
    """Lazily split an iterable into consecutive tuples of at most batch_size items.

    Every yielded tuple has batch_size items except possibly the last, which
    holds the remainder. Nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            # Source exhausted: end the generator.
            return
        yield batch

with pc.Index('wiki-all-mpnet-base-v2-trained', pool_threads=30) as index:
    # Submit every batch of 100 entries as an asynchronous upsert request.
    pending_requests = []
    for batch in chunks(upsert_data, batch_size=100):
        pending_requests.append(index.upsert(vectors=batch, async_req=True))
    # Block until each request has completed; .get() raises if a request failed.
    for request in pending_requests:
        request.get()

CPU times: user 19min 5s, sys: 12.8 s, total: 19min 17s
Wall time: 19min 7s
