In [None]:
!pip install -qU datasets transformers sentence-transformers git+https://git@github.com/pinecone-io/pinecone-python-client.git#egg=pinecone-client[grpc] git+https://github.com/naver/splade.git

In [None]:
import pandas as pd
# Load the Constitution of India articles from the Kaggle input dataset.
data = pd.read_csv(
    '/kaggle/input/constitution-of-india/Constitution Of India.csv'
)
# Leave the frame as the last expression so the notebook renders it.
data

In [None]:
# Prefix every entry of the 'Articles' column with the word "Article ".
laws = ['Article ' + article for article in data['Articles']]

In [None]:
# Turn each article into a record: the id is the article's position in
# `laws` rendered as a string, and the text is stored under 'context'.
data = [
    {'id': f"{i}", 'context': context}
    for i, context in enumerate(laws)
]

# Preview the first ten records.
data[:10]

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU and
# warn the reader that the remaining cells may be slow.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print(
        "==========\n"
        "WARNING: You are not running on GPU so this may be slow.\n"
        "If on Google Colab, go to top menu > Runtime > Change "
        "runtime type > Hardware accelerator > 'GPU' and rerun "
        "the notebook.\n=========="
    )

# Dense sentence encoder, loaded directly onto the selected device.
dense_model = SentenceTransformer('msmarco-bert-base-dot-v5', device=device)
dense_model

In [None]:
from splade.models.transformer_rep import Splade

# Checkpoint id shared by the SPLADE model below and the tokenizer that is
# loaded from the same id later in the notebook.
sparse_model_id = 'naver/splade-cocondenser-ensembledistil'

sparse_model = Splade(
    sparse_model_id,
    agg='max',
)
# Place the model on the device chosen earlier (GPU when available).
sparse_model.to(device)
# Call eval() last so its return value is what the cell displays.
sparse_model.eval()

In [None]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint id as the SPLADE model.
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

# Tokenize the first article as a worked example. `truncation=True` matches
# the batch path in `builder`, so an article longer than the model's maximum
# sequence length is cut instead of failing inside the model.
tokens = tokenizer(
    data[0]['context'],
    return_tensors='pt',
    truncation=True,
)

In [None]:
# Inference only: run the tokenized article through the sparse model without
# tracking gradients and keep its 'd_rep' output.
with torch.no_grad():
    sparse_emb = sparse_model(d_kwargs=tokens.to(device))['d_rep']
# Drop the singleton dimensions so the later cells can index it directly.
sparse_emb = sparse_emb.squeeze()
sparse_emb.shape

In [None]:
# Positions of the non-zero weights in the sparse embedding.
# `nonzero(as_tuple=True)[0]` always yields a 1-D index tensor, so `indices`
# and `values` stay lists even when exactly one weight (or none) is non-zero;
# the previous `nonzero().squeeze()` collapsed that case to a bare scalar.
# Assumes `sparse_emb` is 1-D, as produced by the squeeze in the cell above.
_nonzero_idx = sparse_emb.nonzero(as_tuple=True)[0]
indices = _nonzero_idx.cpu().tolist()
values = sparse_emb[_nonzero_idx].cpu().tolist()
# Sparse vector as a dict of parallel 'indices' / 'values' lists.
sparse = {'indices': indices, 'values': values}
sparse

In [None]:
# Invert the tokenizer vocabulary (token -> id) into an id -> token lookup.
idx2token = dict(
    (idx, token) for token, idx in tokenizer.get_vocab().items()
)

# Map each non-zero position to its token text, with the weight rounded to
# two decimals for readability.
sparse_dict_tokens = {}
for idx, weight in zip(indices, values):
    sparse_dict_tokens[idx2token[idx]] = round(weight, 2)

# Re-order by weight, highest first, so the most relevant tokens lead.
sparse_dict_tokens = dict(
    sorted(
        sparse_dict_tokens.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
sparse_dict_tokens

In [None]:
import pinecone


def builder(records: list):
    """Convert a batch of records into hybrid (dense + sparse) upsert dicts.

    Args:
        records: list of dicts, each holding an 'id' and a 'context' (the
            text to embed).

    Returns:
        A list with one dict per record, containing:
          * 'id': the record id,
          * 'values': the dense embedding as a list of floats,
          * 'sparse_values': {'indices': [...], 'values': [...]} with the
            non-zero positions of the sparse embedding and their weights,
          * 'metadata': {'context': <original text>}.
        An empty `records` list returns an empty list.
    """
    # Nothing to encode; avoid calling the models on an empty batch.
    if not records:
        return []

    ids = [x['id'] for x in records]
    contexts = [x['context'] for x in records]
    # create dense vecs
    dense_vecs = dense_model.encode(contexts).tolist()
    # create sparse vecs
    input_ids = tokenizer(
        contexts, return_tensors='pt',
        padding=True, truncation=True
    )
    with torch.no_grad():
        sparse_vecs = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep']
    # Keep exactly one row per record. A blanket `.squeeze()` would drop the
    # batch axis for a single-record batch (e.g. the last, partial batch) and
    # the zip below would then walk over vocabulary entries instead of
    # records. Moving to CPU once also avoids a transfer per record.
    sparse_vecs = sparse_vecs.reshape(len(contexts), -1).cpu()

    # convert to upsert format
    upserts = []
    for _id, dense_vec, sparse_vec, context in zip(ids, dense_vecs, sparse_vecs, contexts):
        # Positions with non-zero weight. `as_tuple=True` always gives a 1-D
        # index tensor, so a row with a single non-zero weight still produces
        # lists rather than bare scalars.
        nonzero_idx = sparse_vec.nonzero(as_tuple=True)[0]
        # build sparse values dictionary
        sparse_values = {
            "indices": nonzero_idx.tolist(),            # positions
            "values": sparse_vec[nonzero_idx].tolist()  # weights/scores
        }
        # keep the source text alongside the vectors
        metadata = {'context': context}
        upserts.append({
            'id': _id,
            'values': dense_vec,
            'sparse_values': sparse_values,
            'metadata': metadata
        })
    return upserts

In [None]:
import pinecone

import os
from getpass import getpass

# Credentials must never be stored in the notebook. The API key is read from
# the PINECONE_API_KEY environment variable, falling back to an interactive
# prompt. Any key that was ever committed to a shared notebook should be
# revoked in the Pinecone console (app.pinecone.io).
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
    # shown next to the API key in the console; overridable via env var
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east1-gcp")
)

In [None]:
index_name = 'law-gpt'

# Create the index only when it does not exist yet, so this cell can be
# re-run (Restart & Run All) without failing on an already-existing index.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        # NOTE(review): presumably the output size of the dense model loaded
        # earlier in the notebook - confirm if that model is changed.
        dimension=768,
        metric="dotproduct",
        pod_type="s1"
    )

In [None]:
# Open a gRPC handle to the index named by `index_name`.
index = pinecone.GRPCIndex(index_name)
# Show the index statistics as the cell output.
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm

# Number of records embedded and upserted per request.
batch_size = 64

for i in tqdm(range(0, len(data), batch_size)):
    # Slicing clamps at the end of the list, so the final batch is simply
    # shorter; no explicit end index is needed.
    batch = data[i:i + batch_size]
    # Build the hybrid (dense + sparse) vectors for the batch and upsert.
    index.upsert(builder(batch))

In [None]:
# Show the local record count next to the statistics reported by the index.
len(data), index.describe_index_stats()

In [None]:
def encode(text: str):
    """Encode one query string into its dense and sparse representations.

    Args:
        text: the query text.

    Returns:
        A `(dense_vec, sparse_dict)` tuple. `dense_vec` is the dense
        embedding as a list of floats. `sparse_dict` is
        {'indices': [...], 'values': [...]} holding the non-zero positions of
        the sparse embedding and their weights; both entries are always
        lists, including when one or zero weights are non-zero.
    """
    # create dense vec
    dense_vec = dense_model.encode(text).tolist()
    # create sparse vec; truncate like the indexing path in `builder` so an
    # over-long query cannot exceed the model's maximum sequence length
    input_ids = tokenizer(text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        sparse_vec = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep']
    # Single input: flatten the output to one 1-D vector.
    sparse_vec = sparse_vec.reshape(-1).cpu()
    # `as_tuple=True` yields a 1-D index tensor; the former
    # `nonzero().squeeze()` collapsed to a scalar for a single non-zero weight.
    nonzero_idx = sparse_vec.nonzero(as_tuple=True)[0]
    # convert to dictionary format
    sparse_dict = {
        "indices": nonzero_idx.tolist(),
        "values": sparse_vec[nonzero_idx].tolist()
    }
    # return vecs
    return dense_vec, sparse_dict

In [None]:
# Natural-language question to search the indexed articles with.
query = "Who is the Chief Justice of High Court when the seat is vacant"
# Hybrid search needs both the dense and the sparse form of the question.
dense, sparse = encode(query)
xc = index.query(
    top_k=2,  # number of matches to return
    include_metadata=True,  # return the metadata stored with each match
    vector=dense,
    sparse_vector=sparse,
)
xc