In [1]:
# !pip install -qU \
#   "pinecone-client[grpc]"==2.2.1 \
#   datasets==2.12.0 \
#   sentence-transformers==2.2.2

In [28]:
from datasets import load_dataset

# Hugging Face Hub id of the dataset to index (ships a ready-made `emb` column,
# so no embedding model has to be run in this notebook).
DATASET_NAME = "Cohere/wikipedia-22-12-hi-embeddings"
# Number of rows taken from the start of the train split; kept small for a quick demo.
N_DOCS = 1024

docs = load_dataset(DATASET_NAME, split=f"train[0:{N_DOCS}]")

Found cached dataset parquet (/Users/nirantk/.cache/huggingface/datasets/Cohere___parquet/Cohere--wikipedia-22-12-hi-embeddings-9492772e96dab9c4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [29]:
# Show the Dataset summary (column names and row count) to confirm what was loaded.
docs

Dataset({
    features: ['id', 'title', 'text', 'url', 'wiki_id', 'views', 'paragraph_id', 'langs', 'emb'],
    num_rows: 1024
})

In [53]:
# Peek at the `text` field of the first row as a quick content sanity check.
docs[0]["text"]

'भारत का संविधान, भारत का सर्वोच्च विधान है जो संविधान सभा द्वारा 26 नवम्बर 1949 को पारित हुआ तथा 26 जनवरी 1950 से प्रभावी हुआ। यह दिन (26 नवम्बर) भारत के संविधान दिवस के रूप में घोषित किया गया है |जबकि 26 जनवरी का दिन भारत में गणतन्त्र दिवस के रूप में मनाया जाता है।'

In [30]:
# Embedding dimensionality, measured on the first row's vector.
# Index the row first and the column second: `docs["emb"]` would decode the
# entire embedding column just to take the length of a single vector.
emb_sz = len(docs[0]["emb"])

In [31]:
# Materialise the dataset as a plain Python list via Dataset.to_list().
# NOTE(review): presumably one dict per row keyed by column name — confirm
# against the `datasets` version in use.
docs_list = docs.to_list()

In [34]:
# List the field names carried by a single record, one per line.
first_record = docs_list[0]
for field_name in first_record.keys():
    print(field_name)

id
title
text
url
wiki_id
views
paragraph_id
langs
emb


In [9]:
# Load rows 240000-319999 (80,000 rows) of the "quora" dataset's train split
# and display its summary.
# NOTE(review): `dataset` is not referenced by any other cell visible in this
# notebook — confirm whether this cell is a leftover and can be dropped.
dataset = load_dataset("quora", split="train[240000:320000]")
dataset

Found cached dataset quora (/Users/nirantk/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04)


Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 80000
})

In [10]:
import os
import pinecone

# get api key from app.pinecone.io
# The key is read from the environment only: a literal key in a notebook ends
# up in version control and in every shared copy of the file.
# NOTE(review): the key that was previously hard-coded here should be treated
# as compromised and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before "
        "running this notebook (the key is shown at app.pinecone.io)."
    )
# find your environment next to the api key in pinecone console
# (the environment name is not a secret, so a default is fine here)
PINECONE_ENV = os.environ.get("PINECONE_ENVIRONMENT") or "us-east1-gcp"

# Configure the module-level Pinecone client used by the cells below.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

In [13]:
index_name = "hindi-search"

# Create the index on first run only; on later runs the existing one is reused.
existing_indexes = pinecone.list_indexes()
index_missing = index_name not in existing_indexes
if index_missing:
    pinecone.create_index(
        name=index_name,
        dimension=emb_sz,
        metric="cosine",
    )

# Open a gRPC handle to the index for the upsert and stats calls below.
index = pinecone.GRPCIndex(index_name)

In [15]:
# Inspect the index after connecting: dimension, fullness, and per-namespace vector counts.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [16]:
# Sanity check: show the name of the index this handle refers to.
index.name

'hindi-search'

In [48]:
from tqdm.auto import tqdm
import numpy as np

# Number of vectors sent to Pinecone per upsert request.
batch_size = 128

# Upsert the precomputed embeddings batch by batch. Bounds and slices both come
# from `docs`, so the loop does not depend on a second copy of the data.
for i in tqdm(range(0, len(docs), batch_size)):
    # find end of batch (the last batch may be shorter)
    i_end = min(i + batch_size, len(docs))
    # a slice of the dataset is indexed by column name below
    chunk = docs[i:i_end]
    # Pinecone ids are strings
    ids = [str(x) for x in chunk["id"]]
    # keep the raw passage text as metadata so query results are readable
    metadatas = [{"text": text} for text in chunk["text"]]
    # (id, values, metadata) tuples; the embeddings are passed through as-is,
    # with no numpy round-trip, and the batch is materialised as a list rather
    # than handed over as a one-shot iterator
    records = list(zip(ids, chunk["emb"], metadatas))
    # upsert to Pinecone
    index.upsert(vectors=records)

# check number of records in the index
index.describe_index_stats()

  0%|          | 0/8 [00:00<?, ?it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1024}},
 'total_vector_count': 1024}

In [49]:
# Read back the dimension reported by the index, to compare against emb_sz.
index.describe_index_stats()["dimension"]

768