In [4]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import time
import os
import streamlit as st


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Pinecone gRPC client, authenticated with the key read from Streamlit secrets.
pc = Pinecone(
    api_key=st.secrets["PC_API_KEY"],
)

In [6]:
# Location of the CSV file whose contents will be uploaded.
csv_file = "data/clean_akgec_data.csv"


def read_doc(txt_file):
    """Load a CSV file into documents via ``CSVLoader``.

    Args:
        txt_file: Path of the CSV file to read.

    Returns:
        Whatever ``CSVLoader.load()`` returns for that file.
    """
    # Comma-delimited, double-quote quoting, columns named Link and content.
    csv_options = {
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': ['Link', 'content'],
    }
    return CSVLoader(file_path=txt_file, csv_args=csv_options).load()
# Load the CSV into documents.
document = read_doc(txt_file=csv_file)

# Chunking the documents
def chunk_data(docs, size=1100, overlap=100):
    """Split ``docs`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(docs)``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
    ).split_documents(docs)
# Chunk the loaded documents and report how many chunks came out.
chunks = chunk_data(docs=document)
print(len(chunks))
# print(chunks)  # uncomment to inspect the chunks themselves

974


In [7]:
# Make sure the serverless index exists before it is used.
index_name = "akgec-data"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Block until the index reports ready, checking once per second.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

# Handle used for all later upserts and queries.
index = pc.Index(index_name)

In [8]:
# DEMO OF THE EMBEDDING FUNCTION
# Function to create embeddings.
## (end - start) must be smaller than 90. Do not try to embed more than 90 chunks at a time.
def create_embeddings(start, end):
    """Embed the text of ``chunks[start:end]`` with ``multilingual-e5-large``.

    Args:
        start: Slice start into the module-level ``chunks`` list.
        end: Slice end into ``chunks`` (exclusive).

    Returns:
        The response of ``pc.inference.embed`` for the selected chunk texts,
        requested with input type "passage" and truncation "END".
    """
    passages = [chunk.page_content for chunk in chunks[start:end]]
    options = {"input_type": "passage", "truncate": "END"}
    return pc.inference.embed(
        model="multilingual-e5-large",
        inputs=passages,
        parameters=options,
    )

# Embed only the first chunk, then show the response and the vector length.
embeddings = create_embeddings(start=0, end=1)
print(embeddings)
print(len(embeddings.data[0]['values']))

EmbeddingsList(
  model='multilingual-e5-large',
  vector_type='dense',
  data=[
    {'vector_type': dense, 'values': [0.042144775390625, 0.00269317626953125, ..., -0.0239105224609375, -4.750490188598633e-05]}
  ],
  usage={'total_tokens': 10}
)
1024


In [9]:
# DEMO: function that builds the docs used to create the vectorstore.
def create_doc():
    """Return one ``{"id", "text"}`` dict per entry of the module-level ``chunks``.

    ``id`` is the chunk's position in ``chunks`` as a string and ``text`` is
    its ``page_content``.
    """
    return [
        {"id": str(position), "text": chunk.page_content}
        for position, chunk in enumerate(chunks)
    ]

# Build the id/text dicts for every chunk and show the second one as a sample.
docs = create_doc()
print(docs[1])

{'id': '1', 'text': 'Link: https://www.akgec.ac.in/ieee-student-branch-chapter'}


## The create function uses two functions:
#### 1. Creating the doc list used to build the records, which has no limit on input size.
#### 2. The embeddings function, which is limited to 90 embeddings at a time.

In [90]:
# CREATE THE FINAL RECORDS TO BE UPLOADED.
## (end - start) must be smaller than 90. Do not try to create more than 90 records at a time due to embedding limits.

def create_rec(start, end):
    """Build upsert-ready records for ``chunks[start:end]``.

    Each record uses the chunk's position in the module-level ``chunks`` list
    (as a string) for its id, the embedding returned by
    ``multilingual-e5-large`` for its values, and the chunk text under
    ``metadata["source_text"]``.

    Args:
        start: Slice start into ``chunks``.
        end: Slice end into ``chunks`` (exclusive).

    Returns:
        A list of ``{"id", "values", "metadata"}`` dicts, one per selected
        chunk. The list is empty when the slice selects no chunks, and no
        embedding request is made in that case.
    """
    # Slicing a range follows the same rules as slicing the list (negative and
    # out-of-bounds indices included), so ids stay aligned with the chunks
    # without building an entry for every chunk on each call.
    positions = range(len(chunks))[start:end]
    if len(positions) == 0:
        # Nothing selected: skip the embedding request entirely.
        return []

    texts = [chunks[pos].page_content for pos in positions]

    # Embed only the selected chunk texts.
    embeddings = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=texts,
        parameters={
            "input_type": "passage",
            "truncate": "END"
        }
    )

    # Pair each chunk with its embedding, in order.
    return [
        {
            "id": str(pos),
            "values": emb["values"],
            "metadata": {
                "source_text": text
            }
        }
        for pos, text, emb in zip(positions, texts, embeddings)
    ]

In [None]:
## Upload the content one by one 

# start = 0
# end = 1
# records = create_rec(start, end)
# print(records[0])
# index.upsert(
#     vectors=records,
#     namespace="doc1")

{'id': '0', 'values': [0.02099609375, -0.02545166015625, 0.021484375, -0.045257568359375, 0.05078125, -0.06427001953125, -0.0111846923828125, 0.07403564453125, 0.0361328125, -0.0242462158203125, 0.03179931640625, 0.00411224365234375, -0.046356201171875, -0.0142364501953125, -0.023284912109375, -0.002227783203125, -0.005405426025390625, 0.020782470703125, 0.004650115966796875, -0.024871826171875, 0.018890380859375, 0.004421234130859375, -0.0361328125, -0.034881591796875, 0.0176544189453125, 0.0004203319549560547, -0.043182373046875, -0.04437255859375, -0.01407623291015625, -0.045501708984375, 0.01212310791015625, -0.0035648345947265625, -0.0236663818359375, -0.040283203125, -0.02154541015625, 0.00907135009765625, 0.053619384765625, 0.034637451171875, -0.036834716796875, 0.03631591796875, -0.0041961669921875, 0.062286376953125, 0.0090179443359375, -0.04754638671875, -0.027984619140625, 0.03009033203125, 0.0219268798828125, -0.0095062255859375, 0.001178741455078125, 0.042755126953125, 0.0

upserted_count: 1

In [None]:
# # Creating function to upload data automatically.
# # INCOMPLETE.

# l = (int(int(input("num"))/100))*100
# lent = max(int(l/5), 50)
# print(lent)
# for i in range(0, l, lent):
#     for j in range(i, i+lent, 50):
#         print(j, j+50)
#     print("------------------") 
    

50
0 50
------------------
50 100
------------------
100 150
------------------
150 200
------------------


In [None]:
# INCOMPLETE
# Upload records.
# Each upsert carries up to 250 records, built from embedding requests of at
# most 50 chunks each. Both ranges are clamped to the number of chunks so the
# last group never asks create_rec for a slice past the end of `chunks`
# (which would send an empty input list to the embedding call).

record_num = len(chunks)
for j in range(0, record_num, 250):
    group_end = min(j + 250, record_num)
    vectors = []
    for i in range(j, group_end, 50):
        vectors += create_rec(i, min(i + 50, group_end))
    index.upsert(
        vectors=vectors,
        namespace="doc1"
    )

In [10]:
# Question to search for, embedded with input type "query".
query = "where is akgec"

query_embedding = pc.inference.embed(
    inputs=[query],
    model="multilingual-e5-large",
    parameters={"input_type": "query"},
)

In [11]:
# Ask the "doc1" namespace for the three closest matches, returning their
# metadata but not their vectors.
results = index.query(
    vector=query_embedding[0].values,
    namespace="doc1",
    top_k=3,
    include_metadata=True,
    include_values=False,
)
print(results)

{'matches': [{'id': '130',
              'metadata': {'source_text': 'Keynote address on Mobile Computing '
                                          'in Feb 2014 at Annual Convention of '
                                          'JIMS. He is Life Fellow of the IETE '
                                          'and attended international '
                                          'conferences held in France, '
                                          'Singapore, USA, Hong Kong and '
                                          'Nepal. Daily practices advanced '
                                          'Art-of-Living meditation. Also '
                                          'associated with Amway and Insurance '
                                          'sector. Contact Details : +91 '
                                          '9868041558 Email Id : '
                                          'editor_journal@akgec.ac.in, '
                                          'akgec.ece@gmail.co

In [12]:
# Source text stored with the second-ranked match.
results["matches"][1]["metadata"]["source_text"]

'College (AKGEC) 27th Km Stone, Delhi-Meerut Expressway, Ghaziabad, Uttar Pradesh  201015 Email: srivastavass@akgec.ac.in Mobile: 9818590621'