In [53]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import time
import streamlit as st


In [54]:
# Pinecone client (the gRPC variant imported above as `Pinecone`); the API key is read
# from Streamlit's secrets store under the 'PC_API_KEY' entry rather than hard-coded.
pc = Pinecone(api_key=st.secrets['PC_API_KEY'])

In [55]:
# Read the data to be uploaded
# A forward-slash path is accepted by Python on Windows as well as Linux/macOS,
# whereas a hard-coded backslash separator only resolves on Windows.
csv_file = "data/akg_data.csv"  # location of the CSV file to upload
def read_doc(txt_file):
    """Load the CSV at ``txt_file`` with LangChain's ``CSVLoader``.

    The file is parsed as comma-delimited with ``"`` as the quote character,
    and its columns are labelled ``Link`` and ``content``.

    NOTE(review): because explicit ``fieldnames`` are supplied, a header row
    (if the file has one) would presumably be loaded as a data row - confirm
    against the CSV.

    Returns:
        Whatever ``CSVLoader.load()`` produces for the file.
    """
    csv_options = {
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': ['Link', 'content'],
    }
    return CSVLoader(file_path=txt_file, csv_args=csv_options).load()
# Load the CSV once; `document` is what gets passed to chunk_data() further down this cell.
document = read_doc(csv_file)

# Chunking the document
def chunk_data(docs, size=1800, overlap=200):
    """Split ``docs`` into smaller pieces with ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split (passed straight to ``split_documents``).
        size: Forwarded as the splitter's ``chunk_size``.
        overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
    )
    return splitter.split_documents(docs)
chunks = chunk_data(document)
print("Number of chunks:", len(chunks))

# Rough word count, assuming an average word length of 4.7 characters.
# NOTE: despite its name, `avg_words` accumulates the estimated TOTAL number of
# words across all chunks; the per-chunk average is derived when printing.
avg_words = 0
for chunk in chunks:
    avg_words += len(chunk.page_content)/4.7
print("Number of words:", int(avg_words))
# An empty CSV produces no chunks; report 0 instead of raising ZeroDivisionError.
print("average words in a chunk:", avg_words/len(chunks) if chunks else 0)


Number of chunks: 1828
Number of words: 550905
average words in a chunk: 301.37064109129756


In [56]:
# # Find the chunks shorter than 100 characters, print them, and count them.

# num = 0
# for i in range(len(chunks)):
#     if len(chunks[i].page_content) < 100:
#         num += 1 
#         print(len(chunks[i].page_content), chunks[i].page_content)
# print(num)

In [57]:
# Create the index if not present
index_name = "akgec-data"
if not pc.has_index(index_name):
    # Serverless index on AWS us-east-1 holding 1024-dimensional vectors
    # compared with the cosine metric.
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        metric="cosine",
        dimension=1024,
    )

# Poll once per second until the index reports that it is ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

# Make instance of index
index = pc.Index(index_name)

In [58]:
"""DEMO FINTIONS"""

# # Function to create embeddings
# ## (end - start) must be smaller than 90. Do not try to embed more than 90 chunks at a time.
# def create_embeddings(start, end):
#     embeddings = pc.inference.embed(
#         model="multilingual-e5-large",
#         inputs=[d.page_content for d in chunks[start:end]],
#         parameters={
#             "input_type": "passage", 
#             "truncate": "END"
#         }
#     )
#     return embeddings

# embeddings = create_embeddings(0, 1)
# print(embeddings)
# print(len(embeddings.data[0]['values']))

# # DEMO function that displays the structure of the data to be upserted to the vector DB.
# def create_doc():
#     doc = []
#     for idx, text in enumerate(chunks):
#         doc.append({"id":str(idx), "text": text.page_content})
#     return doc
# docs = create_doc()
# print(docs[1])

'DEMO FINTIONS'

>## The `create_rec` function does two things:
#### 1. Builds the document entries used to make the records (this step has no limit on input size).
#### 2. Creates the embeddings (this step is limited to 90 embeddings at a time).

In [59]:
# CREATE THE FINAL RECORD TO BE UPLOADED.
## (end - start) must be smaller than 90. Do not try to create more than 90 records at a time due to embedding limits.

def create_rec(start, end):
    """Build the records to upsert for ``chunks[start:end]``.

    Each record holds the chunk's position in the module-level ``chunks`` list
    (as a string) under ``"id"``, the embedding returned by
    ``pc.inference.embed`` under ``"values"``, and the chunk text under
    ``metadata["source_text"]``.

    Args:
        start: Slice start into ``chunks``.
        end: Slice end (exclusive). Keep ``end - start`` below 90, per the
            embedding limit noted in the comment above this function.

    Returns:
        list[dict]: One record per chunk in the slice, or ``[]`` when the
        slice selects nothing.
    """
    # Slicing a range yields exactly the positions that chunks[start:end]
    # selects (negative and out-of-range bounds included), so the ids are
    # unchanged without building an entry for every chunk on every call.
    positions = range(len(chunks))[start:end]
    if not positions:
        # Nothing to embed; skip the API call instead of sending empty inputs.
        return []
    texts = [chunks[pos].page_content for pos in positions]

    # Embed the batch of chunk texts as passages.
    embeddings = pc.inference.embed(
        model="llama-text-embed-v2",
        # model="multilingual-e5-large",
        inputs=texts,
        parameters={
            "input_type": "passage",
            "truncate": "END"
        }
    )

    # Record to be uploaded.
    return [
        {
            "id": str(pos),
            "values": e["values"],
            "metadata": {
                "source_text": text
            }
        }
        for pos, text, e in zip(positions, texts, embeddings)
    ]

In [60]:
# record_num = 345
# numU = 250
# numE = 20
# for j in range(0, record_num, numU):
#     for i in range(j, min(j+numU, record_num), numE):
#         print(i, min(min(i+numE, record_num),  j+numU))
#         time.sleep(0.1)
#     print("UPSERT********")

    

In [61]:
"""THIS WILL ADD THE DATA TO DB WITHOUT WORNING"""
# Upload records in bulk. (Use this if len(chunks) is large.)
# This loop takes a couple of minutes.

# record_num = len(chunks)
# numU = 250
# numE = 20
# for j in range(0, record_num, numU):
#     vectors = []
#     for i in range(j, min(j+numU, record_num), numE):
#         rec = create_rec(i, min(min(i+numE, record_num), j+numU))
#         time.sleep(1)
#         vectors += rec
#     index.upsert(
#     vectors=vectors,
#     namespace="doc1"
#     )

'THIS WILL ADD THE DATA TO DB WITHOUT WORNING'

In [62]:
# Embed the search query with the same model name that create_rec uses for the passages.
query = "who is anu chaudhary"
query_embedding = pc.inference.embed(
    inputs=[query],
    model="llama-text-embed-v2",
    # model="multilingual-e5-large",
    parameters=dict(input_type="query"),
)

# Query namespace "doc1" for the top 4 matches; ask for metadata only, not the vectors.
results = index.query(
    vector=query_embedding[0].values,
    namespace="doc1",
    top_k=4,
    include_metadata=True,
    include_values=False,
)

In [63]:
# Source text stored in the metadata of the first returned match.
results["matches"][0]['metadata']['source_text']

'more than 20 research papers in both international and national journals and conferences. He holds 6 patents at the national level. His academic engagement includes guiding Ph.D. scholars, as well as supervising numerous M.Tech, MCA, and B.Tech projects. Dr. Chaudhary current area of research is in the field of High Speed Data Networks, Machine Learning, Data Science and related domains. He\'s an active participant in the academic community, serving on editorial and reviewer boards of various national and international journals. He has also taken on leadership roles by chairing multiple national and international conferences. Dr. Chaudhary is a member of several technical societies, including the Institution of Electronics and Telecommunication Engineers (IETE) and the Computer Society of India (CSI). Prof (Dr.) Anu Chaudhary Ph.D (Computer Science), M.Tech (Computer Science), MCA 8527976204 (M) hodcse@akgec.ac.in, chaudharyanu@akgec.ac.in, getanuchaudhary@yahoo.com, dr.anuchaudhary@g

In [64]:
# Source text stored in the metadata of the second returned match.
results["matches"][1]['metadata']['source_text']

'community and with society at large, such as, being able to comprehend and write effective reports and design documentation, make effective presentations, and give and receive clear instructions. PO 11. Project management and finance :- Demonstrate knowledge and understanding of the engineering and management principles and apply these to one’s own work, as a member and leader in a team, to manage projects and in multidisciplinary environments. PO 12. Life-long learning :- Recognize the need for, and have the preparation and ability to engage in independent and life-long learning in the broadest context of technological changes in the field of Computer Science. Program Specific Outcomes (PSOs) w.e.f. Session 2016-17 PSO 1. Ability to exhibit analytical & logical skills and apply knowledge of Maths and Computer Science to design, develop, test and maintenance of software solutions. PSO 2. Ability to identify, formulate and resolve real life/social problems by using current computer tec

In [65]:
# Source text stored in the metadata of the third returned match.
results["matches"][2]['metadata']['source_text']

'content: CSE Faculty » CSE Faculty Home Faculty Labs Achievements Society Departmental Activities Profile Link Dr. Anu Chaudhary Professor & HOD Ph.D Profile Link Dr. Shashank Sahu Professor Ph.D Profile Link Dr. Rajesh Prasad Professor Ph.D Profile Link Dr. Avdhesh Gupta Professor Ph.D Profile Link Dr. Ayushi Prakash Professor Ph.D Profile Link Dr. Sonam Gupta Professor Ph.D Profile Link Dr. Inderjeet Kaur Professor Ph.D Profile Link Dr. Akhilesh Verma Professor Ph.D Profile Link Dr. Santosh Kumar Associate Professor Ph.D Profile Link Dr. Anuradha Associate Professor Ph.D Profile Link Dr. Jaishree Jain Associate Professor Ph.D Profile Link Dr. Nishant Kumar Pathak Associate Professor Ph.D Profile Link Dr. Pawan Associate Professor Ph.D Profile Link Dr. Ashish Dixit Associate Professor Ph.D Profile Link Dr. Shiva Tyagi Associate Professor M.Tech, Ph.D Profile Link Mr. B.N Pandey Assistant Professor M.Tech, Ph.D* Profile Link Mr. Pradeep Gupta Assistant Professor M.Tech, Ph.D* Profile 

In [66]:
# Source text stored in the metadata of the fourth returned match.
results["matches"][3]['metadata']['source_text']

'Participated Anushree Maurya 24 War of Bands Ist JSSATE 30-Mar-17 Palak Srivastava 25 Ist Bhumika Lohani 26 Duet singing Participated KNMIT, Modinagar 7-8th November, 2016 Bhumika Lohani 27 Group Singing Participated KNMIT, Modinagar 7-8th November, 2016 Palak Srivastava 28 Play/Skit Participated KNMIT, Modinagar 7-8th November, 2016 Harsh Shukla 29 Participated Sumit Gupta 30 Participated Ekansh Pandey 31 Participated Mohd. Zaid 32 Participated Shubham Verma 33 Fashion show Participated KNMIT, Modinagar 7-8th November, 2016 Apoorva Singh 34 Rendezvous Participated IIT Delhi 22-Oct-16 Palak Srivastava 35 Participated Bhumika Lohani 36 37 Genero’16 Participated ABES Engineering College 19th Oct 2016 Palak Srivastava Participated Bhumika Lohani 38 Volleyball Participated Maharaja Agrasen 18-19th Oct 2016 Rahul 39 Coding Participated KIET, Ghaziabad 16-18th Sept. 2016 Akshay Sachan 40 Business Quiz Participated KIET, Ghaziabad 16-18th Sept. 2016 Rohan Singh 41 Robo Race Ist KIET, Ghaziab