In [5]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import time
import streamlit as st
import os
from dotenv import load_dotenv
load_dotenv()


True

In [6]:

# Pinecone client used by every later cell; the API key is read from the
# PC_API_KEY environment variable (None is passed through if it is unset).
pc = Pinecone(api_key=os.environ.get('PC_API_KEY'))

In [7]:
# Source CSV to ingest; change this path to upload a different file.
csv_file = "akgec.csv"


def read_doc(txt_file):
    """Load a CSV file into documents via ``CSVLoader``.

    Args:
        txt_file: Path of the CSV file to read.

    Returns:
        Whatever ``CSVLoader.load()`` returns for that file.
    """
    # Column names are supplied explicitly instead of being taken from the file.
    # NOTE(review): with explicit fieldnames the first CSV row is presumably
    # parsed as data rather than as a header -- confirm the file has no header row.
    csv_options = {
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': ['Link', 'content'],
    }
    return CSVLoader(file_path=txt_file, csv_args=csv_options).load()


document = read_doc(csv_file)

# Chunking the document
def chunk_data(docs, size=1800, overlap=200):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split (here, the output of ``read_doc``).
        size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    return splitter.split_documents(docs)


chunks = chunk_data(document)
print("Number of chunks:", len(chunks))

# Word counts are estimates, not tokenised counts: each chunk's character
# length is divided by 4.7, taken here as the average length of a word.
# Despite its name, avg_words holds the estimated TOTAL across all chunks.
avg_words = sum(len(chunk.page_content) / 4.7 for chunk in chunks)
print("Number of words:", int(avg_words))
# Guard the mean so an empty chunk list reports 0.0 instead of raising
# ZeroDivisionError.
print("average words in a chunk:", avg_words / len(chunks) if len(chunks) else 0.0)


Number of chunks: 719
Number of words: 226674
average words in a chunk: 315.26381203207694


### Stats for the namespace `doc1`
Number of chunks: 1828

Number of words: 550905

average words in a chunk: 301.37064109129756

In [56]:
# # search the chunks with their size.

# num = 0
# for i in range(len(chunks)):
#     if len(chunks[i].page_content) < 100:
#         num += 1 
#         print(len(chunks[i].page_content), chunks[i].page_content)
# print(num)

In [8]:
# Name of the Pinecone index that stores the chunk embeddings.
index_name = "akgec-data"

# Create the serverless index only when it does not exist yet, so re-running
# this cell reuses the existing index.
if not pc.has_index(index_name):
    # NOTE(review): dimension=1024 presumably has to match the size of the
    # vectors produced by the embedding model used later -- confirm.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=serverless_spec,
    )

# Poll once per second until the index reports itself ready.
while True:
    if pc.describe_index(index_name).status['ready']:
        break
    time.sleep(1)

# Handle used for the upsert and query calls.
index = pc.Index(index_name)

In [58]:
"""DEMO FINTIONS"""

# # Function to create embeddings
# ## (end - start) must be smaller than 90. Do not try to embed more than 90 inputs at a time. 
# def create_embeddings(start, end):
#     embeddings = pc.inference.embed(
#         model="multilingual-e5-large",
#         inputs=[d.page_content for d in chunks[start:end]],
#         parameters={
#             "input_type": "passage", 
#             "truncate": "END"
#         }
#     )
#     return embeddings

# embeddings = create_embeddings(0, 1)
# print(embeddings)
# print(len(embeddings.data[0]['values']))

# # DEMO function that displays the structure of the data to be upserted to the vector DB.
# def create_doc():
#     doc = []
#     for idx, text in enumerate(chunks):
#         doc.append({"id":str(idx), "text": text.page_content})
#     return doc
# docs = create_doc()
# print(docs[1])

'DEMO FINTIONS'

>## The create_rec function combines two steps:
#### 1. Building the document list used to make the records, which has no limit on input size.
#### 2. The embedding call, which is limited to 90 embeddings at a time.

In [9]:
# CREATE THE FINAL RECORD TO BE UPLOADED.
## (end - start) must be smaller than 90. Do not try to create more than 90 records at a time due to embedding limits. 

def create_rec(start, end):
    """Build Pinecone upsert records for ``chunks[start:end]``.

    Each record's ``id`` is the chunk's position in the global ``chunks`` list
    (as a string), ``values`` is that chunk's embedding, and the chunk text is
    stored under the ``source_text`` metadata key.

    Args:
        start: Index of the first chunk to include.
        end: Index one past the last chunk to include. This notebook keeps
            ``end - start`` below 90 because of per-request embedding limits.

    Returns:
        A list of dicts with ``id``, ``values`` and ``metadata`` keys, one per
        selected chunk; an empty list when the slice selects no chunks.
    """
    # Resolve the slice to absolute positions once. Slicing a range follows the
    # same rules as slicing the list, so ids stay equal to each chunk's index in
    # ``chunks`` without building an id/text entry for every chunk on each call.
    positions = range(len(chunks))[start:end]
    if not positions:
        # Nothing selected: skip the embedding request entirely.
        return []
    texts = [chunks[pos].page_content for pos in positions]

    # One embedding request for the whole batch.
    embeddings = pc.inference.embed(
        model="llama-text-embed-v2",
        # model="multilingual-e5-large",
        inputs=texts,
        parameters={
            "input_type": "passage",
            # NOTE(review): presumably truncates over-long inputs at the end
            # instead of rejecting them -- confirm against the API docs.
            "truncate": "END"
        }
    )

    # Pair each chunk with its embedding, in request order.
    return [
        {
            "id": str(pos),
            "values": emb["values"],
            "metadata": {
                "source_text": text
            }
        }
        for pos, text, emb in zip(positions, texts, embeddings)
    ]

In [60]:
# record_num = 345
# numU = 250
# numE = 20
# for j in range(0, record_num, numU):
#     for i in range(j, min(j+numU, record_num), numE):
#         print(i, min(min(i+numE, record_num),  j+numU))
#         time.sleep(0.1)
#     print("UPSERT********")

    

In [10]:
"""THIS WILL ADD THE DATA TO DB WITHOUT WORNING"""
# Upload records in bulk. (Use this if len(chunks) is large.)
# This cell takes a couple of minutes.

record_num = len(chunks)
numU = 250  # records sent per upsert request
numE = 20   # chunks embedded per create_rec call

for j in range(0, record_num, numU):
    # End of this upsert batch, clipped to the total number of chunks.
    upsert_end = min(j + numU, record_num)
    vectors = []
    for i in range(j, upsert_end, numE):
        # Embed at most numE chunks, never crossing the upsert batch boundary.
        rec = create_rec(i, min(i + numE, upsert_end))
        time.sleep(1)  # pause between embedding requests
        vectors.extend(rec)
    index.upsert(vectors=vectors, namespace="fit-markdown-data")

In [17]:
# Embed the query text with the same model used for the stored passages,
# but with input_type "query" instead of "passage".
query = "who is satvat"
query_embedding = pc.inference.embed(
    model="llama-text-embed-v2",
    # model="multilingual-e5-large",
    inputs=[query],
    parameters={"input_type": "query"},
)

# Fetch the 4 nearest records from the namespace the upload cell wrote to;
# metadata (the chunk text) is returned, the raw vectors are not.
query_vector = query_embedding[0].values
results = index.query(
    vector=query_vector,
    namespace="fit-markdown-data",
    top_k=4,
    include_metadata=True,
    include_values=False,
)

In [18]:
# Chunk text stored as metadata for the first returned match.
results["matches"][0]["metadata"]["source_text"]

'with assignments and resources related to the topics covered on a daily basis.\\n**Topics covered in the workshop:** Introduction to Augmented RealityIntroduction to UnityImage targeting in AR using Vuforia and unityBasics about general physics in unityHands on experience on snapchat AR lens studioBasics about meshes,textures and other assets of 3D modelsBasics about game development through unityCreated Snapchat filter using lab studioCreated 2 apps using unity\\n**Instructor:** Saksham Saini,CTO and Co-founder Vorphy.\\n**Accomplishments**\\nSDC received an overwhelming 186 registrations* from the 3rd year students among which 70 lucky students were selected to attend the workshop. SDC-SI successfully conducted the workshop maintaining the audience’s attention and spirit of the event throughout the entire duration of the workshop.\\nThe workshop introduced the participants to upcoming technologies like AR/VR. The participants gained practical knowledge about basic concepts,hands-on 

In [14]:
# Chunk text stored as metadata for the second returned match.
results["matches"][1]["metadata"]["source_text"]

'of Electronics and Telecommunication Engineers (IETE) and the Computer Society of India (CSI).\\nProf (Dr.) Anu Chaudhary\\nPh.D (Computer Science),M.Tech (Computer Science),MCA 8527976204 (M) hodcse@akgec.ac.in,chaudharyanu@akgec.ac.in,getanuchaudhary@yahoo.com,dr.anuchaudhary@gmail.com\\n### CSE Faculty\\nFaculty at AKGEC fulfil multiple roles as educators,researchers and professionals having a broad range of backgrounds,scholarly interests and areas of expertise thereby bringing substantial achievement in professional practice and research into the classroom.\\n[gs_team theme=""gs_tm_theme1"" group=""CSE Faculty"" cols=""2""]\\n### Labs\\nPython LanguageProgramming Lab\\nSoftware Engineering Lab\\nC programming Lab\\nOperating System Lab\\nWeb TechnologyLab\\nComputer Network Lab\\nM. Tech. Lab\\nProject Lab\\n### \\n1. The department has adequate number of laboratories as per requirement (strength of students in the department and number of labs / practical courses mentioned in sy

In [15]:
# Chunk text stored as metadata for the third returned match.
results["matches"][2]["metadata"]["source_text"]

'| ARUN KUMAR MAURYA | 1768 | 88.40 | I | EN / II  \\n56 | 1602721017 | ANDLEEB KHAN | 1759 | 87.95 | II | EN / II  \\n57 | 1602721012 | AKASH YADAV | 1750 | 87.50 | III | EN / II  \\n58 | 1602713101 | SHREYA TYAGI | 1786 | 89.30 | I | IT / II  \\n59 | 1602713003 | AARUSHI GARG | 1729 | 86.45 | II | IT / II  \\n60 | 1602713100 | SHREYA SINGH | 1728 | 86.40 | III | IT / II  \\n61 | 1602740072 | KAMALDEEP KAUR | 1751 | 87.55 | I | ME / II  \\n62 | 1602740117 | RAHUL KUMAR MAURYA | 1727 | 86.35 | II | ME / II  \\n63 | 1602740169 | UTKARSH DWIVEDI | 1727 | 86.35 | II | ME / II  \\n64 | 1602740059 | DIVYANSHU PALIWAL | 1710 | 85.50 | III | ME / II  \\n65 | 1702700071 | SHIVAM SINGH | 1608 | 89.33 | I | CIVIL  \\n66 | 1702700041 | MOHD FAIZAN | 1601 | 88.94 | II | CIVIL  \\n67 | 1702700050 | PRIYANKA | 1601 | 88.94 | II | CIVIL  \\n68 | 1702700049 | PRAVEEN KUNAL | 1597 | 88.72 | III | CIVIL  \\n69 | 1702710189 | YUKTA CHAUHAN | 1652 | 91.78 | I | CSE  \\n70 | 1702710081 | MANISH SHARMA | 16

In [16]:
# Chunk text stored as metadata for the fourth returned match.
results["matches"][3]["metadata"]["source_text"]

'| ANUJ KUMAR | 1178 | 1400 | 84.14285714 | II | MCA / II  \\n3 | 1702714023 | SHIVANGI GARG | 1151 | 1400 | 82.21428571 | III | MCA / II  \\n1 | 1802714003 | MANSI NIGAM | 1161 | 1400 | 82.92857143 | I | MCA / I  \\n2 | 1802714005 | PREETI JAISWAL | 1124 | 1400 | 80.28571429 | II | MCA / I  \\n3 | 1802714004 | MUKUL | 1108 | 1400 | 79.14285714 | III | MCA / I  \\n### \\n**SL. NO.** | **UNIVERSITY ROLL NO.** | **NAME OF STUDENTS** | **MARKS OBTAINED** | **% MARKS** | **POSITION IN COLLEGE** | **BRANCH / YEAR** | **UNIVERSITY POSITION**  \\n---|---|---|---|---|---|---|---  \\n1 | 1402700094 | SACHIN MAURYA | 4348 | 86.96 | I | CIVIL / IV | 5th Rank in CE Branch at University level.  \\n2 | 1402700108 | SHUBHAM ARORA | 4312 | 86.24 | II | CIVIL / IV | 9th Rank in CE Branch at University level.  \\n3 | 1402700042 | DEEKSHA SINGH | 4300 | 86.00 | III | CIVIL / IV  \\n4 | 1402710107 | PREETI GUPTA | 4472 | 89.44 | I | CSE / IV | 1st Rank in CSE Branch at University level. (I Rank across all