In [49]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

In [50]:
from dotenv import load_dotenv

load_dotenv()

True

In [51]:
import os

In [52]:
##Lets read the document
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

In [53]:
doc=read_doc('documents/')
doc

[Document(metadata={'source': 'documents/VishalGautamCV.pdf', 'page': 0}, page_content=" \nVishal Gautam \nFull – Stack Software Engineer \nDynamic and results-driven software engineer with a focus on full-stack development, data solutions, and machine learning. Aiming to leverage \nextensive experience in the banking sector to drive innovative technology solutions. \n \n   vishalgautam.tech@gmail.com     +91 9110091875 \n https://github.com/ThatNinjaGuy         https://www.linkedin.com/in/vishal-gautam-17b873108  \n \n \n CORE COMPETENCIES \n \n SOFT SKILLS \n \n \n \n EDUCATION \n B.Tech., Electronics and Communications, Delhi \nTechnological University (Formerly DCE), 2018 \nMinor Project: Android Application for \nMicrocontroller Programming \nMajor Project: Android Application for Creating \nApps Without Code \n TECHNICAL SKILLS \n Back-end Technologies: Java, Spring Boot, Spring \nBatch, Spring Reactive, Hibernate, REST API, \nPython \n Front-end Technologies: ReactJS, Next.js, \

In [54]:
# Divide the docs into chunks

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(docs)
    return chunks

In [55]:
document = chunk_data(docs=doc)
document
# len(document)


[Document(metadata={'source': 'documents/VishalGautamCV.pdf', 'page': 0}, page_content='Vishal Gautam \nFull – Stack Software Engineer \nDynamic and results-driven software engineer with a focus on full-stack development, data solutions, and machine learning. Aiming to leverage \nextensive experience in the banking sector to drive innovative technology solutions. \n \n   vishalgautam.tech@gmail.com     +91 9110091875 \n https://github.com/ThatNinjaGuy         https://www.linkedin.com/in/vishal-gautam-17b873108  \n \n \n CORE COMPETENCIES \n \n SOFT SKILLS \n \n \n \n EDUCATION \n B.Tech., Electronics and Communications, Delhi \nTechnological University (Formerly DCE), 2018 \nMinor Project: Android Application for \nMicrocontroller Programming \nMajor Project: Android Application for Creating \nApps Without Code \n TECHNICAL SKILLS \n Back-end Technologies: Java, Spring Boot, Spring'),
 Document(metadata={'source': 'documents/VishalGautamCV.pdf', 'page': 0}, page_content='Batch, Spring 

In [64]:
# Embedding technique with OpenAI
embeddings = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])

In [57]:
vectors=embeddings.embed_query("How are you?")
vectors
# len(vectors)
# 1536

[-0.016759382679243706,
 -0.012075887146964084,
 0.006688943180011325,
 -0.025965934802740547,
 -0.01615466188875786,
 0.017610926305488096,
 -0.01114412498151107,
 -0.009940855275284953,
 -0.01817862329808113,
 -0.010409821829722957,
 0.027841801020492562,
 0.001653724012471747,
 -0.0074170753883764425,
 -0.011613091535949076,
 0.007213445431288549,
 -0.015303117331190889,
 0.02836013232834679,
 -0.011872257189876188,
 0.014007289061555319,
 -0.020609842758935026,
 0.002516066906121701,
 0.006355730395959874,
 0.0009880692302201067,
 -0.008213084249104193,
 -0.015907837190354157,
 -0.007713264840196371,
 0.025015659807018545,
 -0.012372076598784011,
 0.02227590807617615,
 -0.025114391176496158,
 0.005606000816936851,
 0.007700923419011669,
 -0.013131061661618947,
 0.004029409522716261,
 0.008805463152744048,
 -0.022325273760914956,
 0.004038665472189465,
 -0.01043450467209236,
 0.020313654238437676,
 -0.006337218497013467,
 0.027027279329495824,
 0.0012595761889165997,
 -0.00528204398

In [59]:
# Vector search db in pinecone
index_name = 'job-question-responder'
vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)
index = vector_store.add_documents(document)
index


['b606c235-d8a0-4d26-9a49-cbc09e233f87',
 '36b8eb05-18d0-4ad2-a307-44c35a549aa0',
 'ad506bec-22b6-4b35-a044-93c9988edcfc',
 '04d24ca5-8b1a-4720-b4d7-bc765d31d4b3',
 '98d08d63-24de-4cee-914b-356b5d80154d',
 '13197441-bd87-4276-bcfa-5e7ecf13478f',
 '3cf72e03-0d40-4925-a726-80182869d366',
 'a4701fc3-6dfa-402a-a4da-b6203882eeaf',
 '004e357c-86c5-4e5f-8c9c-2a065c8a8875',
 '712eb260-2872-4480-9d54-010a1ae6624d',
 '3463e65d-167d-4a9d-9fcd-16fe9fd4b33d',
 'e49b0315-be31-4000-a5c8-eaf93b8c0e06',
 '2e609ef8-c595-4ebd-a94e-37eed6f4b69b',
 'a6f7d907-732a-40b6-aca9-49eca4104090']

In [60]:
# Retrieve Cosine similarity results
def retrieve_query(query, k=2):
    results = vector_store.similarity_search(query, k=k)
    return results

In [61]:
# Search answers from vector database
def retrieve_answers(query):
    doc_search = retrieve_query(query)
    return doc_search

In [63]:
our_query = "Hazelcast cache"
answer = retrieve_answers(our_query)
print(answer)

[Document(id='230a719c-42a0-41ad-93c0-296fa591b846', metadata={'page': 1.0, 'source': 'documents/VishalGautamCV.pdf'}, page_content="Senior Software Engineer                                                                                                                                                                                                  Jan’22 – Dec’23 \n Led the development of a scalable Hazelcast cache solution , optimizing data access with latency ranging from 8-100ms and handling 10,000 \nrequests per second, improving system responsiveness and performance. \n Created and deployed a Marketing Metrics analytic s application, enabling CCB's marketing team to gain actionable insights, leading to a 91% \nincrease in leads, 60% revenue growth, and a 15% improvement in revenue/spend ratio."), Document(id='004e357c-86c5-4e5f-8c9c-2a065c8a8875', metadata={'page': 1.0, 'source': 'documents/VishalGautamCV.pdf'}, page_content="Senior Software Engineer                              