In [6]:
#importing libraries

import openai
import langchain
import pinecone

from langchain.document_loaders import PyPDFDirectoryLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.embeddings.openai import OpenAIEmbeddings 
from pinecone import Pinecone 
from langchain.llms import OpenAI


In [2]:
#loading all environment variables
# load_dotenv() is python-dotenv's loader; the `True` shown as this cell's output is its return value.
# NOTE(review): presumably the OpenAI / Pinecone API keys are meant to come from the .env file — confirm.

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os

In [4]:
#reading document

def read_doc(directory):
    """Load the PDF files found under ``directory``.

    Args:
        directory: Path handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(directory).load()`` returns
        (a list of ``Document`` objects in the cell output further down).
    """
    return PyPDFDirectoryLoader(directory).load()
    

In [7]:
# Load the PDFs under documents/ and echo the resulting list as the cell output.
# NOTE(review): the echoed output contains personal contact details (e-mail, phone, address)
# taken from the source PDF — consider clearing this output before sharing the notebook.
doc=read_doc('documents/')
doc

[Document(page_content='Umnoon Binta Ali\nAddress: Sector 18, Uttara, Dhaka\nE-mail: umnoonbintaali@gmail.com ✼Phone: +880 1627 097 553\nLinkedIn: linkedin.com/in/umnoonbintaali/\nPortfolio: https://umnoon.github.io/\nObjective\nHighly motivated Data Scientist with a strong foundation in statistical analysis, machine learning and\nartificial intelligence. Possess experience in designing and implementing data pipelines, building models,\nleveraging cloud platforms and communicating insights to stakeholders.\nWork experience\nAstha.IT (Bangladesh) September 2023 - Current\nData Scientist Dhaka, Bangladesh\n•Developed and implemented PySpark transformations within a drag-and-drop ETL tool, optimizing\ndata processing efficiency for a seamless user experience.\n•Led the fine-tuning of a Keras model in a gaming analytics project, enhancing the accuracy of\nevent detection in real-time gameplay data.\n•Spearheaded the fine-tuning of a Keras model in a gaming analytics project, enhancing the 

In [8]:
# Number of Document objects returned by read_doc.
len(doc)

4

In [9]:
#creating text chunks

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [10]:
# Split the loaded documents using chunk_data's defaults (chunk_size=800, chunk_overlap=50)
# and echo the resulting chunks as the cell output.
documents=chunk_data(doc)
documents

[Document(page_content='Umnoon Binta Ali\nAddress: Sector 18, Uttara, Dhaka\nE-mail: umnoonbintaali@gmail.com ✼Phone: +880 1627 097 553\nLinkedIn: linkedin.com/in/umnoonbintaali/\nPortfolio: https://umnoon.github.io/\nObjective\nHighly motivated Data Scientist with a strong foundation in statistical analysis, machine learning and\nartificial intelligence. Possess experience in designing and implementing data pipelines, building models,\nleveraging cloud platforms and communicating insights to stakeholders.\nWork experience\nAstha.IT (Bangladesh) September 2023 - Current\nData Scientist Dhaka, Bangladesh\n•Developed and implemented PySpark transformations within a drag-and-drop ETL tool, optimizing\ndata processing efficiency for a seamless user experience.', metadata={'source': 'documents\\resume_of_Umnoon_Binta_Ali.pdf', 'page': 0}),
 Document(page_content='•Led the fine-tuning of a Keras model in a gaming analytics project, enhancing the accuracy of\nevent detection in real-time game

In [11]:
# Number of chunks produced by chunk_data.
len(documents)

12

In [24]:
#embeddings with openai
# The API key must come from the environment (load_dotenv() earlier in the notebook
# reads it from .env). Do not assign a literal key here: it would overwrite the loaded
# value and risks committing a real secret together with the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; define it in your .env file or environment.")
embeddings=OpenAIEmbeddings()


In [13]:
# Sanity check: embed one sample query and echo the returned list of floats.
# Despite the plural name, `vectors` holds the single embedding returned by embed_query.
vectors=embeddings.embed_query("How are you?")
vectors

[-0.016826705647666553,
 -0.012123129071598045,
 0.006685004426783297,
 -0.025999297763816813,
 -0.01623412932048443,
 0.01769087984619154,
 -0.011116983131508934,
 -0.00993800328303738,
 -0.01819703875352116,
 -0.010437989384474315,
 0.0278510985534303,
 0.0016558194412100663,
 -0.007320789929552944,
 -0.0116478332624093,
 0.0072220268979150725,
 -0.015382300733744815,
 0.028418985519686784,
 -0.011845359325685043,
 0.013974930792534087,
 -0.02059203528582039,
 0.0024999323698297834,
 0.0063640255052827685,
 0.0010200341712710583,
 -0.008246690790020965,
 -0.01585142280571832,
 -0.007765222174939534,
 0.025122777953506453,
 -0.012425590041081791,
 0.022271003098373987,
 -0.02519684976157358,
 0.0056418230483219,
 0.007752877028815439,
 -0.013172483721613405,
 0.004083223088030575,
 0.00882074963084758,
 -0.022320383682870367,
 0.004046186718335735,
 -0.010481198094400564,
 0.02032043927712263,
 -0.0063516803591586735,
 0.027085687386543184,
 0.0012870022053637744,
 -0.0052622034021634

In [14]:
# Length of the embedding list returned by embed_query above.
len(vectors)

1536

In [15]:
#vector search db in pinecone

from langchain_pinecone import PineconeVectorStore

# Credentials come from the environment (load_dotenv() earlier in the notebook reads
# them from .env) instead of being hardcoded: a literal assignment here would overwrite
# the loaded value and risks committing a real secret together with the notebook.
if not os.environ.get('PINECONE_API_KEY'):
    raise RuntimeError("PINECONE_API_KEY is not set; define it in your .env file or environment.")

index_name="langchainvector"

# Index the chunked `documents` (output of chunk_data), not the raw per-page `doc` list;
# otherwise the chunking step above has no effect on what gets embedded and retrieved.
# NOTE(review): re-running this cell presumably upserts the same chunks again — confirm
# whether the index should be cleared or reused between runs.
index = PineconeVectorStore.from_documents(documents, index_name=index_name, embedding=embeddings)

In [16]:
#applying cosine similarity retrieve results from vector db
def retrieve_query(query, k=2):
    """Look up the entries of the notebook-level ``index`` most similar to ``query``.

    Args:
        query: Search text passed to ``index.similarity_search``.
        k: Number of results requested from the vector store (default 2).

    Returns:
        The result of ``index.similarity_search(query, k=k)``.
    """
    return index.similarity_search(query, k=k)
    

In [20]:
from langchain.chains.question_answering import load_qa_chain

# `OpenAI` is already imported in the first cell (from langchain.llms); the extra
# root-level `from langchain import OpenAI` was redundant and used a deprecated import path.
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0.5)
# Question-answering chain built on `llm`; chain_type="stuff" is passed through to load_qa_chain.
chain = load_qa_chain(llm, chain_type="stuff")

In [25]:
#search answers from vector db

def retrieve_answers(query):
    """Answer ``query`` from documents retrieved out of the vector store.

    Fetches matching documents with ``retrieve_query`` and passes them, together
    with the question, to the notebook-level ``chain``.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chain.run``.
    """
    context_docs = retrieve_query(query)
    return chain.run(input_documents=context_docs, question=query)

In [26]:
# Ask which university the candidate attended and print the chain's answer.
query = "which university did the candidate attended?"
answer = retrieve_answers(query)
print("Answer:", answer)

Answer:  North South University


In [27]:
# Ask where the candidate works and print the chain's answer.
query = "Where do the candidate work at?"
answer = retrieve_answers(query)
print("Answer:", answer)

Answer:  The candidate works at Astha.IT in Dhaka, Bangladesh.


In [29]:
# Ask about the candidate's domain of expertise and print the chain's answer.
query = "Which domain is the candidate expert in?"
answer = retrieve_answers(query)
print("Answer:", answer)

Answer:  The candidate is an expert in data science, machine learning, artificial intelligence, linear algebra, and programming languages such as Python and R. They also have proficiency in English and are a native speaker of Bengali.
