In [None]:
#upload docs and query
#upload audio file and query
#provide url and query page

In [5]:
# Shared LLM handle; the question-answering chains built later in this
# notebook all receive this `llm` object.
# NOTE(review): no credentials are passed here, so the API key is presumably
# picked up from the environment — confirm before running on a fresh machine.
from langchain.llms import GooglePalm
llm = GooglePalm()

In [2]:
# Smoke-test the LLM with a small batch of prompts.
# The prompts must be passed as a list; the result holds one list of candidate
# generations per prompt, in the same order as the prompts.
prompts = ["what does the fox say?", 'who is humpty dumpty?']
# Use the public `generate` entry point rather than the private `_generate`
# hook, which is an implementation detail of the LLM class.
llm_result = llm.generate(prompts)

# Print the first candidate for every prompt (however many prompts there are).
for candidates in llm_result.generations:
    print(candidates[0].text)

Ring-ding-ding-ding-ding-ding

Wa-pa-pa-pa-pa-pa-pa

Mow-mow-mow-mow-mow-mow

Nana-na-na-na-na-na

Eh-eh-eh-eh-eh-eh

Ting-a-ling-a-ling-a-ling-a-ling
a character in a poem


In [3]:
# import langchain dir loader from document loaders
from langchain.document_loaders import DirectoryLoader

# folder that holds the input documents
directory = 'data'

def load_docs(directory):
  """Return the documents loaded by a DirectoryLoader built on `directory`.

  Args:
    directory: path handed unchanged to DirectoryLoader.

  Returns:
    Whatever the loader's `load()` call returns.
  """
  return DirectoryLoader(directory).load()

# Load from `directory`; the bare last expression displays how many
# documents came back.
# NOTE(review): the recorded output for this cell is 0, i.e. nothing was
# loaded in that run — check the working directory / path before relying
# on `documents` in later cells.
documents = load_docs(directory)
len(documents)

0

In [None]:
# use text splitter to split text in chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter #This text splitter is the recommended one for generic text. It tries to split on them in order until the chunks are small enough

# chunk the loaded docs with the recursive character splitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  """Split `documents` into chunks using RecursiveCharacterTextSplitter.

  Args:
    documents: the documents to split.
    chunk_size: forwarded to the splitter as `chunk_size` (default 1000).
    chunk_overlap: forwarded to the splitter as `chunk_overlap` (default 20).

  Returns:
    The result of the splitter's `split_documents(documents)` call.
  """
  splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# store the split documents in the `docs` variable
# (uses split_docs' default chunk_size and chunk_overlap)
docs = split_docs(documents)

## Interacting With a Single PDF

In [5]:
# Convert our PDF into LangChain documents.
from langchain.document_loaders import PyPDFLoader
pdf_loader = PyPDFLoader('./data/RachelGreenCV.pdf')
documents = pdf_loader.load() # returns a list of Document objects, one Document for each page of the PDF

In [6]:
# Ask a question directly over the loaded PDF pages: every entry in
# `documents` is handed to the chain as input, with no embedding or
# retrieval step in between.
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm=llm)
query = 'Who is the CV about?'
response = chain.run(input_documents=documents, question=query)
print(response)

Rachel Green


## Interacting With a Single PDF Using Embeddings

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain.vectorstores import Chroma
from langchain.embeddings import GooglePalmEmbeddings

In [8]:
# load the document as before; this rebinds `documents` to the freshly
# loaded pages, which the next cell then splits into chunks
loader = PyPDFLoader('./data/RachelGreenCV.pdf')
documents = loader.load()

In [9]:
# we split the data into chunks of 1,000 characters, with an overlap
# of 200 characters between the chunks, which helps to give better results
# and contain the context of the information between chunks
# NOTE: the result is written back to `documents`, replacing the unsplit
# pages. Re-running only this cell therefore splits the already-split
# chunks again; re-run the loading cell first to start from the raw pages.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(documents)

In [14]:
# we create our vectorDB, using GooglePalmEmbeddings to create
# embeddings from our text chunks.
# NOTE(review): no persist_directory (or any other storage option) is passed
# here, so nothing in this call points the store at ./data — presumably the
# index lives in memory only; confirm against the Chroma documentation.
vectordb = Chroma.from_documents(
  documents,
  embedding=GooglePalmEmbeddings(),
)

In [22]:
from langchain.chains import RetrievalQA

# Retrieval-backed Q&A: instead of receiving documents directly, the chain
# gets its context from `vectordb` through the retriever configured below.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    # k=7: presumably the number of chunks fetched per query — confirm
    retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
    # source documents are requested, although only result['result'] is printed below
    return_source_documents=True
)

# we can now execute queries against our Q&A chain
result = qa_chain({'query': 'Who is the CV about?'})
print(result['result'])

The CV is about a person named "Green, R.".


### Adding Chat History

In [16]:
# from langchain.chains import ConversationalRetrievalChain
# qa_chain = ConversationalRetrievalChain.from_llm(
#     llm,
#     vectordb.as_retriever(search_kwargs={'k': 6}),
#     return_source_documents=True
# )

In [None]:
# for each question and answer, we will build up a list called chat_history , which we will pass back into the chain run command each time.
# import sys

# chat_history = []
# while True:
#     # this prints to the terminal, and waits to accept an input from the user
#     query = input('Prompt: ')
#     # give us a way to exit the script
#     if query == "exit" or query == "quit" or query == "q":
#         print('Exiting')
#         sys.exit()
#     # we pass in the query to the LLM, and print out the response. As well as
#     # our query, the context of semantically relevant information from our
#     # vector store will be passed in, as well as list of our chat history
#     result = qa_chain({'question': query, 'chat_history': chat_history})
#     print('Answer: ' + result['answer'])
#     # we build up the chat_history list, based on our question and response
#     # from the LLM, and the script then returns to the start of the loop
#     # and is again ready to accept user input.
#     chat_history.append((query, result['answer']))

## Interacting With Multiple Documents

In [24]:
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import TextLoader
import os

In [26]:
# Load every supported file in the data folder into one flat list of documents.
DATA_DIR = './data'

# File extension -> loader class used to read files with that extension.
# NOTE(review): legacy '.doc' is routed to Docx2txtLoader exactly as before;
# confirm that loader can actually read binary .doc files.
LOADERS_BY_EXTENSION = {
    '.pdf': PyPDFLoader,
    '.docx': Docx2txtLoader,
    '.doc': Docx2txtLoader,
    '.txt': TextLoader,
}

documents = []
# os.listdir returns entries in arbitrary order; sorting keeps the document
# (and therefore chunk) order reproducible between runs.
for file in sorted(os.listdir(DATA_DIR)):
    # Compare extensions case-insensitively so e.g. 'CV.PDF' is not skipped.
    extension = os.path.splitext(file)[1].lower()
    loader_cls = LOADERS_BY_EXTENSION.get(extension)
    if loader_cls is None:
        # Unsupported file type: ignore it, as the original cell did.
        continue
    # Build the path with os.path.join instead of string concatenation so the
    # folder name lives in exactly one place (DATA_DIR).
    loader = loader_cls(os.path.join(DATA_DIR, file))
    documents.extend(loader.load())

In [27]:
# Split the combined documents into chunks. The result is stored under a new
# name, so `documents` keeps the unsplit originals and this cell can be re-run.
# Note that chunk_overlap is 10 here, versus 200 in the single-PDF section.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
chunked_documents = text_splitter.split_documents(documents)

In [28]:
# Build the vector store from the multi-file chunks. This rebinds `vectordb`,
# so the name no longer refers to the store built in the single-PDF section.
vectordb = Chroma.from_documents(
  chunked_documents,
  embedding=GooglePalmEmbeddings(),
)

In [31]:
from langchain.chains import RetrievalQA

# Same retrieval-backed Q&A setup as in the single-PDF section, now on top of
# the multi-file `vectordb`.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    # k=7: presumably the number of chunks fetched per query — confirm
    retriever=vectordb.as_retriever(search_kwargs={'k': 7}),
    # source documents are requested, although only result['result'] is printed below
    return_source_documents=True
)

# we can now execute queries against our Q&A chain
# NOTE(review): the chain answers from the chunks the retriever returns, so a
# corpus-level question such as this one may not be answered reliably — verify.
result = qa_chain({'query': 'how many cvs are there?'})
print(result['result'])

1


### Read All Docs 

In [1]:
# import langchain dir loader from document loaders
from langchain.document_loaders import DirectoryLoader

# folder that holds the input documents
directory = 'data/'

def load_docs(directory):
  """Return the documents loaded by a DirectoryLoader built on `directory`.

  Args:
    directory: path handed unchanged to DirectoryLoader.

  Returns:
    Whatever the loader's `load()` call returns.
  """
  directory_loader = DirectoryLoader(directory)
  return directory_loader.load()

# Load from `directory`; the bare last expression displays how many
# documents came back (3 in the recorded run).
documents = load_docs(directory)
len(documents)

3

In [2]:
# use text splitter to split text in chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter #This text splitter is the recommended one for generic text. It tries to split on them in order until the chunks are small enough

# chunk the loaded docs with the recursive character splitter
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  """Split `documents` into chunks using RecursiveCharacterTextSplitter.

  Args:
    documents: the documents to split.
    chunk_size: forwarded to the splitter as `chunk_size` (default 1000).
    chunk_overlap: forwarded to the splitter as `chunk_overlap` (default 20).

  Returns:
    The result of the splitter's `split_documents(documents)` call.
  """
  splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
  )
  return splitter.split_documents(documents)

# store the split documents in the `docs` variable
# (uses split_docs' default chunk_size and chunk_overlap)
docs = split_docs(documents)

In [3]:
# embeddings using langchain; this section embeds with the
# "all-MiniLM-L6-v2" sentence-transformer model rather than the
# GooglePalmEmbeddings used in the earlier sections
from langchain.embeddings import SentenceTransformerEmbeddings
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# using chromadb as a vector store and storing the split docs in it
from langchain.vectorstores import Chroma
db = Chroma.from_documents(docs, embeddings)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Using q&a chain to get the answer for our query.
# verbose=True is what produces the chain / prompt trace shown in the output
# of the query cell.
# NOTE(review): `llm` is not created in any visible cell of this section, so
# it has to exist in the kernel already — confirm it is defined before this
# cell when running on a fresh kernel.
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff",verbose=True)

In [10]:
# write your query and perform similarity search to generate an answer:
# first fetch the stored chunks that match the query from `db`, then pass
# only those chunks (plus the question) to the QA chain
query = "who has the highest job experience?"
matching_docs = db.similarity_search(query)
answer =  chain.run(input_documents=matching_docs, question=query)
# bare last expression so the notebook displays the answer
answer



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Robert Roberts, Professor of English University of Illinois at Urbana-Champaign (217) 333-0203, rrobe3@illinois.edu

Sally Briscoe, Assoc. Professor of English Butler University, Indianapolis, IN (317) 492-8763, briscoe@butler.edu

Rachel Green, page 3 of 3

5

grad.illinois.edu/CareerDevelopment

20xx-20xx

Rachel Green, page 2 of 3

4

grad.illinois.edu/CareerDevelopment

PROFESSIONAL SERVICE Managing Editor Southern Literary Journal   Oversee production and publication procedures.  Maintain editorial correspondence with prospective contributors. 

Process manuscripts submitted for publication

Conduct business transactions including publicity, subscriptions and advertising.

20xx-prese

'Robert Roberts'