In [1]:
from langchain.document_loaders import PyPDFLoader
import langchain
from langchain.document_loaders import ReadTheDocsLoader, TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

In [3]:
import os
import pickle

In [6]:
## functions to ingest a pdf, a txt file, or a directory of pdfs into the vector database
def ingest_pdfs(location: str, dbname: str) -> None:
    """Embed a single PDF and pickle the resulting FAISS vector store.

    Args:
        location: Path of the PDF, relative to ``./dataset/``.
        dbname: Base name of the output file; the store is written to
            ``./vs_test/<dbname>.pkl``.

    Returns:
        None. The vector store is only persisted to disk.
    """
    # Make sure the output folder exists before doing any expensive work;
    # otherwise open() below fails with FileNotFoundError on a fresh checkout.
    out_path = "./vs_test/" + dbname + ".pkl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    loader = PyPDFLoader('./dataset/'+location)
    raw_documents = loader.load()

    # Break the loaded documents into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)

    # NOTE(review): OpenAIEmbeddings() is built with defaults, so credentials
    # presumably come from the environment - confirm before running.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore
    with open(out_path, "wb") as f:
        pickle.dump(vectorstore, f)
        
def ingest_txts(location: str, dbname: str) -> None:
    """Embed a single text file and pickle the resulting FAISS vector store.

    Args:
        location: Path of the text file, relative to ``./dataset/``.
        dbname: Base name of the output file; the store is written to
            ``./vs_test/<dbname>.pkl``.

    Returns:
        None. The vector store is only persisted to disk.
    """
    # Make sure the output folder exists before doing any expensive work;
    # otherwise open() below fails with FileNotFoundError on a fresh checkout.
    out_path = "./vs_test/" + dbname + ".pkl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    loader = TextLoader('./dataset/'+location)
    raw_documents = loader.load()

    # Break the loaded documents into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)

    # NOTE(review): OpenAIEmbeddings() is built with defaults, so credentials
    # presumably come from the environment - confirm before running.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore
    with open(out_path, "wb") as f:
        pickle.dump(vectorstore, f)

def ingest_dir(location: str, dbname: str) -> None:
    """Embed every PDF under a folder and pickle the resulting FAISS store.

    Files are discovered with the glob ``**/*.pdf`` below
    ``./dataset/<location>`` and each one is read with ``PyPDFLoader``.
    Every chunk's text is prefixed with ``unique_file_name:<source>`` (taken
    from the chunk's ``metadata['source']``) followed by a blank line, so the
    originating file is part of the embedded text.

    Args:
        location: Folder to scan, relative to ``./dataset/``.
        dbname: Base name of the output file; the store is written to
            ``./vs/<dbname>.pkl``.

    Returns:
        None. The vector store is only persisted to disk.
    """
    # NOTE(review): this writes to ./vs/ whereas ingest_pdfs and ingest_txts
    # write to ./vs_test/ - confirm the difference is intentional.
    out_path = "./vs/" + dbname + ".pkl"
    # Create the output folder up front; otherwise open() below fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    loader = DirectoryLoader('./dataset/'+location, glob='**/*.pdf', show_progress=True, loader_cls=PyPDFLoader)
    raw_documents = loader.load()

    # Break the loaded documents into overlapping chunks before embedding.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)

    # Tag each chunk with its source file so the file name is embedded
    # together with the chunk text.
    for doc in documents:
        doc.page_content = "unique_file_name:" + doc.metadata['source'] + "\n\n" + doc.page_content

    # NOTE(review): OpenAIEmbeddings() is built with defaults, so credentials
    # presumably come from the environment - confirm before running.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore
    with open(out_path, "wb") as f:
        pickle.dump(vectorstore, f)

In [7]:
## embed the sample pdf from ./dataset/ and pickle its vector store as "pdf"
ingest_pdfs(location='FI21052700_00002_19270122.pdf', dbname='pdf')

In [8]:
## read the vector db to memory
## WARNING: pickle.load can execute arbitrary code while unpickling - only
## load .pkl files that were written by this notebook, never untrusted ones.
with open("./vs_test/pdf.pkl", "rb") as f:
    # A plain assignment at cell (module) level already creates a notebook-wide
    # name, so no `global` statement is needed here.
    pdf_vs = pickle.load(f)

In [15]:
## search with a full natural-language question; returns the top 5 (document, score) pairs
## NOTE(review): the saved output under this cell is a coroutine repr from
## `asimilarity_search`, which does not match this call - presumably stale; re-run to refresh.
pdf_vs.similarity_search_with_score('what does miami beach garden theatre perform', k=5)

<coroutine object VectorStore.asimilarity_search at 0x000001C7ACB2EE40>

In [10]:
## search with a specific phrase; returns the top 5 (document, score) pairs
pdf_vs.similarity_search_with_score('miami beach garden theatre', k=5)

[(Document(page_content='General  Admission  50c Cents\nMusic  by Frank  Novak*s  Chicago-Miami\nOrchestra\nLADIES  FREE\nMONDAY,  WEDNESDAY  and FRIDAY  NIGHTS\nHow to get there —From Miami  follow  the Dixie to 115th St. and \nturn left at big sign. Busses  leave Central  school,  N. E. Third  \nstreet,  direct  to track.\nFrom Miami  Beach:\nDe LUXE  BUS SERVICE  direct  to the track. Busses  leave cor\xad\nner of Fifth street and Washington  avenue  at 7:10 p. m.; Wash \xad\nington  avenue  and Espanola  way at 7:20.\nFREE  PARKINGPhone  Miami  Beach  2119 for Reservations\nMIAMI  BEACH  GARDEN  THEATRE\nAlton Road at 41st Street  \nNear Nautilus  Hotel\nDrive  from  Miami  over either  causeway,  turn left at Alton  Rd.\n50 cents to $1.65  Seats on sale Phone M. B. 2119\nCurtain  at 8:30 P. M.\n500 SEATS  AT FIFTY  CENTS\nWILD TO MODERN\nwild nature  to a modern  city \ndependable  Electric  service.The span from\nIs bridged  by\nPeople,  business,  trade and industry  naturally  

In [64]:
## build a retriever on top of the vector db that filters results with a
## relevance score threshold of 0.2
ret = pdf_vs.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.2),
)
## build a ConversationalRetrievalChain that answers questions using `ret`
## and also hands back the documents it retrieved
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-4", temperature=0.7),
    retriever=ret,
    return_source_documents=True,
)

In [67]:
## query the ConversationalRetrievalChain; it returns the answer plus the relevant documents
## start from an empty conversation history
chat_history = []
query = "miami beach garden theatre"
result = chain(dict(question=query, chat_history=chat_history))

In [68]:
## show the chain output: the question, chat history, answer and source documents
result

{'question': 'miami beach garden theatre',
 'chat_history': [],
 'answer': 'The Miami Beach Garden Theatre is located at Alton Road at 41st Street, near the Nautilus Hotel. You can drive from Miami over either causeway and turn left at Alton Rd. It offers 500 seats at fifty cents each. The curtain time is at 8:30 P.M. You can make reservations by calling M. B. 2119. Free parking is also available.',
 'source_documents': [Document(page_content='General  Admission  50c Cents\nMusic  by Frank  Novak*s  Chicago-Miami\nOrchestra\nLADIES  FREE\nMONDAY,  WEDNESDAY  and FRIDAY  NIGHTS\nHow to get there —From Miami  follow  the Dixie to 115th St. and \nturn left at big sign. Busses  leave Central  school,  N. E. Third  \nstreet,  direct  to track.\nFrom Miami  Beach:\nDe LUXE  BUS SERVICE  direct  to the track. Busses  leave cor\xad\nner of Fifth street and Washington  avenue  at 7:10 p. m.; Wash \xad\nington  avenue  and Espanola  way at 7:20.\nFREE  PARKINGPhone  Miami  Beach  2119 for Reserv

In [32]:
## call the thresholded retriever directly (no LLM involved) to inspect which chunks it returns
ret.get_relevant_documents(query='what does miami beach garden theatre perform')

[Document(page_content='General  Admission  50c Cents\nMusic  by Frank  Novak*s  Chicago-Miami\nOrchestra\nLADIES  FREE\nMONDAY,  WEDNESDAY  and FRIDAY  NIGHTS\nHow to get there —From Miami  follow  the Dixie to 115th St. and \nturn left at big sign. Busses  leave Central  school,  N. E. Third  \nstreet,  direct  to track.\nFrom Miami  Beach:\nDe LUXE  BUS SERVICE  direct  to the track. Busses  leave cor\xad\nner of Fifth street and Washington  avenue  at 7:10 p. m.; Wash \xad\nington  avenue  and Espanola  way at 7:20.\nFREE  PARKINGPhone  Miami  Beach  2119 for Reservations\nMIAMI  BEACH  GARDEN  THEATRE\nAlton Road at 41st Street  \nNear Nautilus  Hotel\nDrive  from  Miami  over either  causeway,  turn left at Alton  Rd.\n50 cents to $1.65  Seats on sale Phone M. B. 2119\nCurtain  at 8:30 P. M.\n500 SEATS  AT FIFTY  CENTS\nWILD TO MODERN\nwild nature  to a modern  city \ndependable  Electric  service.The span from\nIs bridged  by\nPeople,  business,  trade and industry  naturally  g

In [11]:
## search with a single general word; returns the top 5 (document, score) pairs
pdf_vs.similarity_search_with_score('theatre', k=5)

[(Document(page_content='OLYMPIA  THEATRE —Magnificent  in every detail. Fine sym\xad\nphony orchestra  and Sanley  Malotte  at the big wind organ. One of \nour clever  columnists  was sorta harsh  with this boy Malotte  last week  \nand personally  we can’t grow hysterical  over him either but he stops  \nthe show at every performance  and that, of course,  is really what  \ncounts.to\nCAS’A GRANDE —Up on the Dixie. Old timer Allison  aided and \nabetted  by Derby Jimmie  Hodges  serving  a salad of jazz, fun \nand dance.\nMERRICK ’S COUNTRY  CLUB —A perfect  spot in the heart of \nthe Gables. The Jovial  Jan Garber  and his crew of troubadours  fur\xad\nnish sweet music  amid gorgeous  tropical  surroundings.  Biltmore  serv \xad\nice and atmosphere.\nHOTEL  ANTILLA —Where  society  gathers  for their bridge and \ngossips  to the "Harmony ” of Joe Astoria\'s  minstrels.\nRONEY  PLAZA  POOLS —Thanks,  Newt, you sure fixed the old \nswimmin ’ hole up mighty  fine.\nCAPITOL  THEATRE —Me