In [1]:
from langchain.document_loaders import PyPDFLoader
import langchain
from langchain.document_loaders import ReadTheDocsLoader, TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

In [3]:
import pickle

In [6]:
## functions to ingest a pdf, a txt file, or a directory of pdfs into the vector database
def _split_documents(raw_documents):
    """Split loaded documents into overlapping chunks ready for embedding.

    Uses a RecursiveCharacterTextSplitter with chunk_size=1000 and
    chunk_overlap=200, the settings shared by every ingest function below.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    return text_splitter.split_documents(raw_documents)


def _embed_and_save(documents, out_dir, dbname):
    """Build a FAISS store from the chunks and pickle it to out_dir + dbname + ".pkl".

    Raises:
        ValueError: if `documents` is empty (nothing to index).
    """
    import os  # local import so this cell does not depend on another import cell

    # Fail early with a clear message instead of handing FAISS an empty list.
    if not documents:
        raise ValueError("No document chunks to index for '" + dbname + "'")

    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save vectorstore. The path is built by plain concatenation so the file
    # name is exactly what callers already expect.
    # NOTE(review): LangChain's FAISS store also provides save_local/load_local;
    # pickle is kept here because this notebook reads the store back with pickle.load.
    path = out_dir + dbname + ".pkl"
    parent = os.path.dirname(path)
    if parent:
        # Create the target folder on first use instead of failing in open().
        os.makedirs(parent, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(vectorstore, f)


def ingest_pdfs(location, dbname):
    """Ingest a single PDF file into a pickled FAISS vector store.

    Args:
        location: path of the PDF relative to ./dataset/.
        dbname: base name of the output file; written to ./vs_test/<dbname>.pkl.
    """
    loader = PyPDFLoader('./dataset/'+location)
    documents = _split_documents(loader.load())
    _embed_and_save(documents, "./vs_test/", dbname)


def ingest_txts(location, dbname):
    """Ingest a single text file into a pickled FAISS vector store.

    Args:
        location: path of the text file relative to ./dataset/.
        dbname: base name of the output file; written to ./vs_test/<dbname>.pkl.
    """
    loader = TextLoader('./dataset/'+location)
    documents = _split_documents(loader.load())
    _embed_and_save(documents, "./vs_test/", dbname)


def ingest_dir(location, dbname):
    """Ingest every PDF under a folder (recursively) into a pickled FAISS vector store.

    Each chunk's text is prefixed with "unique_file_name:<source>" taken from the
    chunk's metadata, so the originating file is part of the embedded content.

    Args:
        location: folder path relative to ./dataset/; searched with glob '**/*.pdf'.
        dbname: base name of the output file; written to ./vs/<dbname>.pkl.
    """
    loader = DirectoryLoader('./dataset/'+location, glob='**/*.pdf', show_progress=True, loader_cls=PyPDFLoader)
    documents = _split_documents(loader.load())
    for doc in documents:
        doc.page_content = "unique_file_name:" + doc.metadata['source']+"\n\n" + doc.page_content
    # NOTE(review): this writes to ./vs/ while the single-file functions write to
    # ./vs_test/ -- confirm the different folder is intended.
    _embed_and_save(documents, "./vs/", dbname)

In [7]:
## ingest one pdf from ./dataset/ into the vector store saved under the name 'pdf'
ingest_pdfs(
    location='FI21052700_00002_19270122.pdf',
    dbname='pdf',
)

In [5]:
## read the vector db to memory
# NOTE(security): pickle.load can execute arbitrary code from the file; only load
# pickles that you created yourself.
# NOTE(review): this path (../vs_test/miamilife_pdf_2k.pkl) is not one the ingest
# functions in this notebook write to (./vs_test/<dbname>.pkl or ./vs/<dbname>.pkl);
# confirm where this file comes from.
with open("../vs_test/miamilife_pdf_2k.pkl", "rb") as f:
    # A plain assignment at cell level already creates a notebook-global name,
    # so no `global` statement is needed.
    pdf_vs = pickle.load(f)

In [10]:
## search a question: fetch the 5 closest chunks together with their scores
results = pdf_vs.similarity_search_with_score(
    'Please list related responses from the society during the time of prohibition in Miami area',
    k=5,
)

In [11]:
## display the hits returned by the question search
results

[(Document(page_content='But wise barkeeps, who\'ve invested untold thousands inthe demand is because of\ntheHialeah police department bars and licenses and governmental liquor in thelast year or\non and find\nofb sinsfuantiesto, have anayzed the questionsw have to voteinvariably that they\'ll be virtually run out -owner of the new Paddockbuying such copiousto sugar hot coffee part f o f\'g\nMiami Beach,afoul ofBar, on Washington, at 7th rent drunks who run\nthe law.with a "Yes" vote.\nWhoever worded the proposals must have been paid off\na few days ago. The 11 a. m. story re-\nvealing minor details of the mysteriouspIenty\nIneffect, we are simply to revert to conditions as theyDo you remember Sports Edi-\ntor Jack Bell, then with the\nHerald, writing about the diffi-\ncult time Dale Gardner, acting\nas second and suffering from\nlocomotor ataxia, had walking up\nand down the arena steps at\nMadiwom Square Garden?\nHarry Graham, the noted fight\nreferee, furnishes a good wind-\nup.\n"D

In [9]:
## number of hits returned by the search (5 were requested)
len(results)

5

In [6]:
## search a specific phrase: fetch the 5 closest chunks together with their scores
pdf_vs.similarity_search_with_score(
    'miami beach garden theatre',
    k=5,
)

[(Document(page_content='SPOITN G PAn REAC- A i te UNDER NhEWA MANAGEMENTSP L I G RONEY PLAZA SotwietrTHnc ater dimn ~\nAmerica\'s Foremost T EA G A E _ WITH\nVIOLINIST TEA G Chester Alexander\nAT Saturday Afternoon, Jan. 1 a\nMrami4Beachntrom60Ja. t 14i aP.Miami Beach Fro 4:0Utl6PM Miamni\'s Sidesplitting Comedian, formerly of Jungle Inn\n-Garden Theatre c teday, o Thauersd te\nGarden Tet nhe fora n h , . BILLY PEEL RHODA FREED\nAlton Road and 41st St. DANCE MUSIC BY (Basso of th, Deep C\'s) (Mistress of Blues)\nAll Good Seats ....75c to $2.50 \' Armellini\'s Roney\nRESERVATIONS: Plaza Orchestra .\nMiami Ticket OConee 231 E iager So. e frh oinace P(re. 7 I 0The Lindbergh of Hoofers)Phone, 32075 mi WirClAND\nMiami Beach Bon Office 2119 M. B. - . e\nBnanen tenor Miami. 23i E. Fliger St. Wam. G. McMeehin\nPeonce do Leon Hotel) for Conernt - Manager 33 N.E. 2ND AV- ! FST. Frank Madden and His Orchestra\nAll Next Week, "RAIN"', metadata={'source': 'dataset\\miamilife_pdf\\FI21052703_00002.

In [17]:
## create a retriever from the vector db that only returns chunks whose
## relevance score passes the 0.2 threshold (at most 15 chunks per query)
ret = pdf_vs.as_retriever(
    search_kwargs={'score_threshold': 0.2, 'k': 15},
    search_type="similarity_score_threshold",
)

## create a ConversationalRetrievalChain that answers questions using `ret`;
## return_source_documents=True keeps the retrieved chunks in the result
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-4", temperature=0.7),
    retriever=ret,
    return_source_documents=True,
)

In [18]:
## query the ConversationalRetrievalChain; it returns the answer plus the relevant documents
query = "Please list related responses from the society during the time of prohibition in Miami area"
chat_history = []  # no earlier turns: this is the first question of the conversation
result = chain(
    {
        "question": query,
        "chat_history": chat_history,
    }
)

In [19]:
## display the chain's output dict
result

{'question': 'Please list related responses from the society during the time of prohibition in Miami area',
 'chat_history': [],
 'answer': '1. Efforts to Enforce Prohibition: The local authorities, including the Miami police department, invested significant efforts to enforce prohibition laws. They issued licenses and conducted inspections at bars to ensure compliance with the laws.\n\n2. Public Sentiment: Among the public, there seems to have been a mix of opinions about prohibition. Some people were supportive of the laws, as indicated by the formation of the "Greater Miami Council of Churches" and the "Laymen’s Committee of 1,000". These groups held public meetings to discuss issues such as gambling and the breakdown of law enforcement, which they saw as linked to alcohol consumption.\n\n3. Economic Impact: Prohibition appears to have had a significant economic impact. Bars and liquor stores were forced to close or face legal consequences, which may have led to job losses. At the s

In [32]:
## ask the retriever directly (no LLM) which chunks it considers relevant
ret.get_relevant_documents(
    query='what does miami beach garden theatre perform',
)

[Document(page_content='General  Admission  50c Cents\nMusic  by Frank  Novak*s  Chicago-Miami\nOrchestra\nLADIES  FREE\nMONDAY,  WEDNESDAY  and FRIDAY  NIGHTS\nHow to get there —From Miami  follow  the Dixie to 115th St. and \nturn left at big sign. Busses  leave Central  school,  N. E. Third  \nstreet,  direct  to track.\nFrom Miami  Beach:\nDe LUXE  BUS SERVICE  direct  to the track. Busses  leave cor\xad\nner of Fifth street and Washington  avenue  at 7:10 p. m.; Wash \xad\nington  avenue  and Espanola  way at 7:20.\nFREE  PARKINGPhone  Miami  Beach  2119 for Reservations\nMIAMI  BEACH  GARDEN  THEATRE\nAlton Road at 41st Street  \nNear Nautilus  Hotel\nDrive  from  Miami  over either  causeway,  turn left at Alton  Rd.\n50 cents to $1.65  Seats on sale Phone M. B. 2119\nCurtain  at 8:30 P. M.\n500 SEATS  AT FIFTY  CENTS\nWILD TO MODERN\nwild nature  to a modern  city \ndependable  Electric  service.The span from\nIs bridged  by\nPeople,  business,  trade and industry  naturally  g

In [11]:
## search a general word: fetch the 5 closest chunks together with their scores
pdf_vs.similarity_search_with_score(
    'theatre',
    k=5,
)

[(Document(page_content='OLYMPIA  THEATRE —Magnificent  in every detail. Fine sym\xad\nphony orchestra  and Sanley  Malotte  at the big wind organ. One of \nour clever  columnists  was sorta harsh  with this boy Malotte  last week  \nand personally  we can’t grow hysterical  over him either but he stops  \nthe show at every performance  and that, of course,  is really what  \ncounts.to\nCAS’A GRANDE —Up on the Dixie. Old timer Allison  aided and \nabetted  by Derby Jimmie  Hodges  serving  a salad of jazz, fun \nand dance.\nMERRICK ’S COUNTRY  CLUB —A perfect  spot in the heart of \nthe Gables. The Jovial  Jan Garber  and his crew of troubadours  fur\xad\nnish sweet music  amid gorgeous  tropical  surroundings.  Biltmore  serv \xad\nice and atmosphere.\nHOTEL  ANTILLA —Where  society  gathers  for their bridge and \ngossips  to the "Harmony ” of Joe Astoria\'s  minstrels.\nRONEY  PLAZA  POOLS —Thanks,  Newt, you sure fixed the old \nswimmin ’ hole up mighty  fine.\nCAPITOL  THEATRE —Me