In [138]:
import os
import openai
import sys
import glob
import tiktoken
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

# Hand the API key to the OpenAI client module. The key is read from the
# environment after load_dotenv() has run; indexing os.environ raises KeyError
# when OPENAI_API_KEY is missing, so a misconfigured environment fails here
# instead of on the first API call.
openai.api_key = os.environ['OPENAI_API_KEY']

In [298]:
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [179]:
glob.glob("data/*")

['data/silverman-openai-complaint.pdf',
 'data/doc-5.txt',
 'data/The_Effect_of_Student_Teacher_Ratio_on_Truancy.pdf',
 'data/doc-3.txt',
 'data/Question_Generation.pdf',
 'data/doc-2.txt',
 'data/state_of_the_union.txt',
 'data/fec_2016_EDA.v2.pdf',
 'data/doc-4.txt',
 'data/2023-08-01_Trump_Indictment.pdf',
 'data/exploring-ggplot.pdf',
 'data/doc-6.txt',
 'data/doc-1.txt']

In [182]:
# One loader per file under data/: PDFs first (PyPDFLoader), then plain-text
# files (TextLoader), each group in the order glob returns it.
loaders = [
    *map(PyPDFLoader, glob.glob("data/*.pdf")),
    *map(TextLoader, glob.glob("data/*.txt")),
]

In [195]:
loaders

[<langchain.document_loaders.pdf.PyPDFLoader at 0x7f973c26c400>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f9730b4ebf0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f9724e527d0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f9748357fa0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f971ad53f40>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f971ad53550>,
 <langchain.document_loaders.text.TextLoader at 0x7f9730b4f850>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad52c50>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad52d70>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad52770>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad529b0>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad53100>,
 <langchain.document_loaders.text.TextLoader at 0x7f971ad52fe0>]

In [199]:
# Run every loader and collect everything they return into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [201]:
len(docs)

123

In [186]:
# Splitter configuration: chunks of size 1500 with an overlap of 150 (10% of the
# chunk size) between neighbouring chunks.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [187]:
splits = text_splitter.split_documents(docs)

In [188]:
len(splits)

257

In [148]:
embedding = OpenAIEmbeddings()

In [216]:
# Probe sentences for comparing embeddings: 1-3 are similarly worded positive
# reviews, 4 is a negative review, 5 reuses the word "garbage" in an unrelated
# sense, and 6 is a colloquial remark about a show.
sentence1 = "That movie was great"
sentence2 = "That movie was amazing"
sentence3 = "That film was awesome"
sentence4 = "That movie was garbage"
sentence5 = "Take out the garbage"
sentence6 = "That show was bananas"

In [217]:
# Embed the six probe sentences in order, one embed_query call per sentence,
# binding each result to the matching embeddingN name.
(
    embedding1,
    embedding2,
    embedding3,
    embedding4,
    embedding5,
    embedding6,
) = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3, sentence4, sentence5, sentence6),
)

In [218]:
import numpy as np

In [221]:
# Dot product of the embeddings for "Take out the garbage" (sentence5) and
# "That show was bananas" (sentence6).
# NOTE(review): this equals cosine similarity only if the embedding vectors are
# unit-length — confirm for this embedding model.
np.dot(embedding5, embedding6)

0.744506099373989

In [149]:
persist_directory = 'chroma/'

In [222]:
# Remove old database files, if any, before the vector store is built.
# The path comes from persist_directory rather than a second hard-coded copy,
# and the removal is done in Python so the cell does not depend on a POSIX shell.
# ignore_errors=True keeps the "no error if the directory is absent" behaviour
# of `rm -rf`.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [223]:
# Build a Chroma vector store from the chunks in `splits`, using `embedding` to
# vectorise them and `persist_directory` as its storage location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [240]:
# Sanity check: number of entries in the store; the recorded output (257)
# matches the recorded len(splits).
# NOTE(review): `_collection` is underscore-prefixed (non-public), so this may
# break across library versions.
print(vectordb._collection.count())

257


In [302]:
question = "What unlawful things did Donald Trump supposedly do?"

In [260]:
# Ask the vector store for the 5 stored chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which earlier held the loaded source
# documents; re-running the splitting cell after this one would split these
# 5 hits instead of the full corpus.
docs = vectordb.similarity_search(question,k=5)

In [261]:
len(docs)

5

In [262]:
docs[4].page_content

'Please rise if you are able and show that, Yes, we the United States of America stand with the Ukrainian people. \n\nThroughout our history we’ve learned this lesson when dictators do not pay a price for their aggression they cause more chaos.   \n\nThey keep moving.   \n\nAnd the costs and the threats to America and the world keep rising.   \n\nThat’s why the NATO Alliance was created to secure peace and stability in Europe after World War 2. \n\nThe United States is a member along with 29 other nations. \n\nIt matters. American diplomacy matters. American resolve matters. \n\nPutin’s latest attack on Ukraine was premeditated and unprovoked. \n\nHe rejected repeated efforts at diplomacy. \n\nHe thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready.  Here is what we did.   \n\nWe prepared extensively and carefully. \n\nWe spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asi

In [263]:
vectordb.persist()

In [264]:
for doc in docs:
    print(doc.metadata)

{'source': 'data/doc-3.txt'}
{'source': 'data/doc-3.txt'}
{'source': 'data/doc-6.txt'}
{'source': 'data/doc-3.txt'}
{'source': 'data/state_of_the_union.txt'}


In [265]:
# Same question, this time via max-marginal-relevance search with k=5 results
# requested and fetch_k=10.
# NOTE(review): fetch_k is presumably the size of the candidate pool the 5
# results are selected from — confirm against the library docs.
docs = vectordb.max_marginal_relevance_search(question,k=5, fetch_k=10)

In [266]:
for doc in docs:
    print(doc.metadata)

{'source': 'data/doc-3.txt'}
{'source': 'data/doc-6.txt'}
{'source': 'data/state_of_the_union.txt'}
{'source': 'data/doc-4.txt'}
{'source': 'data/state_of_the_union.txt'}


In [270]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document N:`` (numbering starts at 1), a blank
    line, then the document's ``page_content``. Consecutive entries are
    separated by a line of 100 dashes. Everything is written with a single
    ``print`` call, so an empty ``docs`` produces just a newline.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + document.page_content)
    print(divider.join(sections))

In [295]:
# Create the chat model (temperature 0) and build an LLMChainExtractor from it;
# the extractor is passed as the compressor to the retriever defined in the
# next cell, which is where the vector store actually gets wrapped.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [296]:
# Wrap the vector store: its retriever (search_type "mmr") is the base
# retriever, and `compressor` is attached as the base compressor.
# NOTE(review): presumably the compressor is applied to whatever the base
# retriever returns — confirm against the library docs.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")
)

In [297]:
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

Don Quixote and Sancho, mounted on a donkey, set out. In their first adventure, Don Quixote mistakes a field of windmills for giants and attempts to fight them but finally concludes that a magician must have turned the giants into windmills. He later attacks a group of monks, thinking that they have imprisoned a princess, and also does battle with a herd of sheep, among other adventures, almost all of which end with Don Quixote, Sancho, or both being beaten. Eventually, Don Quixote acquires a metal washbasin from a barber, which he believes is a helmet once worn by a famous knight, and he later frees a group of convicted criminals.
----------------------------------------------------------------------------------------------------
Document 2:

Don Quixote


In [299]:
# Build a retrieval QA chain from `llm` and the vector store's default
# retriever (no compression, default prompt). The chain is only constructed
# here; it is invoked in a later cell.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [303]:
result = qa_chain({"query": question})

In [304]:
result["result"]

"According to the information provided, Donald Trump has been charged with several unlawful actions. These include altering, destroying, mutilating, or concealing objects and documents with classified markings, willful retention of national defense information, and conspiracy to obstruct justice. He has also been accused of instructing an employee to delete Mar-a-Lago security camera footage to prevent it from being turned over to a federal grand jury. It's important to note that these are allegations, and Trump has pleaded not guilty and denied any wrongdoing."

In [305]:
from langchain.prompts import PromptTemplate

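# Build the prompt template; it has {context} and {question} placeholders and is passed to the QA chain via chain_type_kwargs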
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [306]:
# Rebuild the QA chain (replacing the earlier `qa_chain`) with the custom
# prompt. return_source_documents=True makes the result also carry the
# retrieved documents, read later as result["source_documents"].
# The chain is only constructed here; it is run in the following cell.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [307]:
result = qa_chain({"query": question})

In [308]:
result["result"]

'Donald Trump is accused of altering, destroying, mutilating, or concealing an object and corruptly altering, destroying, mutilating, or concealing a document, record, or other object. He is also charged with willful retention of national defense information. Thanks for asking!'

In [309]:
result["source_documents"][0]

Document(page_content='POLITICS \nTrump hit with new charges as special counsel expands Mar-a-Lago documents case\nBY ROBERT LEGARE, MELISSA QUINN, KATHRYN WATSON\n\nUPDATED ON: JULY 28, 2023 / 3:02 AM / CBS NEWS\n\nWashington — Prosecutors with special counsel Jack Smith\'s office have added new charges against former President Donald Trump in the case involving documents with classified markings discovered at this Florida resort of Mar-a-Lago, according to court papers filed in federal court Thursday evening.\n\nA superseding indictment unsealed by the Justice Department lists multiple new counts against Trump, including: altering, destroying, mutilating, or concealing an object; and corruptly altering, destroying, mutilating or concealing a document, record or other object; and an additional charge of willful retention of national defense information.\n\nTrump was previously charged with 37 felony counts, including 31 counts of willful retention of classified documents and one count