In [2]:
# LangChain building blocks for loading Wikipedia pages, splitting text and embedding it.
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment;
# OPENAI_API_KEY is read from os.environ in a later cell.
load_dotenv()
import os, warnings, logging
# Suppress all Python warnings from here on. This runs before the remaining imports,
# so it also applies to anything they would warn about at import time.
warnings.filterwarnings('ignore')
# Vector store, multi-query retriever and chat model used in the retrieval cells below.
from langchain.vectorstores import Chroma
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI

In [3]:
# OpenAI credential taken from the environment (never hard-coded in the notebook).
# Indexing os.environ raises KeyError right here if the variable is not set.
api_key = os.environ["OPENAI_API_KEY"]

In [6]:
# Loader configured with the Wikipedia search query for this demo.
loader = WikipediaLoader(
    query="MKUltra",
)

In [8]:
# Run the loader and keep the resulting documents (presumably a network fetch from Wikipedia).
documents = loader.load()

In [9]:
# Number of documents returned by the loader (24 in the recorded run).
len(documents)

24

In [10]:
# Split the loaded documents into chunks with a target size of 500
# (presumably measured in tiktoken tokens, given the constructor used).
# The target is not a hard cap: the recorded output below reports chunks of
# 538, 525 and 694 that exceed it.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
)
docs = text_splitter.split_documents(documents)

Created a chunk of size 538, which is longer than the specified 500
Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 694, which is longer than the specified 500


In [11]:
# Number of chunks produced by the splitter (53 in the recorded run).
len(docs)

53

In [12]:
# OpenAI embedding client; handed to Chroma to vectorize the chunks.
embedding = OpenAIEmbeddings(
    api_key=api_key,
)

In [13]:
# Build the Chroma vector store once and reuse it on later runs.
#
# Chroma.from_documents() embeds and *adds* every chunk each time it is called, so
# re-running this cell against an already-populated persist directory would store the
# same chunks again (duplicate search hits) and pay for the embeddings a second time.
# Guarding on the directory keeps the cell idempotent under "Restart & Run All".
PERSIST_DIRECTORY = "./some_new_mkultra"

if os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
    # A populated store already exists: reopen it instead of re-embedding.
    # NOTE: the existing store is trusted as-is. Delete the directory to force a
    # rebuild after changing the source documents or the chunking parameters.
    db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding)
else:
    # First run (or empty directory): embed the chunks and create the store.
    db = Chroma.from_documents(docs, embedding=embedding, persist_directory=PERSIST_DIRECTORY)

In [14]:
# Write the vector store to its persist directory.
# NOTE(review): newer Chroma releases reportedly persist automatically and deprecate
# this call — confirm against the installed version.
db.persist()

In [15]:
# Chat model that the multi-query retriever uses to rephrase the question.
# temperature=0 keeps the generated rephrasings as deterministic as the API allows.
chat_model = ChatOpenAI(
    temperature=0,
    api_key=api_key,
)

In [16]:
# Question passed to the retriever below.
# NOTE(review): "this" has no antecedent in the question itself; the generated queries
# in the logged output further down are correspondingly generic and never name MKUltra.
# Consider naming the subject explicitly if retrieval quality matters.
question = "When was this declassified?"

In [26]:
# Wrap the vector store's retriever so the chat model can generate several
# variants of a question before the store is searched.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=chat_model,
    retriever=db.as_retriever(),
)

In [28]:
# Make the multi-query retriever's INFO records visible, so the queries it
# generates show up in the cell output.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [30]:
# Retrieval only: no answer to the question is generated here.
# The call returns the stored documents judged most similar to the question
# (presumably de-duplicated across the generated query variants, hence the name).
unique_docs = retriever_from_llm.get_relevant_documents(
    question,
)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the date of the declassification of this information?', '2. Can you provide the specific time when this was declassified?', '3. Do you have the exact date when this information became declassified?']


In [32]:
# Number of documents returned by the multi-query retriever (5 in the recorded run).
len(unique_docs)

5

In [34]:
# Show the text of the first retrieved document (raises IndexError if nothing was retrieved).
print(unique_docs[0].page_content)

== Background ==
By the early years of the 1970s, a series of troubling revelations had appeared in the press concerning intelligence activities. First came the revelations by Army intelligence officer Christopher Pyle in January 1970 of the US Army's spying on the civilian population and Senator Sam Ervin's Senate investigations produced more revelations. Then on December 22, 1974, The New York Times published a lengthy article by Seymour Hersh detailing operations engaged in by the CIA over the years that had been dubbed the "family jewels". Covert action programs involving assassination attempts on foreign leaders and covert attempts to subvert foreign governments were reported for the first time. In addition, the article discussed efforts by intelligence agencies to collect information on the political activities of US citizens.
The creation of the Church Committee was approved on January 27, 1975, by a vote of 82 to 4 in the Senate.


== Overview ==
The Church Committee's final re