In [1]:
%pip install --upgrade  langchain langchain-community langchainhub gpt4all chromadb bs4

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import OllamaEmbeddings, GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp, Ollama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnablePick

Note: you may need to restart the kernel to use updated packages.


In [2]:
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
# print("loader downloaded")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

loader downloaded


In [8]:

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OllamaEmbeddings())

question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
len(docs)
docs[0]

Document(page_content='Sensory Memory: This is the earliest stage of memory, providing the ability to retain impressions of sensory information (visual, auditory, etc) after the original stimuli have ended. Sensory memory typically only lasts for up to a few seconds. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Auton

In [3]:
# %pip install --upgrade --quiet  llama-cpp-python
!CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 /Users/rlm/miniforge3/envs/llama/bin/pip install -U llama-cpp-python --no-cache-dir

/bin/bash: /Users/rlm/miniforge3/envs/llama/bin/pip: No such file or directory


In [4]:
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
llm = Ollama(model="llama2")
llm.invoke("Simulate a rap battle between Stephen Colbert and John Oliver")

'(The scene is set in a dimly lit underground hip-hop club. The crowd is packed tightly together, cheering and holding up their phones to record the impending rap battle. Stephen Colbert and John Oliver stand facing each other on opposite sides of the stage, both dressed in their best "I\'m a comedian, not a rapper" outfits. The emcee introduces them.)\n\nEmcee: And now, folks, it\'s time for the main event! The one and only Stephen Colbert versus the king of British comedy, John Oliver! Let\'s see who comes out on top in this epic rap battle! (cheers and applause)\n\nStephen Colbert: (clears throat) Yo, John, I heard you\'ve been talking smack about my rhymes. (grinning) Well, let me tell you something, buddy, I may not be the most politically correct guy in the room, but when it comes to rapping, I\'m the real deal! (flexing his muscles for effect)\n\nJohn Oliver: (smirking) Oh, Stephen, you\'re cute. But let me tell you something, my man, I may not look like a typical rapper, but I 

In [11]:
# Prompt
prompt = PromptTemplate.from_template(
    "Summarize the main themes in these retrieved docs: {docs}"
)

# Chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


chain = {"docs": format_docs} | prompt | llm | StrOutputParser()

# Run
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)

print(type(docs))
chain.invoke(docs)

'The main themes in these retrieved documents are:\n\n1. Sensory Memory: This theme is focused on the earliest stage of memory, which allows individuals to retain impressions of sensory information (visual, auditory, etc.) after the original stimuli have ended. Subcategories include iconic memory (visual), echoic memory (auditory), and haptic memory (touch).\n2. Tool Use: This theme encompasses the use of tools to augment large language models, such as ChemCrow, which allows for chemistry tools to be used with these models, and Scientific Discovery Agent, a generative agent simulation that demonstrates the potential for large language models to perform scientific discovery tasks. Other proof-of-concept examples include Generative Agents Simulation and Emergent Autonomous Scientific Research Capabilities of Large Language Models.\n3. Component Three: Tool Use - Case Studies, Challenges, Citation, and References. This section provides detailed information on the tool use aspect of the ar

In [5]:
from langchain import hub

rag_prompt = hub.pull("rlm/rag-prompt")
rag_prompt.messages

# Chain
chain = (
    RunnablePassthrough.assign(context=RunnablePick("context") | format_docs)
    | rag_prompt
    | llm
    | StrOutputParser()
)

# Run
chain.invoke({"context": docs, "question": question})

NameError: name 'RunnablePassthrough' is not defined

In [7]:
# Prompt
rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")
rag_prompt_llama.messages

# Chain
chain = (
    RunnablePassthrough.assign(context=RunnablePick("context") | format_docs)
    | rag_prompt_llama
    | llm
    | StrOutputParser()
)

# Run
print(question)
chain.invoke({"context": docs, "question": question})

NameError: name 'RunnablePassthrough' is not defined

In [6]:
retriever = vectorstore.as_retriever()
qa_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)

question = "WHy did DUmbdledore die ?"
print(question)
qa_chain.invoke(question)

NameError: name 'vectorstore' is not defined

In [26]:
# Read text files
from langchain_community.document_loaders import FileLoader

loader = FileLoader("../data/documents/")

WHy did DUmbdledore die ?


"I'm just an AI and do not have access to the personal information or circumstances of Professor Dumbdledore, so I cannot provide a definitive answer to why he died. However, based on the context provided, it is possible that Professor Dumbdledore died due to natural causes such as old age or illness, or possibly an accident or injury related to his work in the wizarding world. Without more information, I cannot provide a conclusive explanation for his death."

In [19]:
import os

from langchain_community.document_loaders import TextLoader

BASE_DIR = '/home/raj/anlp/cmu-rag/data/documents/combined_txt_files_length_normalized/'


docs = []
for file in os.listdir(BASE_DIR):
    # if file.endswith(".txt"):
    print(file)
    if 'schedule'in file:
        continue
    loader = TextLoader(BASE_DIR + file)
    docs.extend(loader.load())
    break

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=2)
all_splits = text_splitter.split_documents(docs)

vectorstore = Chroma.from_documents(documents=all_splits, embedding=OllamaEmbeddings())

schedules_part_85000
lti_programs_part_0


In [15]:
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

prompt = PromptTemplate.from_template(
    "Answer questions on CMU from these documents"
)

llm = Ollama(model="llama2")

qa_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [16]:
questions = ['Which course has course number of 24492 ?', ]

qa_chain.invoke(questions)

"\nSure, I'd be happy to help! Can you provide the documents related to Carnegie Mellon University (CMU)? Once I have access to them, I can answer any questions you may have based on the information provided in the documents."

In [20]:
question = questions[0]
docs = vectorstore.similarity_search(question)
len(docs)
docs[0]

chain = {"docs": format_docs} | prompt | llm | StrOutputParser()

# Run
question = "What are the approaches to Task Decomposition?"
doc = vectorstore.similarity_search(question)

print(doc)

# print(type(doc))
chain.invoke(doc)

[Document(page_content='Semester: Fall 2023\nCategory: Modern Languages\nCourse: 82789\nTitle: Guided Research\nUnits: 3-36\nLec/Sec: G\nDays: TBA\nBegin: \nEnd: \nBldg/Room: DNM DNM\nLocation: Pittsburgh, Pennsylvania\nInstructor(s): Instructor TBA\n\nSemester: Fall 2023\nCategory: Modern Languages\nCourse: 82789\nTitle: Guided Research\nUnits: 3-36\nLec/Sec: H\nDays: TBA\nBegin: \nEnd: \nBldg/Room: DNM DNM\nLocation: Pittsburgh, Pennsylvania\nInstructor(s): Instructor TBA', metadata={'source': '/home/raj/anlp/cmu-rag/data/documents/combined_txt_files_length_normalized/schedules_part_85000'}), Document(page_content='Semester: Fall 2023\nCategory: Modern Languages\nCourse: 82599\nTitle: Russian Studies Thesis\nUnits: 3-6\nLec/Sec: A\nDays: TBA\nBegin: \nEnd: \nBldg/Room: DNM DNM\nLocation: Pittsburgh, Pennsylvania\nInstructor(s): Instructor TBA\n\nSemester: Fall 2023\nCategory: Modern Languages\nCourse: 82599\nTitle: Russian Studies Thesis\nUnits: 3-6\nLec/Sec: B\nDays: TBA\nBegin: \nE

"\nI'm happy to help you with your questions about Carnegie Mellon University (CMU) based on the provided documents. Please go ahead and ask your questions, and I'll do my best to provide helpful answers."