In [1]:
!pip install torch bitsandbytes transformers accelerate langchain-huggingface

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install langchain langchain_community langchain_chroma langchain-core langchain-text-splitters bs4

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# For Chroma
!pip install pysqlite3-binary

Defaulting to user installation because normal site-packages is not writeable


In [4]:
from langchain.globals import set_debug

# Turn on LangChain's global debug mode for the rest of the notebook.
set_debug(value=True)

In [5]:
import torch
import transformers

def load_model(model_id, HF_TOKEN=None, quantize=False):
    """Load a causal language model and its tokenizer with ``from_pretrained``.

    Args:
        model_id: Model identifier passed to both ``from_pretrained`` calls.
        HF_TOKEN: Optional access token, forwarded as ``token=`` to both
            ``from_pretrained`` calls.
        quantize: When True, the model is loaded with a 4-bit NF4
            ``BitsAndBytesConfig`` (double quantization, bfloat16 compute) to
            reduce GPU memory use. Defaults to False, which loads the model
            exactly as before (no quantization config is passed).

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id,
        token=HF_TOKEN,
    )

    model_kwargs = {"device_map": "auto", "token": HF_TOKEN}
    if quantize:
        # Quantization settings for loading a large model with less GPU memory:
        # https://huggingface.co/blog/4bit-transformers-bitsandbytes
        # https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing
        model_kwargs["quantization_config"] = transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id,
        **model_kwargs,
    )

    return model, tokenizer

In [6]:
import os

from langchain_huggingface import HuggingFacePipeline

# Never hardcode access tokens in a notebook: read the token from the
# environment instead. When HF_TOKEN is unset this is None, which is also
# load_model's own default for its token argument.
# NOTE(review): the token that used to be written here in plain text has been
# exposed to everyone with access to this file and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'

model, tokenizer = load_model(model_id, HF_TOKEN)

# Stop generation on the tokenizer's regular EOS token and on <|eot_id|>.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
tokenizer.pad_token_id = tokenizer.eos_token_id

pipe = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,  # return only the newly generated text, not the prompt
    # model parameters, optional, added so it's easier to modify in the future
    do_sample=True,  # temperature only has an effect when sampling is enabled
    temperature=0.6,  # 'randomness' of outputs; lower is more deterministic. Try 0.8
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    # Generation settings must be passed to the pipeline to take effect;
    # assigning `pipe.eos_token_id` afterwards is never read by generate().
    eos_token_id=terminators,
    pad_token_id=tokenizer.eos_token_id,
)

llm = HuggingFacePipeline(pipeline=pipe)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

### Construct retriever ###
# Restrict HTML parsing to the elements carrying these CSS classes.
page_sections = bs4.SoupStrainer(
    class_=("journal-title", "authorGroup", "abstract", "body")
)
loader = WebBaseLoader(
    web_paths=("https://dl.acm.org/doi/fullHtml/10.1145/3624062.3624064",),
    bs_kwargs={"parse_only": page_sections},
)
docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [8]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, HumanMessagePromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma


# Use the GPU for the embedding model when CUDA is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Embedding model; see https://huggingface.co/spaces/mteb/leaderboard
model_name = "Linq-AI-Research/Linq-Embed-Mistral"
model_kwargs = dict(device=device, trust_remote_code=True, token=HF_TOKEN)
encode_kwargs = dict(normalize_embeddings=False)
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Split the loaded documents into overlapping chunks, embed them into a
# Chroma vector store and expose that store as a retriever.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(embedding=embedding_model, documents=splits)
retriever = vectorstore.as_retriever()


### Contextualize question ###
from langchain_core.prompts import PromptTemplate

# Raw Llama 3 prompt (special tokens written out by hand) that asks the model
# to rewrite a follow-up question as a standalone question.
contextualize_q_system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    Given the following conversation and a follow up question,
    rephrase the follow up question to be a standalone question, in its original language.
    Please don't include any commentary; respond with only the standalone question. 

    Let me share a couple examples.

    If you do not see any chat history, you MUST return the "Follow Up Input" as is:
    ```
    Chat History:
    Follow Up Input: How is Lawrence doing?
    Standalone Question:
    How is Lawrence doing?
    ```

    If this is the second question onwards, you should properly rephrase the question like this:
    ```
    Chat History:
    Human: How is Lawrence doing?
    Assistant:
    Lawrence is injured and out for the season.
    Follow Up Input: What was his injury?
    Standalone Question:
    What was Lawrence's injury?
    ```

    Now, with those examples, here is the actual chat history and input question.

    Chat History:
    {chat_history}

    Follow Up Input: {input}

    Standalone Question:
    [your response here]
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# `llm` is a plain text-completion LLM. A ChatPromptTemplate would be
# flattened to a string with a "System: " role prefix in front of
# <|begin_of_text|>, corrupting the hand-written Llama 3 format. A plain
# PromptTemplate sends the text exactly as written above.
contextualize_q_prompt = PromptTemplate.from_template(contextualize_q_system_prompt)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)


### Answer question ###
from langchain_core.prompts import PromptTemplate

# Raw Llama 3 prompt (special tokens written out by hand) for answering the
# user's question, optionally using the retrieved context.
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If the context is not relevant, ignore the retrieved context. Don't need to incorporate it into your response.
    Instead, please think rationally and answer from your own knowledge base.
    If the context is not relevant, you do not need to state that it's not relevant.
    If you are answering from your own knowledge base, you do not need to state that you are answering from your own knowledge base.
    Please just respond with the answer, no commentary necessary.

    Context (may NOT be relevant): {context}

    Chat History:
    {chat_history} 
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    {input}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# `llm` is a plain text-completion LLM. A ChatPromptTemplate would be
# flattened to a string with a "System: " role prefix in front of
# <|begin_of_text|>, corrupting the hand-written Llama 3 format. A plain
# PromptTemplate sends the text exactly as written above.
qa_prompt = PromptTemplate.from_template(system_prompt)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

You try to use a model that was created with version 3.0.0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

### Statefully manage chat history ###
# In-memory mapping from session id to that session's chat history object.
store = dict()


# Session-specific message storage
class SessionStorage:
    """Minimal container that accumulates messages in insertion order."""

    def __init__(self):
        # Messages are kept in a plain list, oldest first.
        self.chat_history = list()

    def add_message(self, message):
        """Append ``message`` to the end of the stored history."""
        self.chat_history += [message]

    def get_chat_history(self):
        """Return the stored history list itself (not a copy)."""
        history = self.chat_history
        return history


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``.

    The first time a session id is seen, a new empty ``ChatMessageHistory``
    is created and remembered in the module-level ``store``.
    """
    try:
        return store[session_id]
    except KeyError:
        store[session_id] = ChatMessageHistory()
        return store[session_id]


# Wrap the RAG chain so that each call loads the history for the session id
# given in the config, injects it under "chat_history", and records the new
# "input" / "answer" pair afterwards.
# The former `return_source_documents=True` argument was dropped: it is not a
# RunnableWithMessageHistory option and had no effect. The retrieved documents
# are available in the result under the "context" key.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [10]:
# First turn of the conversation; constructs a key "abc123" in `store`.
session_config = {"configurable": {"session_id": "abc123"}}
first_question = {"input": "What is Task Decomposition?"}
result = conversational_rag_chain.invoke(first_question, config=session_config)

print(result["answer"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory] Entering Chain run with input:
[0m{
  "input": "What is Task Decomposition?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history] Entering Chain run with input:
[0m{
  "input": "What is Task Decomposition?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history>] Entering Chain run with input:
[0m{
  "input": "What is Task Decomposition?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] Entering Chain run with input:
[0m{
  "input": "What is Task Decomposition?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] [0ms] Exiting Chain run with output:
[0m{
  "output": []
}
[36;1m[1;3m[chain/e

In [11]:
# Show the first retrieved document, then its text and metadata separately.
top_document = result["context"][0]
print(top_document)

print(top_document.page_content)
print(top_document.metadata)

page_content='6 Conclusion & Future Work' metadata={'source': 'https://dl.acm.org/doi/fullHtml/10.1145/3624062.3624064'}
6 Conclusion & Future Work
{'source': 'https://dl.acm.org/doi/fullHtml/10.1145/3624062.3624064'}


In [12]:
# Follow-up turn in the same session; "it" refers to the previous question.
follow_up_result = conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)
follow_up_result["answer"]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory] Entering Chain run with input:
[0m{
  "input": "What are common ways of doing it?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history] Entering Chain run with input:
[0m{
  "input": "What are common ways of doing it?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history>] Entering Chain run with input:
[0m{
  "input": "What are common ways of doing it?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] Entering Chain run with input:
[0m{
  "input": "What are common ways of doing it?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] [0ms] Exiting Chain run with output:
[0m[outputs]
[36;1

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > llm:HuggingFacePipeline] [1.68s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "What are some common methods for task decomposition?",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > parser:StrOutputParser] Entering Parser run with input:
[0m{
  "input": "What are some common methods for task decomposition?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_ch

'Task decomposition! According to the chat history, common ways of doing task decomposition in software development involve:\n\n* Breaking down a complex task or problem into smaller, more manageable sub-tasks or components\n* Identifying specific areas of a program that require improvement or optimization, such as:\n\t+ Reducing global memory accesses\n\t+ Minimizing warp divergence\n\nThese approaches aim to make the task more tractable, improve efficiency, and facilitate collaboration among team members or developers.'

In [13]:
# Ask about the loaded paper itself, still in session "abc123".
authors_result = conversational_rag_chain.invoke(
    {"input": "Who are the authors of VSCuda?"},
    config={"configurable": {"session_id": "abc123"}},
)
authors_result["answer"]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory] Entering Chain run with input:
[0m{
  "input": "Who are the authors of VSCuda?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history] Entering Chain run with input:
[0m{
  "input": "Who are the authors of VSCuda?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history>] Entering Chain run with input:
[0m{
  "input": "Who are the authors of VSCuda?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] Entering Chain run with input:
[0m{
  "input": "Who are the authors of VSCuda?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] [0ms] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chai

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > llm:HuggingFacePipeline] [1.79s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Who are the authors of VSCuda?",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > parser:StrOutputParser] Entering Parser run with input:
[0m{
  "input": "Who are the authors of VSCuda?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:

'The authors of VSCuda are Brian Chen, Nafis Mustakin, Alvin Hoang, Sakib Fuad, and Daniel Wong, who are students at the University of California, Riverside.'

In [14]:
# Follow-up in session "abc123"; "their" refers to the previous turn.
emails_result = conversational_rag_chain.invoke(
    {"input": "What are their emails?"},
    config={"configurable": {"session_id": "abc123"}},
)
emails_result["answer"]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory] Entering Chain run with input:
[0m{
  "input": "What are their emails?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history] Entering Chain run with input:
[0m{
  "input": "What are their emails?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history>] Entering Chain run with input:
[0m{
  "input": "What are their emails?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] Entering Chain run with input:
[0m{
  "input": "What are their emails?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:insert_history > chain:RunnableParallel<chat_history> > chain:load_history] [0ms] Exiting Chain run with output:
[0m[outputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWi

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[36;1m[1;3m[llm/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > llm:HuggingFacePipeline] [2.27s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "What are the emails of the authors of VSCuda?",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:retrieve_documents > chain:RunnableSequence > parser:StrOutputParser] Entering Parser run with input:
[0m{
  "input": "What are the emails of the authors of VSCuda?"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableWithMessageHistory > chain:RunnableBranch > chain:retrieval_chain > chain:Ru

'According to the given context, the emails of the authors of VSCuda are:\n\n1. Brian Chen - bchen189@ucr.edu\n2. Nafis Mustakin - nmust004@ucr.edu\n3. Alvin Hoang - ahoan055@ucr.edu\n4. Sakib Fuad - sfuad001@ucr.edu\n5. Daniel Wong - danwong@ucr.edu'

In [15]:
# Dump the session store: one chat-history object per session id.
print(store)

{'abc123': InMemoryChatMessageHistory(messages=[HumanMessage(content='What is Task Decomposition?'), AIMessage(content='Task decomposition is the process of breaking down a complex task or problem into smaller, more manageable sub-tasks or components. This can help to make the task more tractable, improve efficiency, and facilitate collaboration among team members or developers. In the context of software development, task decomposition can involve identifying specific areas of a program that require improvement or optimization, such as reducing global memory accesses or minimizing warp divergence, and then developing targeted solutions or optimizations to address those specific issues.'), HumanMessage(content='What are common ways of doing it?'), AIMessage(content='Task decomposition! According to the chat history, common ways of doing task decomposition in software development involve:\n\n* Breaking down a complex task or problem into smaller, more manageable sub-tasks or components\