In [1]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import os
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Load environment variables from a .env file before anything reads them.
load_dotenv()

# Quantized GGUF model file handed to the LlamaCpp wrapper further down.
llm_local_path = "../models/mistral-7b-openorca.Q4_0.gguf"

In [2]:
# Step-by-step prompt: the user's text replaces {question} and the answer is
# primed with a reasoning cue.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step."
)

# Wrap the raw template; "question" is its only input variable.
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [3]:
# # Callbacks support token-wise streaming
# callbacks = [StreamingStdOutCallbackHandler()]
# 
# # Verbose is required to pass to the callback manager
# llm_gpt4all = GPT4All(model=llm_local_path, callbacks=callbacks, verbose=True)

In [4]:
# llm_chain_gpt4all = LLMChain(prompt=prompt, llm=llm_gpt4all)

In [5]:
# question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
# 
# llm_chain_gpt4all.run(question)

In [3]:
from langchain.callbacks.manager import CallbackManager
from langchain.llms import LlamaCpp

# Local llama.cpp-backed LLM loaded from `llm_local_path`.
#
# - n_gpu_layers / n_batch / n_ctx control GPU offload, prompt batch size and
#   context length; the values here are hardware-specific.
# - Handlers are supplied through `callbacks` (the same argument the GPT4All
#   cell in this notebook uses) instead of the deprecated `callback_manager`
#   argument, which only forwarded to `callbacks` and emitted a warning.
# - NOTE(review): streaming is disabled while a streaming stdout handler is
#   attached, so no tokens are echoed during generation — confirm this is
#   intentional.
llm = LlamaCpp(
    model_path=llm_local_path,
    n_gpu_layers=40,
    n_batch=256,
    n_ctx=2048,
    f16_kv=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    verbose=True,
    streaming=False,
)

# llm = OpenAI(openai_api_key=os.getenv('OPENAI_API_KEY'))

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce GTX 1660 SUPER, compute capability 7.5
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ../models/mistral-7b-openorca.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32002,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     

In [4]:
# Bind the current `prompt` to the llama.cpp-backed `llm`.
llm_chain_cpp = LLMChain(
    llm=llm,
    prompt=prompt,
)

In [5]:
# Smoke-test the chain; the returned text is displayed as the cell output.
llm_chain_cpp.run(
    "Tell me which is better - to live or not to live?"
)


llama_print_timings:        load time =   300.25 ms
llama_print_timings:      sample time =   197.25 ms /   256 runs   (    0.77 ms per token,  1297.81 tokens per second)
llama_print_timings: prompt eval time =   300.18 ms /    29 tokens (   10.35 ms per token,    96.61 tokens per second)
llama_print_timings:        eval time =  8557.52 ms /   255 runs   (   33.56 ms per token,    29.80 tokens per second)
llama_print_timings:       total time =  9745.24 ms


' First of all, what does it mean to live? It means to be active in the world, to be part of life and its processes, to grow, develop, learn from mistakes and victories, to communicate with others, to love and care for those dear to us, to contribute to the well-being of society. And "not to live" implies a retreat from all this, a cessation of activity, disconnection from life, renunciation of personal and social interests, an inability or unwillingness to develop and grow.\n\nNow let\'s compare these two conditions. If we choose to live, we will be able to learn, develop, communicate, love, care, share experiences, find meaning in life, contribute to society, have a chance to realize our potential. In other words, living is an active, rich, multifaceted process that provides us with opportunities for growth and the possibility of creating value both for ourselves and for others.\n\nOn the contrary, if we choose not to live, we deprive ourselves of all these possibilities, leaving onl

In [6]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

# System instructions for document-grounded answering with citations.
# One instruction per line; the final {context} placeholder is the slot for
# the retrieved documents. The trailing empty item keeps the final newline.
system_template = "\n".join(
    [
        "Create an informative and comprehensive answer for a given question based solely on the given documents. You must only use information from the given documents.",
        "Use an unbiased and journalistic tone. Do not repeat text.",
        "Cite the documents using [Document name] notation.",
        "If multiple documents contain the answer, cite those documents like ‘as stated in [Document name 1], [Document name 2], etc.’.",
        "You must include citations in your answer.",
        "If the documents do not contain the answer to the question, say that ‘Answering is not possible given the available information.’",
        "{context}",
        "",
    ]
)
# Chat-style prompt: the system instructions first, then the user's question.
# Note: this rebinds `prompt`, replacing the PromptTemplate created earlier in
# the notebook, so cells that use `prompt` depend on execution order.
messages = []
messages.append(SystemMessagePromptTemplate.from_template(system_template))
messages.append(HumanMessagePromptTemplate.from_template("{question}"))
prompt = ChatPromptTemplate.from_messages(messages)

In [22]:
# import embeddings
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from torch import cuda


# Use the current CUDA device when one is available, otherwise fall back to CPU.
if cuda.is_available():
    device = f"cuda:{cuda.current_device()}"
else:
    device = "cpu"

# Embedding model that is later handed to the FAISS loader; both model loading
# and encoding are pinned to `device`.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/msmarco-distilbert-dot-v5",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

# Alternative embedding back-ends (disabled):
# embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
# embeddings = get_local_embeddings()

# Prebuilt FAISS index on disk, opened with `embed_model`.
# NOTE(review): presumably the index was built with this same embedding
# model — verify, since mismatched embeddings give meaningless neighbours.
# NOTE(review): load_local is understood to unpickle the stored docstore, so
# only load index files you created yourself — confirm for the installed
# LangChain version.
faiss_local_path = "../data/embedded_dataset/faiss/local_500/faiss_idx"
db = FAISS.load_local(folder_path=faiss_local_path, embeddings=embed_model)

In [23]:
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Conversation memory kept under "chat_history"; output_key selects "answer"
# as the value to store, since the chain also returns source documents.
chat_memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer",
)

# Rendering of one retrieved document: its "Source" metadata as the document
# name, followed by the page content.
document_prompt = PromptTemplate(
    input_variables=["page_content", "Source"],
    template="Document name: {Source}\nContext:\n{page_content}",
)

# Retrieval chat chain: top-3 hits from the FAISS store, the current `prompt`
# for combining documents, and the memory defined above.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm,
    db.as_retriever(search_kwargs={"k": 3}),
    memory=chat_memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={
        "document_prompt": document_prompt,
        "prompt": prompt,
    },
)

In [24]:
# Query the vector store directly (no LLM involved) to inspect raw retrieval.
db.similarity_search(
    query="What are the problems with the military standard IP specification?"
)

[Document(page_content='3.  Problems with MIL Standard IP', metadata={'Source': 'rfc963', 'Title': 'Some problems with the specification of the Military Standard Internet Protocol ', 'Updates': None, 'Obsoletes': None, 'Category': None, 'ISSN': None, 'Updated by': None, 'NIC': None, 'Obsoleted by': None, 'Related RFCs': None}),
 Document(page_content='RFC 964:  Some problems with the specification of the Military Standard Transmission Control Protocol', metadata={'Source': 'rfc964', 'Title': 'Some problems with the specification of the Military Standard Transmission Control Protocol ', 'Updates': None, 'Obsoletes': None, 'Category': None, 'ISSN': None, 'Updated by': None, 'NIC': None, 'Obsoleted by': None, 'Related RFCs': None}),
 Document(page_content='In our discussion above, we have pointed out several serious problems\n   with the Military Standard IP [MILS83a] specification which must be\n   corrected to produce a running implementation conforming to this\n   standard.  We have pr

In [30]:
# Ask the same question through the full chain, passing it explicitly under
# the chain's "question" input key.
response = conversation_chain(
    {"question": "What are the problems with the military standard IP specification?"}
)

Llama.generate: prefix-match hit

llama_print_timings:        load time =   300.25 ms
llama_print_timings:      sample time =    13.10 ms /    16 runs   (    0.82 ms per token,  1221.56 tokens per second)
llama_print_timings: prompt eval time =   965.21 ms /   256 tokens (    3.77 ms per token,   265.23 tokens per second)
llama_print_timings:        eval time =   590.89 ms /    16 runs   (   36.93 ms per token,    27.08 tokens per second)
llama_print_timings:       total time =  1632.18 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =   300.25 ms
llama_print_timings:      sample time =    86.78 ms /   111 runs   (    0.78 ms per token,  1279.16 tokens per second)
llama_print_timings: prompt eval time =   773.00 ms /   209 tokens (    3.70 ms per token,   270.38 tokens per second)
llama_print_timings:        eval time =  4115.96 ms /   110 runs   (   37.42 ms per token,    26.73 tokens per second)
llama_print_timings:       total time =  5382.24 ms


In [31]:
# Generated answer text from the chain result.
response["answer"]

'\n\nAnswer: In the context of communication, it can be crucial to provide additional information or details for clarity and understanding. As stated in [Document name: rfc8088], more elaborate or other definitions are used when discussing or explaining something. This is because, as mentioned in [Document name: rfc677], some descriptions may not be elaborated enough, potentially leading to confusion. To summarize the importance of elaboration from [Document name: rfc2911]: "To summarize:".'

In [32]:
# Documents the retriever supplied for this answer.
response["source_documents"]

[Document(page_content='more elaborate or other definitions are used.', metadata={'Source': 'rfc8088', 'Title': 'How to Write an RTP Payload Format', 'Updates': ['2736'], 'Obsoletes': None, 'Category': 'Informational', 'ISSN': [], 'Updated by': None, 'NIC': None, 'Obsoleted by': None, 'Related RFCs': None}),
 Document(page_content='elaborate than necessary.', metadata={'Source': 'rfc677', 'Title': 'Maintenance of duplicate databases ', 'Updates': None, 'Obsoletes': None, 'Category': None, 'ISSN': None, 'Updated by': None, 'NIC': [], 'Obsoleted by': None, 'Related RFCs': None}),
 Document(page_content='To summarize:', metadata={'Source': 'rfc2911', 'Title': 'Internet Printing Protocol/1.1: Model and Semantics ', 'Updates': None, 'Obsoletes': ['2566'], 'Category': 'Standards Track', 'ISSN': None, 'Updated by': ['3380', '3382', '3996', '3995', '7472'], 'NIC': None, 'Obsoleted by': ['8011'], 'Related RFCs': None})]

In [33]:
# Display the complete result mapping returned by the chain call
# (includes the question and the accumulated chat history).
response

{'question': 'What are the problems with the military standard IP specification?',
 'chat_history': [HumanMessage(content='Can you elaborate?', additional_kwargs={}, example=False),
  AIMessage(content='\nAI: The term "can you elaborate" is not explicitly defined in any of the provided documents, but it seems to be a request for more information or clarification [Document rfc677]. Asking for elaboration may help someone better understand a specific topic or concept [Document rfc8088]. If a summary needs to be provided, it can sometimes be an efficient way of obtaining the necessary understanding [Document rfc2911].', additional_kwargs={}, example=False),
  HumanMessage(content='What are the problems with the military standard IP specification?', additional_kwargs={}, example=False),
  AIMessage(content='\n\nAnswering the question, as stated in [RFC 963] and [RFC 964], there are several issues or challenges associated with the Military Standard IP specification. The challenges include l