In [15]:
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.llms import LlamaCpp
    from langchain.chains import LLMChain,RetrievalQA, ConversationalRetrievalChain
    import streamlit as st
    from langchain.callbacks.manager import CallbackManager
    from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
    from langchain.memory import ConversationBufferMemory


In [16]:
DB_FAISS_PATH = 'vectorstore/db_faiss'

In [27]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
                                       model_kwargs={'device': 'cpu'})

In [28]:
db = FAISS.load_local(DB_FAISS_PATH, embeddings)

In [29]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [30]:
llm = LlamaCpp(
        model_path="./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin",
        input={"temperature": 0.75, "max_length": 2000, "top_p": 1},
        #n_gpu_layers=n_gpu_layers,
        #n_batch=n_batch,
        n_ctx=2048,
        f16_kv=True,
        callback_manager=callback_manager,
        verbose=True)

llama.cpp: loading model from ./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 15 (mostly Q4_K - Medium)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 4289.33 MB (+ 1024.00 MB per state)
llama_new_context_with_model: kv self size  = 102

In [31]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

retriever= db.as_retriever(search_kwargs={"k": 3, "search_type": "similarity"})
    

In [32]:

chain = ConversationalRetrievalChain.from_llm(llm=llm, chain_type="stuff", retriever=retriever,return_source_documents = False, verbose = False, memory=memory)
    

In [33]:
question="what are the traits of an explainable AI system?" 

In [36]:
response = chain(({"question":question }))
print(response["answer"])   

Llama.generate: prefix-match hit


 What is the purpose of providing transparency into how an AI system arrived at a particular decision or prediction?


llama_print_timings:        load time = 10262.25 ms
llama_print_timings:      sample time =    17.32 ms /    24 runs   (    0.72 ms per token,  1385.28 tokens per second)
llama_print_timings: prompt eval time = 375661.71 ms /   298 tokens ( 1260.61 ms per token,     0.79 tokens per second)
llama_print_timings:        eval time = 754492.15 ms /    23 runs   (32804.01 ms per token,     0.03 tokens per second)
llama_print_timings:       total time = 1130450.62 ms
Llama.generate: prefix-match hit


 The purpose of providing transparency into how an AI system arrived at a particular decision or prediction is to build trust in the model by providing human-understandable rationales for its behavior. This allows humans to judge whether the explanation is justified and to understand the reasoning behind the model's predictions, ultimately leading to increased trust in the model. The purpose of providing transparency into how an AI system arrived at a particular decision or prediction is to build trust in the model by providing human-understandable rationales for its behavior. This allows humans to judge whether the explanation is justified and to understand the reasoning behind the model's predictions, ultimately leading to increased trust in the model.



llama_print_timings:        load time = 10262.25 ms
llama_print_timings:      sample time =    51.28 ms /    73 runs   (    0.70 ms per token,  1423.64 tokens per second)
llama_print_timings: prompt eval time = 1475859.21 ms /   597 tokens ( 2472.13 ms per token,     0.40 tokens per second)
llama_print_timings:        eval time = 687155.46 ms /    72 runs   ( 9543.83 ms per token,     0.10 tokens per second)
llama_print_timings:       total time = 2163777.50 ms
