In [2]:
import os
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS

from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
import llama_cpp
import time


In [3]:
custom_prompt_template = """ Use the following pieces of information to answer the user's question.
If you don't know the answer, please just say that you don't know the answer, don't try to make up
an answer. 

Context : {context}
Question : {question}

The answer should consist of at least 1 sentence for short questions or 7 sentences for more detailed qeustions. Only returns the helpful and reasonable answer below and nothing else.
No need to return the question. I just want answer. Please don't show unhelpful answers.
Helpful answer:
"""

In [4]:
def set_custom_prompt(custom_prompt_template):
    prompt = PromptTemplate(template=custom_prompt_template, input_variables=['context',
                                                                              'question'])
    return prompt

In [5]:
def load_llm():
    n_gpu_layers = 32  # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm = LlamaCpp(
        model_path="/home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin",
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        callback_manager=callback_manager,
        verbose=True,n_ctx = 4096, temperature = 0.1, max_tokens = 4096, 
    )
    return llm


def load_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = "thenlper/gte-base",
                                       model_kwargs = {'device': 'cpu'})
    return embeddings

In [6]:
def check_duplicate(source_list):
    res = []
    for i in source_list:
        if i not in res:
            res.append(i)
    return res

def convert_to_website_format(urls):
    convert_urls = []
    for url in urls:
        # Remove any '.html' at the end of the URL
        url = re.sub(r'\.html$', '', url)
        # Check if the URL starts with 'www.' or 'http://'
        if not re.match(r'(www\.|http://)', url):
            url = 'https://' + url
        if '/index' in url:
            url = url.split('/index')[0]
        match = re.match(r'^([^ ]+)', url)
        if match:
            url = match.group(1)
        convert_urls.append(url)
    return convert_urls

def regex_source(answer):
    pattern = r"'source': '(.*?)'"
    matchs = re.findall(pattern, str(answer))
    convert_urls = convert_to_website_format(matchs)
    res_urls = check_duplicate(source_list=convert_urls)
    #res_urls = filter_similar_url(res_urls)
    return res_urls

def filter_similar_url(urls):
    urls_remove = ["www.omniscien.com/aboutus/company","www.omniscien.com/lsev6/asr/automatic-speech-recognition-overview", "www.omniscien.com/lsev6/features/asr/autonomous-speech-recognition-overview","www.omniscien.com/lsev6/asr"]
    # Remove the URL from the list
    filtered_urls = [url for url in urls if url not in  urls_remove]
    return filtered_urls

In [7]:
def filter_search(db_similarity, diff_val):
    filter_list = []
    top_score = db_similarity[0][1]
    for index, score in enumerate(db_similarity) :
        if score[1] - top_score <= diff_val:
              filter_list.append(score)
    return filter_list  

In [9]:
DB_FAISS_PATH = "/home/sira/sira_project/meta-Llama2/vectorstores_clean_doc_gte-base/db_faiss"
embeddings = load_embeddings()
db = FAISS.load_local(DB_FAISS_PATH, embeddings)
llm = load_llm()
qa_prompt = set_custom_prompt(custom_prompt_template)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, input_key="query", output_key="result")
qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = db.as_retriever(search_kwargs = {'k':3}), 
    return_source_documents = True,
    memory = memory,
    chain_type_kwargs = {"prompt":qa_prompt}) 


# diff_val = st.slider(label ='Select a diff value',
#                    min_value = 0.00, 
#                    max_value = 1.00, 
#                    step = 0.01, value = 0.01, format = "%f")

  from .autonotebook import tqdm as notebook_tqdm
llama.cpp: loading model from /home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7354.73 MB (+ 2048.0

In [9]:
history_log = []
query = "Who is Dion Wiggins"
start = time.time()
#db_similarity = db.similarity_search_with_score(query, k=10)
#filter_list = filter_search(db_similarity, diff_val)
response = qa_chain({'query': query})
print(response["result"])
urls = regex_source(response)
for count, url in enumerate(urls):
    print(str(count+1)+":", url)
end = time.time()
print("Respone time:",int(end-start),"sec")
test = str(llama_cpp.llama_print_timings(ctx))
print(llama_cpp.llama_print_timings(ctx))
history_log.append(memory.load_memory_variables({})["chat_history"])

Dion Wiggins is a highly experienced ICT industry visionary, entrepreneur, analyst, and consultant. He has an impressive knowledge in the fields of software development, architecture, and management, as well as an in-depth understanding of Asian ICT markets. As the Chief Technology Officer and Co-Founder of Omniscien, he has advised literally hundreds of enterprises on their ICT strategy

: 

: 

In [25]:
test

'None'

In [11]:
import llama_cpp
ctx = llm.client.ctx 
str(llama_cpp.llama_print_timings(ctx))


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 35761.76 ms


'None'

In [13]:
import sys
original_stdout = sys.stdout # Save a reference to the original standard output
with open('filename.txt', 'w') as f:
    sys.stdout = f # Change the standard output to the file we created.
    print(llama_cpp.llama_print_timings(ctx))
    sys.stdout = original_stdout # Reset the standard output to its original value


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 369650.76 ms


In [15]:
import sys

print('This message will be displayed on the screen.')

with open('filename.txt', 'w') as f:
    print(llama_cpp.llama_print_timings(ctx), file=f)

This message will be displayed on the screen.



llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 457634.76 ms


In [22]:
from io import StringIO # Python3 use: from io import StringIO
import sys

old_stdout = sys.stdout
sys.stdout = mystdout = StringIO()

print("show me ")

sys.stdout = old_stdout

In [23]:
mystdout.getvalue()

'show me \n'

In [26]:
import io
from contextlib import redirect_stdout

with io.StringIO() as buf, redirect_stdout(buf):
    print(llama_cpp.llama_print_timings(ctx))
    output = buf.getvalue()


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 720718.76 ms


In [30]:
from io import StringIO
import sys


buffer = StringIO()
sys.stdout = buffer

print(llama_cpp.llama_print_timings(ctx))
print_output = buffer.getvalue()

# 👇️ restore stdout to default for print()
sys.stdout = sys.__stdout__

# 👇️ -> This will be stored in the print_output variable
print('->', print_output)



llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 918551.76 ms


-> None

INFO:root:hello world
ERROR:root:you will see this
CRITICAL:root:critical is logged too!

INFO:root:hello world
ERROR:root:you will see this
CRITICAL:root:critical is logged too!

None
Command output :  b'\xe0\xb8\xad. 29 \xe0\xb8\xaa.\xe0\xb8\x84. 2566 17:54:40 +07\n'
Command exit status/return code :  0
None
None
GeeksforGeeks
None

something
something
something
None


[1m> Entering new AgentExecutor chain...[0m
 I should search for information about this person.
Action: [searx_search] "Dion Wiggins"
Action Input: "Dion Wiggins news"[32;1m[1;3m I should search for information about this person.
Action: [searx_search] "Dion Wiggins"
Action Input: "Dion Wiggins news"[0m
Observation: [searx_search] "Dion Wiggins" is not a valid tool, try another one.
Thought: I will try Google instead.
Action: [Google search] "Dion Wiggins"
Action Input: "Dion Wiggins biography"[32;1m[1;3m I will try Google instead.
Action: [Google search] "Dion Wiggins"
Action Input: "Dion Wiggins biogr

In [32]:
print(log_stream.getvalue())

In [84]:
import llama_cpp
llama_cpp.llama_print_system_info()
llama_cpp.llama_reset_timings(ctx)

In [48]:
test = str(llama_cpp.llama_print_timings(ctx))


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 1817888.76 ms


In [49]:
print(test)

In [53]:
# llm("Some example phrase")
test = str(llama_cpp.llama_print_timings(ctx))


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 56496055.76 ms


In [51]:
test

'None'

In [86]:
import sys
 
def print_to_stderr():
    print(llama_cpp.llama_print_timings(ctx), file=sys.stderr)
 
test = print_to_stderr()

None

llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 64420314.76 ms


In [88]:
print(test)

In [65]:
# Assuming you have loaded the library and set up _lib

# Store the original stderr so we can restore it later
original_stderr = sys.stderr

# Create a StringIO object to capture the printed output
output_buffer = io.StringIO()

# Redirect the stderr to the buffer
sys.stderr = output_buffer

# Call the function to print system info (which will be captured)
llama_cpp.llama_print_timings(ctx)

# Restore the original stderr
sys.stderr = original_stderr

# Get the captured output as a string
output_string = output_buffer.getvalue()

# Close the StringIO buffer
output_buffer.close()

# Now you can use the output_string as needed
print(output_string)


llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 57447777.76 ms


In [66]:
output_string

''

In [71]:
import logging
from tqdm import trange
from tqdm.contrib.logging import logging_redirect_tqdm

LOG = logging.getLogger(__name__)

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    with logging_redirect_tqdm():
        test= llama_cpp.llama_print_timings(ctx)
        LOG.info("console logging redirected to `tqdm.write()`")
    # logging restored


console logging redirected to `tqdm.write()`
llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 63393832.76 ms


In [80]:
%%capture
print('hi, stdout')
print('hi, stderr', file=sys.stderr)

In [82]:
%%capture cap --no-stderr
print(llama_cpp.llama_print_timings(ctx))
print("hello, stderr", file=sys.stderr)

hello, stderr

llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 64312577.76 ms


In [85]:
cap.outputs

[]

In [79]:
%%capture cap --no-stderr
def print_something():
    print(llama_cpp.llama_print_timings(ctx))
print_something()

with open('filename.txt', "w") as f:
    f.write(cap.stdout)



llama_print_timings:        load time =   406.28 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 63997717.76 ms


In [16]:
import io
import sys

# Create a StringIO object to capture the printed output
output_buffer = io.StringIO()

# Store the original stdout so we can restore it later
original_stdout = sys.stdout

# Redirect the standard output to the buffer
sys.stdout = output_buffer

# Call the function to print timings (which will be captured)
llama_cpp.llama_print_timings(ctx)

# Restore the original stdout
sys.stdout = original_stdout

# Get the captured output as a string
output_string = output_buffer.getvalue()

# Close the StringIO buffer
output_buffer.close()

# Now you can use the output_string as needed
print(output_string)







llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 319119.90 ms


In [17]:
output_string

''

In [22]:
from subprocess import Popen, PIPE
p = Popen(llama_cpp.llama_print_timings(ctx), stdout=PIPE)


llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 570341.90 ms


TypeError: 'NoneType' object is not iterable

In [None]:
import subprocess
proc = subprocess.Popen(llama_cpp.llama_print_timings(ctx)],
    stdout=subprocess.PIPE)
out = proc.communicate()[0]

In [43]:
r1 = hasattr(llama_cpp.llama_print_timings(ctx), "name")


llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 2510018.90 ms


In [46]:
llama_cpp.llama_timings._fields_

[('t_start_ms', ctypes.c_double),
 ('t_end_ms', ctypes.c_double),
 ('t_load_ms', ctypes.c_double),
 ('t_sample_ms', ctypes.c_double),
 ('t_p_eval_ms', ctypes.c_double),
 ('t_eval_ms', ctypes.c_double),
 ('n_sample', ctypes.c_int),
 ('n_p_eval', ctypes.c_int),
 ('n_eval', ctypes.c_int)]

In [50]:
llama_cpp.llama_get_timings(ctx)

<llama_cpp.llama_cpp.llama_timings at 0x7f8f140d5040>

In [26]:
printif(llama_cpp.llama_print_timings(ctx))

NameError: name 'printif' is not defined

In [25]:
getattr(llama_cpp.llama_print_timings(ctx), "name")


llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 759534.90 ms


AttributeError: 'NoneType' object has no attribute 'name'

In [21]:
for line in iter(llama_cpp.llama_print_timings(ctx),""):
     print(line)


llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 468274.90 ms


TypeError: iter(v, w): v must be callable

In [27]:
import contextlib
import io 

captured_output = io.StringIO()

with contextlib.redirect_stdout(captured_output):
    llama_cpp.llama_print_timings(ctx)

captured_string = captured_output.getvalue()
#print(captured_string)
    


llama_print_timings:        load time = 55448.79 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 1436956.90 ms


In [28]:
captured_string

''

In [55]:
str(llama_cpp.llama_print_timings(ctx))


llama_print_timings:        load time = 139395.49 ms
llama_print_timings:      sample time =    51.38 ms /    92 runs   (    0.56 ms per token,  1790.65 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 30758.58 ms /    92 runs   (  334.33 ms per token,     2.99 tokens per second)
llama_print_timings:       total time = 1184752.14 ms


'None'

In [47]:
help(pow)

Help on built-in function pow in module builtins:

pow(base, exp, mod=None)
    Equivalent to base**exp with 2 arguments or base**exp % mod with 3 arguments
    
    Some types, such as ints, are able to use a more efficient algorithm when
    invoked using the three argument form.



In [60]:
def llama_print_timings(ctx: llama_context_p):
    _lib.llama_print_timings(ctx)


_lib.llama_print_timings.argtypes = [ctx]
_lib.llama_print_timings.restype = str

NameError: name 'llama_context_p' is not defined

In [62]:
llama_cpp.llama_print_timings(ctx)


llama_print_timings:        load time = 139395.49 ms
llama_print_timings:      sample time =    51.38 ms /    92 runs   (    0.56 ms per token,  1790.65 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 30758.58 ms /    92 runs   (  334.33 ms per token,     2.99 tokens per second)
llama_print_timings:       total time = 1490809.14 ms


AttributeError: 'NoneType' object has no attribute 'restype'

In [67]:
v1 = llama_cpp.llama_print_system_info()

In [68]:
v1

b'AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | '

In [66]:
llama_cpp.llama_n_ctx(ctx)

4096

In [69]:
llama_cpp.llama_print_timings.restype = c_char_p

In [70]:
t1 = llama_cpp.llama_print_timings(ctx)


llama_print_timings:        load time = 139395.49 ms
llama_print_timings:      sample time =    51.38 ms /    92 runs   (    0.56 ms per token,  1790.65 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 30758.58 ms /    92 runs   (  334.33 ms per token,     2.99 tokens per second)
llama_print_timings:       total time = 2445976.14 ms


In [42]:
llama_cpp.llama_time_us()

2756692070

In [40]:
llama_cpp.llama_get_timings(ctx)

<llama_cpp.llama_cpp.llama_timings at 0x7f8f237e0340>

In [71]:
t1

In [29]:
import os
a  = os.popen('pwd').readlines()

In [30]:
a

['/home/sira/sira_project/meta-Llama2\n']

In [31]:
from subprocess import run
output = run("pwd", capture_output=True).stdout

In [32]:
output

b'/home/sira/sira_project/meta-Llama2\n'

In [33]:
import subprocess
import tempfile

with tempfile.TemporaryFile() as tempf:
    proc = subprocess.Popen(['echo', 'a', 'b'], stdout=tempf)
    proc.wait()
    tempf.seek(0)
    print(tempf.read())

b'a b\n'


In [34]:
import subprocess
cmd = [ 'echo', 'arg1', 'arg2' ]
output = subprocess.Popen( cmd, stdout=subprocess.PIPE ).communicate()[0]
print(output)
arg1 arg2


SyntaxError: invalid syntax (3168670265.py, line 5)

In [38]:
import subprocess
cmd = ('date', '-u', '+%A')

p = subprocess.run(cmd, capture_output=True, text=True)
p.CompletedProcess(args=('date', '-u', '+%A'), returncode=0,
                 stdout='Wednesday\n', stderr='')
p.stdout
'Wednesday\n'

subprocess.check_output(cmd, text=True)
'Wednesday\n'

AttributeError: 'CompletedProcess' object has no attribute 'CompletedProcess'

In [89]:
from langchain.tools import BraveSearch

981

In [139]:
from langchain.tools import DuckDuckGoSearchRun
search = DuckDuckGoSearchRun(verbose = True)

In [140]:
search.run("Who is Dion from Omniscien technology")

'Dion Wiggins Chief Technology Officer at Omniscien Technologies Published Jan 12, 2023 + Follow During my time and Vice President and Research Director at Gartner, many approaches were used... Dion Wiggins CTO, Co-Founder, Omniscien Technologies Rise of the Machines: Balancing Language-Related AI Opportunities and Risks Securing the Future of Generative AI, Machine Translation, Speech Recognition, NLP and AI Augmented Processes Tuesday 25 April 2023 Watch the Replay Animals and Pets Anime Art Cars and Motor Vehicles Crafts and DIY Culture, Race, and Ethnicity Ethics and Philosophy Fashion Food and Drink History Hobbies Law Learning and Education Military Movies Music Place Podcasts and Streamers Politics Programming Reading, Writing, and Literature Religion and Spirituality Science Tabletop Games ... Omniscien Technologies (formerly Asia Online) is a privately owned company delivering machine translation and language processing software and services. The company is backed by individua

In [105]:
from langchain.tools.base import BaseTool
from langchain.pydantic_v1 import Field
class DuckDuckGoSearchResults(BaseTool):
    """Tool that queries the DuckDuckGo search API and gets back json."""

    name: str = "DuckDuckGo Results JSON"
    description: str = (
        "A wrapper around Duck Duck Go Search. "
        "Useful for when you need to answer questions about current events. "
        "Input should be a search query. Output is a JSON array of the query results"
    )
    num_results: int = 4
    api_wrapper: DuckDuckGoSearchAPIWrapper = Field(
        default_factory=DuckDuckGoSearchAPIWrapper
    )
    backend: str = "api"

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool."""
        res = self.api_wrapper.results(query, self.num_results, backend=self.backend)
        res_strs = [", ".join([f"{k}: {v}" for k, v in d.items()]) for d in res]
        return ", ".join([f"[{rs}]" for rs in res_strs])

ModuleNotFoundError: No module named 'langchain.pydantic_v1'

In [141]:
from langchain.tools import DuckDuckGoSearchResults
search = DuckDuckGoSearchResults(verbose = True)
search.run("Who is Dion from Omniscien technology")

'[snippet: Dion Wiggins Chief Technology Officer at Omniscien Technologies Published Jan 12, 2023 + Follow During my time and Vice President and Research Director at Gartner, many approaches were used..., title: AI, NLP, Speech Recognition and Machine Translation ... - LinkedIn, link: https://www.linkedin.com/pulse/ai-nlp-speech-recognition-machine-translation-next-chapter-wiggins], [snippet: Dion Wiggins CTO, Co-Founder, Omniscien Technologies Rise of the Machines: Balancing Language-Related AI Opportunities and Risks Securing the Future of Generative AI, Machine Translation, Speech Recognition, NLP and AI Augmented Processes Tuesday 25 April 2023 Watch the Replay, title: Webinars - Omniscien Technologies, link: https://omniscien.com/resources/webinars/], [snippet: Animals and Pets Anime Art Cars and Motor Vehicles Crafts and DIY Culture, Race, and Ethnicity Ethics and Philosophy Fashion Food and Drink History Hobbies Law Learning and Education Military Movies Music Place Podcasts and

In [96]:
from langchain.utilities import DuckDuckGoSearchAPIWrapper

wrapper = DuckDuckGoSearchAPIWrapper(region="de-de", time="d", max_results=2)

In [97]:
search = DuckDuckGoSearchResults(api_wrapper=wrapper, backend="news")

In [144]:
search.run("Who is Dion from Omniscien technology")

'[snippet: Dion Wiggins Chief Technology Officer at Omniscien Technologies Published Jan 12, 2023 + Follow During my time and Vice President and Research Director at Gartner, many approaches were used..., title: AI, NLP, Speech Recognition and Machine Translation ... - LinkedIn, link: https://www.linkedin.com/pulse/ai-nlp-speech-recognition-machine-translation-next-chapter-wiggins], [snippet: Animals and Pets Anime Art Cars and Motor Vehicles Crafts and DIY Culture, Race, and Ethnicity Ethics and Philosophy Fashion Food and Drink History Hobbies Law Learning and Education Military Movies Music Place Podcasts and Streamers Politics Programming Reading, Writing, and Literature Religion and Spirituality Science Tabletop Games ..., title: webinar with Omniscien CEO Dion Wiggins and Chief Scientist ... - Reddit, link: https://www.reddit.com/r/machinetranslation/comments/ywrm8f/webinar_with_omniscien_ceo_dion_wiggins_and_chief/], [snippet: Dion Wiggins CTO, Co-Founder, Omniscien Technologies

In [146]:
from langchain.utilities import DuckDuckGoSearchAPIWrapper
search.run("Who is Dion from Omniscien technology", x)

NameError: name 'x' is not defined

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
user_input = "How do LLM Powered Autonomous Agents work?"
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=web_research_retriever)
result = qa_chain({"question": user_input})
result

In [106]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import LlamaCpp

def load_llm():
    n_gpu_layers = 32  # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm = LlamaCpp(
        #model_path="llama-2-7b-chat.ggmlv3.q8_0.bin",
        model_path="/home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin",
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        callback_manager=callback_manager,
        verbose=True,n_ctx = 4096, temperature = 0.1, max_tokens = 4096
    )
    return llm

In [110]:
tools = load_tools(["searx-search"], searx_host="http://localhost:8888", llm=load_llm())

llama.cpp: loading model from /home/sira/sira_project/meta-Llama2/llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7354.73 MB (+ 2048.00 MB per state)
llama_new_context_with_model: kv s

In [142]:
from langchain.tools import BraveSearch
api_key = "BSAv1neIuQOsxqOyy0sEe_ie2zD_n_V"
tool = BraveSearch.from_api_key(api_key=api_key, search_kwargs={"count": 3})

In [143]:
tool.run("obama middle name")

'[{"title": "Obama\'s Middle Name -- My Last Name -- is \'Hussein.\' So?", "link": "https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/", "snippet": "I wasn\\u2019t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack <strong>Obama</strong>\\u2019<strong>s</strong> <strong>middle</strong> <strong>name</strong> \\u2014 my last <strong>name</strong> \\u2014 as if he had anti-Muslim Tourette\\u2019s. \\u201cHussein,\\u201d Cunningham hissed like he was beckoning Satan when shouting the ..."}, {"title": "What\'s up with Obama\'s middle name? - Quora", "link": "https://www.quora.com/Whats-up-with-Obamas-middle-name", "snippet": "Answer (1 of 15): A better question would be, \\u201cWhat\\u2019s up with <strong>Obama</strong>\\u2019s first <strong>name</strong>?\\u201d President Barack Hussein <strong>Obama</strong>\\u2019s father\\u2019s <strong>name</strong> was Barack Hussein <strong>Obama</

In [None]:
tools = load_tools(["google-search"], llm=llm)
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)