In [103]:
# ! pip3 install pypdf
# ! pip3 install cohere
# ! pip3 install chromadb
# ! pip3 install typing_extensions

In [1]:
import requests, json
import gradio as gr

In [2]:
# Ollama model name; generate() sends it as the 'model' field of each request.
model = 'openchat:latest' #You can replace the model name if needed
# Conversation context forwarded to the Ollama API; chat() rebinds this global
# with the value returned by generate() after every turn.
context = [] 

# Through the Ollama API

In [3]:
def generate(prompt, context, top_k, top_p, temp):
    """Send one prompt to the local Ollama ``/api/generate`` endpoint.

    Args:
        prompt: Text sent as the request's ``prompt`` field.
        context: Context value from a previous call (``[]`` for a new
            conversation); forwarded unchanged as the ``context`` field.
        top_k: Value for the ``top_k`` sampling option.
        top_p: Value for the ``top_p`` sampling option.
        temp: Value for the ``temperature`` sampling option.

    Returns:
        A ``(response, context)`` tuple: the concatenated ``response``
        fragments and the ``context`` reported by the final (``done``)
        message. If the body ends without a ``done`` message, the text
        collected so far is returned with the context that was passed in,
        so callers can always unpack two values.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a message in the body carries an ``error`` field.
    """
    r = requests.post(
        'http://localhost:11434/api/generate',
        json={
            'model': model,
            'prompt': prompt,
            'context': context,
            'options': {
                'top_k': top_k,
                # Each argument goes to the option of the same name
                # (these two were previously crossed over).
                'temperature': temp,
                'top_p': top_p,
            },
        },
        stream=False,
    )
    r.raise_for_status()

    response = ""

    # The body is a sequence of JSON objects, one per line.
    for line in r.iter_lines():
        if not line:
            # Blank lines cannot be parsed as JSON; skip them.
            continue
        body = json.loads(line)
        if 'error' in body:
            raise RuntimeError(body['error'])

        response_part = body.get('response', '')
        print(response_part)
        response += response_part

        if body.get('done', False):
            return response, body.get('context', [])

    # No 'done' message was seen: still return a 2-tuple.
    return response, context



def chat(input, chat_history, top_k, top_p, temp):
    """Run one chat turn and return the updated history twice.

    The user message is sent to ``generate`` together with the module-level
    ``context``; the returned context replaces that global. The
    ``(input, output)`` pair is appended to the history.

    The history is returned twice on purpose: the first copy updates the
    chatbot widget, the second updates the state component that keeps the
    conversation history across interactions.
    """
    global context

    history = chat_history if chat_history else []
    reply, context = generate(input, context, top_k, top_p, temp)
    history.append((input, reply))
    return history, history


In [4]:
# Chat UI for the raw Ollama API: a chatbot pane, a textbox, three sampling
# sliders and a SEND button wired to chat().
with gr.Blocks() as block:

    gr.Markdown("""<h1><center> Jarvis </center></h1>
    """)

    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Type here")

    # Holds the conversation history between clicks.
    state = gr.State()

    with gr.Row():
        top_k = gr.Slider(
            minimum=0.0,
            maximum=100.0,
            value=40,
            label="top_k",
            info="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.9,
            label="top_p",
            info=" Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
        )
        temp = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=0.8,
            label="temperature",
            info="The temperature of the model. Increasing the temperature will make the model answer more creatively. (Default: 0.8)",
        )

    submit = gr.Button("SEND")

    # chat() returns the history twice: once for the widget, once for the state.
    submit.click(
        fn=chat,
        inputs=[message, state, top_k, top_p, temp],
        outputs=[chatbot, state],
    )


block.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


 Hello
!
 I
 am
 here
 to
 help
 you
 with
 any
 questions
 or
 information
 you
 need
.
 Please
 feel
 free
 to
 ask
 anything
 and
 I
 will
 do
 my
 best
 to
 assist
 you
.

 Here
 are
 a
 few
 notable
 Arab
 scientists
 throughout
 history
:




1
.
 Al
-
K
h
aw
ar
iz
mi
 (
c
.
 
7
8
0
 -
 
8
5
0
)
 -
 A
 Pers
ian
 scholar
 who
 made
 significant
 contributions
 to
 mathemat
ics
,
 particularly
 in
 the
 fields
 of
 algebra
 and
 tr
ig
on
ometry
.
 He
 is
 often
 considered
 the
 father
 of
 algebra
.


2
.
 Al
-
R
azi
 (
8
6
5
 -
 
9
2
5
)
 -
 An
 influential
 Arab
 chem
ist
,
 physician
,
 and
 philos
opher
 who
 made
 important
 advance
ments
 in
 chemistry
,
 medicine
,
 and
 pharm
ac
ology
.
 He
 was
 known
 for
 his
 work
 on
 dist
ill
ation
 and
 the
 development
 of
 chemical
 comp
ounds
.


3
.
 Al
-
F
ar
abi
 (
8
7
2
 or
 
8
7
3
 -
 
9
5
0
)
 -
 A
 prominent
 Arab
 philos
opher
,
 log
ician
,
 and
 scientist
 who
 made
 significant
 contributions
 to
 various
 fields
,
 in



# Through LangChain

### Loading a document

### Web page

### PDF

In [7]:
from langchain.document_loaders import PyPDFLoader
# Read 'falcon-paper.pdf' (path relative to the notebook's working directory);
# `pages` is whatever PyPDFLoader.load() returns for that file.
loader = PyPDFLoader('falcon-paper.pdf')
pages = loader.load()

In [8]:
# How many entries the loader produced (57 in the recorded run).
len(pages)

57

In [9]:
# Keep only the first 39 entries. Presumably this drops references/appendices
# after the main text — TODO confirm against the PDF.
pages = pages[:39]

In [11]:
# Peek at the first 100 characters of the first entry's text.
pages[0].page_content[:100]

'The Falcon Series of Open Language Models\nThe Falcon LLM Team∗\nEbtesam Almazrouei Hamza Alobeidli Ab'

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [78]:
# Splitter configured with chunk_size=500 and no overlap between consecutive chunks.
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0, 
)

In [84]:
from langchain_core.documents.base import Document
# Earlier variant, kept for reference: split each page separately.
# docs = r_splitter.split_documents(pages)
# Join the text of all kept pages into one string (single space between pages)
# and split that single document instead — presumably so chunks are not cut at
# page boundaries.
paper_content = ' '.join([p.page_content for p in pages])
print(paper_content[:100])
docs = r_splitter.split_documents([Document(page_content=paper_content)])

The Falcon Series of Open Language Models
The Falcon LLM Team∗
Ebtesam Almazrouei Hamza Alobeidli Ab


In [85]:
# Sanity check: show the first and the last chunk produced by the splitter.
print(docs[0])
print(docs[-1])

page_content='The Falcon Series of Open Language Models\nThe Falcon LLM Team∗\nEbtesam Almazrouei Hamza Alobeidli Abdulaziz Alshamsi Alessandro Cappelli\nRuxandra Cojocaru Mérouane Debbah Etienne Go ffinet Daniel Hesslow Julien Launay\nQuentin Malartic Daniele Mazzotta Badreddine Noune Baptiste Pannier Guilherme Penedo\nTechnology Innovation Institute, Abu Dhabi\nhttps://huggingface.co/tiiuae/\nAbstract\nWe introduce the Falcon series: 7B, 40B, and 180B parameters causal decoder-'
page_content='all. By open-sourcing artificial intelligence research and models, we can foster a broader and more\ndiverse community, and benefit from vibrant collaborative e fforts to improve the safety and reliability\nof large language models. We hope the Falcon series can be a small step towards this vision.\n39'


### embedding model

In [86]:
# Hugging Face model id used for the llama-index embeddings below.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
# Alternative model, currently disabled:
# EMBEDDING_MODEL_NAME = 'OrdalieTech/Solon-embeddings-large-0.1'

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

print(f"Loading embeddings model: {EMBEDDING_MODEL_NAME} ...")

# Wrap the LangChain HuggingFace embeddings in llama-index's LangchainEmbedding
# so they can be passed as `embed_model` to the ServiceContext later on.
# Embeddings are requested without normalization.
embedding_model = LangchainEmbedding(HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    encode_kwargs = {"normalize_embeddings": False}
  )
)

Loading embeddings model: sentence-transformers/all-MiniLM-L12-v2 ...


### llm

In [87]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama

# Ollama-backed LLM using the "llama2" model. A StreamingStdOutCallbackHandler
# is attached so generated tokens are echoed to stdout while the model runs.
# (A first, callback-less Ollama instance used to be created here and was
# immediately overwritten; only the instance that is actually used is kept.)
llm = Ollama(
    model="llama2",
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

In [57]:
# Quick smoke test of the LLM. The return value is discarded; the text shown
# below the cell comes from the stdout streaming callback attached to `llm`.
_ = llm("Tell me about the history of AI")


Artificial intelligence (AI) has a rich and varied history that spans several decades. Here is a brief overview of some of the key milestones in the development of AI:

1. 1950s: The Dartmouth Conference: The field of AI was founded at a conference held at Dartmouth College in 1956. Attendees included computer scientists, mathematicians, and cognitive scientists who were interested in exploring the possibilities of creating machines that could simulate human intelligence.
2. 1951: The Turing Test: British mathematician Alan Turing proposed a test to measure a machine's ability to exhibit intelligent behavior equivalent to, or indistinguishable from, that of a human. The Turing Test has since become a benchmark for measuring the success of AI systems.
3. 1956: The First AI Program: Computer scientist John McCarthy created the first AI program, called the Logical Theorist, which was designed to reason and solve problems using logical deduction.
4. 1960s: Rule-Based Expert Systems: The d

### llama-index

In [58]:
# NOTE(review): this import rebinds the name `Document`, which earlier in the
# notebook referred to langchain_core's Document class. Any later
# `Document(page_content=...)` call resolves to the llama_index class when the
# notebook is run top to bottom.
from llama_index import Document

# Single llama-index document holding the whole joined paper text.
document = Document(text=paper_content)

In [59]:
from llama_index import VectorStoreIndex
from llama_index import ServiceContext

# Tell llama-index to use the local Ollama LLM and the HuggingFace embedding
# model defined above instead of its defaults.
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embedding_model
)
# Build a vector index over the single paper document with that configuration.
index = VectorStoreIndex.from_documents([document],
                                        service_context=service_context)

In [60]:
# Query interface over the index, created with default settings.
query_engine = index.as_query_engine()

In [63]:
# Ask a question about the paper through the llama-index query engine.
response = query_engine.query("What about Hardware scalability?")

In the Falcon series of models, we focused primarily on hardware scalability across three axes: performance, data, and hardware. Large-scale training requires thousands of hardware accelerators to work efficiently in unison; making the best use of these accelerators requires principled distributed training methods. Methods that are able to best run efficiently and leverage large-scale compute are often the ones that gain the most traction in the community, as best evidenced by the Transformer architecture itself. Furthermore, it is difficult to find architectural improvements that significantly improve the task performance of models, compared to the impact of data for instance. Accordingly, we focus architectural decisions not on improving task performance, but on improving hardware scalability and throughput.

In [93]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# Imported under an explicit alias: by this point in the notebook the bare name
# `Document` has been rebound to llama_index's Document class, which is not the
# class the LangChain splitter and Chroma work with.
from langchain_core.documents.base import Document as LangchainDocument

# Sentence-transformers model used to embed chunks for the Chroma store.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Re-split the joined paper text and index the chunks in a Chroma vector store.
docs = r_splitter.split_documents([LangchainDocument(page_content=paper_content)])
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding_function)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [96]:
# Retrieve the chunks most similar to the query from the Chroma store.
# NOTE(review): this rebinds `docs` (previously the full list of chunks) to the
# search results.
query = "What about Hardware scalability?"
docs = vectorstore.similarity_search(query)

# print results
# Only the text of the first result is shown.
print(docs[0].page_content)

Hardware scalability. Large-scale training requires thousands of hardware accelerators to work
efficiently in unison; making the best use of these accelerators requires in turn principled distributed
training methods (Shoeybi et al., 2019). Methods that are able to best run e fficiently and leverage
large-scale compute are often the ones that gain the most traction in the community (Hooker, 2021),
as best evidenced by the Transformer architecture itself (Vaswani et al., 2017). Furthermore, it


In [100]:
from langchain.chains import RetrievalQA
# Question-answering chain: the Chroma retriever supplies context and `llm`
# writes the answer. verbose=True makes the chain log when it starts.
qachain=RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever(), verbose=True)
# Run the chain on the query defined earlier; the recorded result is a dict with
# 'query' and 'result' keys.
qachain({"query": query})



[1m> Entering new RetrievalQA chain...[0m
Based on the provided context, it seems that hardware scalability is a crucial aspect of the Falcon series of models. The authors emphasize the importance of leveraging large-scale compute to achieve efficient and effective training methods. They also mention that the best use of hardware accelerators requires principled distributed training methods, which are able to run efficiently and leverage large-scale compute.

In particular, the authors highlight the challenges of improving task performance through architectural improvements alone, as the sustained increase in the scale of large language models can be difficult to achieve without significant investments in hardware scalability and throughput. To address this challenge, they focus on improving hardware scalability and throughput, rather than task performance directly.

Furthermore, the authors mention the adoption of tweaks such as a revised multiquery attention scheme to improve inf

{'query': 'What about Hardware scalability?',
 'result': 'Based on the provided context, it seems that hardware scalability is a crucial aspect of the Falcon series of models. The authors emphasize the importance of leveraging large-scale compute to achieve efficient and effective training methods. They also mention that the best use of hardware accelerators requires principled distributed training methods, which are able to run efficiently and leverage large-scale compute.\n\nIn particular, the authors highlight the challenges of improving task performance through architectural improvements alone, as the sustained increase in the scale of large language models can be difficult to achieve without significant investments in hardware scalability and throughput. To address this challenge, they focus on improving hardware scalability and throughput, rather than task performance directly.\n\nFurthermore, the authors mention the adoption of tweaks such as a revised multiquery attention schem

# Chain it together

In [None]:
# https://medium.com/@shrinath.suresh/implementing-streaming-chatbot-with-langchain-callbacks-a-step-by-step-guide-a527a7d65b8b

In [104]:
import gradio as gr
from typing import Any
from queue import Queue, Empty
from langchain.llms import LlamaCpp
from langchain.callbacks.base import BaseCallbackHandler
from langchain.prompts import PromptTemplate
from threading import Thread

In [105]:
# Queue shared between the LLM worker thread (producer of tokens) and the
# Gradio `bot` generator (consumer).
q = Queue()
# Unique sentinel put on the queue to mark the end of one answer; compared by
# identity (`is`) on the consumer side.
job_done = object()

In [106]:
class QueueCallback(BaseCallbackHandler):
    """Callback handler that forwards each streamed LLM token to a queue."""

    def __init__(self, q):
        # q: queue-like object; only its put() method is used by this handler.
        self.q = q

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Put the newly generated token on the queue."""
        self.q.put(token)

    def on_llm_end(self, *args, **kwargs: Any) -> None:
        """Do nothing when generation ends.

        This hook used to return ``self.q.empty()``, which contradicted the
        ``-> None`` annotation and had no effect on the queue. The code that
        starts the LLM call is responsible for signalling completion.
        """
        return None

In [108]:
# Callback list that pushes streamed tokens onto the shared queue `q`.
callbacks = [QueueCallback(q)]
# Prompt template with a single {question} placeholder.
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

# NOTE(review): answer() below passes the raw question straight to `llm`, so
# this prompt does not appear to be used by the streaming demo — confirm intent.
prompt = PromptTemplate(template=template, input_variables=["question"])

In [109]:
def answer(question):
    """Start answering ``question`` on a background thread.

    The worker calls ``llm(question)`` and discards the return value. Tokens
    are expected to reach the shared queue ``q`` through the callbacks
    attached to ``llm``. When the call finishes, whether it succeeded or
    raised, the ``job_done`` sentinel is put on ``q`` so a consumer polling
    the queue can stop waiting.

    Returns immediately; nothing is returned.
    """
    def task():
        try:
            llm(question)
        finally:
            # Always signal completion; without this a failing LLM call would
            # leave the consumer loop waiting forever.
            q.put(job_done)

    t = Thread(target=task)
    t.start()

In [112]:
# Attach the queue-forwarding callbacks to the existing `llm` object so streamed
# tokens are pushed onto `q`.
llm.callbacks = callbacks

In [113]:
# Streaming chat UI: the textbox submit first records the user turn, then the
# `bot` generator streams the model's reply into the chatbot pane.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Empty the textbox and append the user turn with no bot reply yet."""
        extended = history + [[user_message, None]]
        return "", extended

    def bot(history):
        """Yield the history repeatedly while tokens for the last turn arrive."""
        latest_turn = history[-1]
        question = latest_turn[0]
        print("Question: ", question)
        latest_turn[1] = ""
        answer(question=question)

        finished = False
        while not finished:
            try:
                next_token = q.get(True, timeout=1)
            except Empty:
                # Nothing arrived within the timeout; keep polling.
                continue
            if next_token is job_done:
                finished = True
            else:
                latest_turn[1] += next_token
                yield history

    (
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
        .then(bot, chatbot, chatbot)
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Question:  hello


# RAG

In [118]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

In [114]:
# RAG prompt with two placeholders: {context} (retrieved text) and {question}.
# This rebinds `template`, which earlier held the question-only prompt.
template = """You are an smart and helpful assistant of an AI researcher. Given the following context, answer the question:
Context:{context}

Question: {question}
"""

# NOTE(review): `prompt` is assigned again from the same template in the next
# cell (as a ChatPromptTemplate), so this PromptTemplate looks unused — confirm.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [117]:
# Build the chat prompt from the RAG template (replaces the previous `prompt`).
prompt = ChatPromptTemplate.from_template(template)

# NOTE(review): `model` was defined at the top of the notebook as the Ollama
# model-name string that generate() sends in its request. Rebinding it to the
# LLM object here means generate() would send that object instead if it is
# called after this cell.
model = llm

In [121]:
# Retrieval-augmented chain: the input question is passed both to the Chroma
# retriever (filling {context}) and through unchanged (filling {question});
# the formatted prompt is then sent to the LLM.
# `llm` is referenced directly rather than through the global `model`, because
# `model` is also used elsewhere in this notebook for the Ollama model-name
# string and its value depends on which cell ran last.
chain = (
    {"context": vectorstore.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | llm
)

In [123]:
def answer_rag(question):
    """Start answering ``question`` with the RAG chain on a background thread.

    The worker calls ``chain.invoke(question)`` and discards the return value.
    Tokens are expected to reach the shared queue ``q`` through the callbacks
    attached to the LLM inside the chain. When the call finishes, whether it
    succeeded or raised, the ``job_done`` sentinel is put on ``q`` so a
    consumer polling the queue can stop waiting.

    Returns immediately; nothing is returned.
    """
    def task():
        try:
            chain.invoke(question)
        finally:
            # Always signal completion; without this a failing chain call
            # would leave the consumer loop waiting forever.
            q.put(job_done)

    t = Thread(target=task)
    t.start()

In [124]:
# Streaming chat UI for the RAG chain: same layout as the plain streaming demo,
# but replies are produced by answer_rag().
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Empty the textbox and append the user turn with no bot reply yet."""
        extended = history + [[user_message, None]]
        return "", extended

    def bot(history):
        """Yield the history repeatedly while tokens for the last turn arrive."""
        latest_turn = history[-1]
        question = latest_turn[0]
        print("Question: ", question)
        latest_turn[1] = ""
        answer_rag(question=question)

        finished = False
        while not finished:
            try:
                next_token = q.get(True, timeout=1)
            except Empty:
                # Nothing arrived within the timeout; keep polling.
                continue
            if next_token is job_done:
                finished = True
            else:
                latest_turn[1] += next_token
                yield history

    (
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False)
        .then(bot, chatbot, chatbot)
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue()
demo.launch()


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Question:  what is hardware scalability ?
Question:  what is special about Falcon?


In [None]:
# Example question to try in the chat UI: what is hardware scalability ?