<a href="https://colab.research.google.com/github/Santiago-R/aupa.ai/blob/main/lm-hackers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [A hacker's guide to Language Models](https://colab.research.google.com/github/fastai/lm-hackers/blob/main/lm-hackers.ipynb#scrollTo=0b017bfc-5be0-4e41-9fa1-9f685c3b0de5)


## Initial restart

In [1]:
import os
import importlib
a_spec = importlib.util.find_spec('accelerate')
b_spec = importlib.util.find_spec('bitsandbytes')
found = a_spec is not None and b_spec is not None

if found: print('Dependencies installed in previous run ✔️')
else:
    !pip install accelerate -qq
    !pip install bitsandbytes -qq
    !pip install auto-gptq -qq
    !pip install optimum -qq

    print('\nDependency installation requires restart --> killing runtime 💀')

Dependencies installed in previous run ✔️


In [2]:
# `found` is False only when the first cell had to pip-install packages;
# in that case end this process so the next run starts from a fresh interpreter.
if not found:
    current_pid = os.getpid()
    os.kill(current_pid, 9)  # signal 9 (SIGKILL): immediate, no cleanup handlers run

## Setup

In [3]:
from google.colab import drive
drive.mount('/content/drive')  # , force_remount=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# import tokenize
# from io import BytesIO

In [5]:
# Load API keys as environment variables (from Drive's api_keys.env).
# Requires Drive to be mounted at /content/drive first.
!pip install python-dotenv -qq
from dotenv import load_dotenv
load_dotenv(dotenv_path='/content/drive/MyDrive/LLM/api_keys.env')

True

## OpenAI API with system prompt

In [None]:
!pip install openai -qq
from openai import ChatCompletion, Completion

#### Sys prompt

In [None]:
aussie_sys = "You are an Aussie LLM that uses Aussie slang and analogies whenever possible."
question = "What is money?"

c = ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": aussie_sys},
              {"role": "user", "content": question}])

- [Model options](https://platform.openai.com/docs/models)

In [None]:
def response(c):
    """Return the text of the first choice in completion `c`.

    Looks under `message.content` first (chat-style result) and, if that key
    path is missing, falls back to `text` (plain completion-style result).
    """
    try:
        reply = c['choices'][0]['message']['content']
    except KeyError:
        reply = c['choices'][0]['text']
    return reply

In [None]:
response(c)

#### Usage

In [None]:
print(c.usage)

In [None]:
0.002 / 1000 * 150  # GPT 3.5

In [None]:
0.03 / 1000 * 150  # GPT 4

#### askgpt

In [None]:
c = ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "system", "content": aussie_sys},
              {"role": "user", "content": "What is money?"},
              {"role": "assistant", "content": "Well, mate, money is like kangaroos actually."},
              {"role": "user", "content": "Really? In what way?"}])

In [None]:
response(c)

In [None]:
def askgpt(user, system=None, model="gpt-3.5-turbo", **kwargs):
    """Send one `user` message to `model`, optionally preceded by a `system` prompt.

    Extra keyword arguments (e.g. `functions=`) are passed straight through to
    `ChatCompletion.create`, whose result is returned unchanged.
    """
    user_msg = {"role": "user", "content": user}
    # A falsy `system` (None or '') means no system message is sent at all.
    msgs = [{"role": "system", "content": system}, user_msg] if system else [user_msg]
    return ChatCompletion.create(model=model, messages=msgs, **kwargs)

In [None]:
response(askgpt('What is the meaning of life?', system=aussie_sys))

- [Limits](https://platform.openai.com/docs/guides/rate-limits/what-are-the-rate-limits-for-our-api)

The following rate-limit retry helper was created by Bing:

In [None]:
def call_api(prompt, model="gpt-3.5-turbo", max_retries=5):
    """Send `prompt` as a single user message, waiting and retrying on rate limits.

    On `openai.error.RateLimitError` the call sleeps for the number of seconds
    given by the error's `retry-after` header (60 if absent) and tries again,
    at most `max_retries` times; after that the last error is re-raised.
    Returns the `ChatCompletion.create` result.
    """
    # Imported here because the notebook only does
    # `from openai import ChatCompletion, Completion`, so neither the `openai`
    # module name nor `time` is otherwise in scope.
    import time
    import openai

    msgs = [{"role": "user", "content": prompt}]
    for attempt in range(max_retries + 1):
        try: return ChatCompletion.create(model=model, messages=msgs)
        except openai.error.RateLimitError as e:
            if attempt == max_retries: raise  # give up instead of retrying forever
            retry_after = int((e.headers or {}).get("retry-after", 60))
            print(f"Rate limit exceeded, waiting for {retry_after} seconds...")
            time.sleep(retry_after)

In [None]:
response(call_api("What's the world's funniest joke? Has there ever been any scientific analysis?"))

In [None]:
c = Completion.create(prompt="Australian Jeremy Howard is ",
                      model="gpt-3.5-turbo-instruct", echo=True)  # logprobs=5)

In [None]:
response(c)

## OpenAI API with code interpreter

In [None]:
from pydantic import create_model
import inspect, json
from inspect import Parameter

#### Example with sums function

In [None]:
def sums(a:int, b:int=1):
    "Adds a + b"
    # The docstring above is read by `schema` and used as the function's
    # description, so it is part of what the model sees.
    total = a + b
    return total

In [None]:
def schema(f):
    """Describe function `f` as a dict with `name`, `description` and `parameters`.

    `parameters` is the JSON schema of a pydantic model built from `f`'s
    signature: each parameter contributes its annotation, and is required
    unless it has a default. `description` is `f`'s docstring.
    """
    # `Parameter.empty` is a sentinel, so test identity: `==` would invoke the
    # default value's own `__eq__`, which need not return a plain bool.
    kw = {n:(o.annotation, ... if o.default is Parameter.empty else o.default)
          for n,o in inspect.signature(f).parameters.items()}
    model = create_model(f'Input for `{f.__name__}`', **kw)
    # pydantic v2 names this `model_json_schema` and deprecates `.schema()`;
    # fall back to the old name when running on v1.
    to_schema = getattr(model, 'model_json_schema', None) or model.schema
    return dict(name=f.__name__, description=f.__doc__, parameters=to_schema())

In [None]:
schema(sums)

In [None]:
# The function offered to the model (and allow-listed for dispatch) is named
# `sums`, so the prompts refer to it by that exact name.
c = askgpt("Use the `sums` function to solve this: What is 6+3?",
           system = "You must use the `sums` function instead of adding yourself.",
           functions=[schema(sums)])

In [None]:
c

In [None]:
m = c.choices[0].message
m

In [None]:
k = m.function_call.arguments
print(k)

In [None]:
# Allow-list of function names that `call_func` will look up and execute;
# any other name requested by the model is refused.
funcs_ok = {'sums', 'python'}

In [None]:
def call_func(c):
    """Run the function call requested in completion `c` and return its result.

    The requested name must be in `funcs_ok`; otherwise a "Not allowed" line is
    printed and None is returned. Arguments arrive as a JSON string and are
    passed to the function as keyword arguments.
    """
    requested = c.choices[0].message.function_call
    if requested.name in funcs_ok:
        target = globals()[requested.name]  # resolved from this module's globals
        kwargs = json.loads(requested.arguments)
        return target(**kwargs)
    return print(f'Not allowed: {requested.name}')

In [None]:
call_func(c)

In [None]:
import ast

def run(code):
    """Execute the source string `code` and return the value of its last expression.

    If the final statement is a bare expression it is rewritten into an
    assignment to `_result`, which is read back after execution. Returns None
    when the source is empty or does not end in an expression.
    Note: the code runs via `exec` in a fresh namespace with no sandboxing.
    """
    module = ast.parse(code)
    namespace = {}

    # Swap a trailing expression statement for `_result = <expression>`
    if module.body and isinstance(module.body[-1], ast.Expr):
        final_expr = module.body.pop()
        capture = ast.Assign(targets=[ast.Name(id='_result', ctx=ast.Store())],
                             value=final_expr.value)
        module.body.append(ast.fix_missing_locations(capture))

    exec(compile(module, filename='<ast>', mode='exec'), namespace)
    return namespace.get('_result', None)

In [None]:
run("""
a=1
b=2
a+b
""")

In [None]:
def python(code:str, safe_mode:bool=False):
    "Return result of executing `code` using python. If execution not permitted, returns `#FAIL#`"
    # Without safe mode the code is executed straight away.
    if not safe_mode: return run(code)
    # Safe mode: show the code and require an explicit 'y' (any case) to proceed.
    answer = input(f'Proceed with execution?\n```\n{code}\n```\n')
    return run(code) if answer.lower()=='y' else '#FAIL#'

In [None]:
c = askgpt("What is 12 factorial?",
           system = "Use python for any required computations.",
           functions=[schema(python)])

In [None]:
def code_response(c, repl=True):
    """Print the reply text of completion `c`; if it requested a function call, run its code.

    The code is taken from the call's `arguments`: when those start with `{`
    they are parsed as JSON and the `code` field is used, otherwise the raw
    string is used as-is. The code is executed with `run` and its result is
    returned (None when there was no function call). `repl` is accepted but
    not used by this version.
    """
    reply_text = response(c)
    if reply_text is None: reply_text = ''
    message = c['choices'][0]['message']
    if 'function_call' in message:
        code = message['function_call']['arguments']
        if code[0] == '{': code = json.loads(code)['code']
        reply_text += f'\n==========\n{code}\n==========\n>>> '
        result = run(code)  # executed before printing, as any output of the code comes first
        print(reply_text)
        return result
    # No code output
    print(reply_text)
    return None

In [None]:
c

In [None]:
call_func(c)

In [None]:
code_response(c)

#### Using Python's exec & eval

In [None]:
# Hand-written function description named `exec`, in the same
# name/description/parameters layout that `schema` produces. Its single
# required string parameter `source` is the field `code_response` reads back
# from the model's JSON arguments.
exec_schema = {
    'name': 'exec',
    'description': 'Execute the given source Python code',
    'parameters': {
        'title': 'Input for `exec`',
        'type': 'object',
        'properties': {'source': {'title': 'S', 'type': 'string'}},
        'required': ['source']}}

In [None]:
def code_response(c, repl=True):
    """Print the reply text of completion `c`; if it requested an `exec` call, run the code.

    The code is taken from the call's `arguments`: when those start with `{`
    they are parsed as JSON and the `source` field is used, otherwise the raw
    string is used as-is.

    With `repl=True` the last line is evaluated as an expression after the
    preceding lines have been executed, and its value is appended to the
    printed text and returned. If the last line is not an expression the whole
    code is executed and the result is None. With `repl=False` the code is
    executed and None is returned. Returns None when there was no function call.
    """
    txt_out = response(c)
    if txt_out is None: txt_out = ''
    message = c['choices'][0]['message']
    if 'function_call' not in message:
        print(txt_out)
        return  # No code output
    code = message['function_call']['arguments']
    # Slice rather than index so an empty arguments string does not raise IndexError
    if code[:1] == '{': code = json.loads(code)['source']
    txt_out += f'\n==========\n{code}\n==========\n>>> '
    # NOTE(security): this executes model-generated code with no sandboxing.
    # One explicit namespace is shared by exec and eval. Passing `locals()` to
    # each relied on both calls returning the same dict (not guaranteed: under
    # PEP 667 each call is an independent snapshot) and exposed this
    # function's own variables to the executed code.
    ns = {}
    if repl:
        # rstrip so a trailing newline does not leave an empty last line
        body, _, footer = code.rstrip().rpartition('\n')
        try: last_expr = compile(footer.strip(), '<model-code>', 'eval')
        except SyntaxError: last_expr = None
        if last_expr is None:
            # Last line is a statement (or there is no code): just execute everything
            exec(code, ns)
            result = None
        else:
            exec(body, ns)
            result = eval(last_expr, ns)
        txt_out += str(result)
    else:
        exec(code, ns)
        result = None
    print(txt_out)
    return result

In [None]:
c = askgpt("What is 12 factorial?",
           system = "Use python for any required computations.",
           functions=[exec_schema])

In [None]:
factorial_result = code_response(c, repl=True)

In [None]:
factorial_result

###### Using result in later calls

In [None]:
c = ChatCompletion.create(
    model="gpt-3.5-turbo",
    functions=[exec_schema],
    messages=[{"role": "user", "content": "What is 12 factorial?"},
              {"role": "function", "name": "exec", "content": str(factorial_result)}])

In [None]:
print(response(c))

###### And we didn't break its basic use!

In [None]:
c = askgpt("What is the capital of France?",
           system = "Use python for any required computations.",
           functions=[exec_schema])

In [None]:
code_response(c)

## PyTorch and Huggingface

In [6]:
!pip install transformers -qq
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

- [HF leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
- [fasteval](https://fasteval.github.io/FastEval/)

In [7]:
mn = "meta-llama/Llama-2-7b-hf"

In [8]:
tokr = AutoTokenizer.from_pretrained(mn)
prompt = "Jeremy Howard is a "
toks = tokr(prompt, return_tensors="pt")

In [9]:
toks

{'input_ids': tensor([[    1,  5677,  6764, 17430,   338,   263, 29871]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [10]:
tokr.batch_decode(toks['input_ids'])

['<s> Jeremy Howard is a ']

In [11]:
# model = AutoModelForCausalLM.from_pretrained(mn, device_map=0, load_in_8bit=True)

In [12]:
# %%time
# res = model.generate(**toks.to("cuda"), max_new_tokens=15).to('cpu')
# res

In [13]:
# tokr.batch_decode(res)

In [14]:
# model = AutoModelForCausalLM.from_pretrained(mn, device_map=0, torch_dtype=torch.bfloat16)

In [15]:
# %%time
# res = model.generate(**toks.to("cuda"), max_new_tokens=15).to('cpu')
# res

In [16]:
model = AutoModelForCausalLM.from_pretrained('TheBloke/Llama-2-7b-Chat-GPTQ', device_map=0, torch_dtype=torch.float16)

In [17]:
%%time
res = model.generate(**toks.to("cuda"), max_new_tokens=15).to('cpu')
res



CPU times: user 964 ms, sys: 143 ms, total: 1.11 s
Wall time: 1.11 s


tensor([[    1,  5677,  6764, 17430,   338,   263, 29871, 29941, 29945, 29899,
          6360, 29899,  1025,   767,   515,   278,  3303,  3900,  1058,   471,
         24383,   297]])

In [18]:
# mn = 'TheBloke/Llama-2-13B-GPTQ'
# model = AutoModelForCausalLM.from_pretrained(mn, device_map=0, torch_dtype=torch.float16)

In [19]:
# %%time
# res = model.generate(**toks.to("cuda"), max_new_tokens=15).to('cpu')
# res

In [20]:
def gen(p, maxlen=15, sample=True):
    """Generate a continuation of prompt `p` with the current `model` and `tokr`.

    `maxlen` is passed as `max_new_tokens` and `sample` as `do_sample`.
    Returns the list produced by `tokr.batch_decode` on the generated ids.
    """
    inputs = tokr(p, return_tensors="pt").to("cuda")
    out_ids = model.generate(**inputs, max_new_tokens=maxlen, do_sample=sample)
    return tokr.batch_decode(out_ids.to('cpu'))

In [21]:
gen(prompt, 50)

['<s> Jeremy Howard is a 22-year old artist from New York who has been making waves in the art world with his unique and thought-provoking works.\nJeremy Howard’s art often explores themes of identity, conformity, and the human']

#### [StableBeluga-7B](https://huggingface.co/stabilityai/StableBeluga-7B)

In [22]:
import gc

# Dropping the name is not enough to make room for the next model: collect
# any remaining references, then hand PyTorch's cached GPU blocks back.
del model
gc.collect()
torch.cuda.empty_cache()

In [23]:
mn = "stabilityai/StableBeluga-7B"
model = AutoModelForCausalLM.from_pretrained(mn, device_map=0, torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [51]:
sb_sys = "You are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can.\n\n"

In [52]:
def mk_prompt(user, syst=sb_sys):
    "Lay out `syst` and `user` under `### System:` / `### User:` headers, ending with an open `### Assistant:` section."
    return f"### System:\n{syst}\n### User:\n{user}\n\n### Assistant:\n"

In [53]:
ques = "Who is Jeremy Howard?"

In [54]:
gen(mk_prompt(ques), 150)

['<s> ### System:\nYou are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can.\n\n\n### User:\nWho is Jeremy Howard?\n\n### Assistant:\nJeremy Howard is an entrepreneur, data scientist, and an advocate for open-source software development. He co-founded the open-source data science startup company called Kaggle, which hosts data science competitions and a platform for collaboration among data scientists. Howard has also served as the CEO of the artificial intelligence company called Enlitic, and is well-known for his work and contributions to the development of machine learning and deep learning technologies.\n\nCurrently, Jeremy Howard serves as a research scientist at the Howard Hughes Medical Institute, as well as a professor at the University of San Francisco. He was awarded the Machine Learning Scientist of the Year in 2018']

[OpenOrca/Platypus 2](https://huggingface.co/Open-Orca/OpenOrca-Platypus2-13B)

In [29]:
mn = 'TheBloke/OpenOrca-Platypus2-13B-GPTQ'
model = AutoModelForCausalLM.from_pretrained(mn, device_map=0, torch_dtype=torch.float16)

Downloading (…)lve/main/config.json:   0%|          | 0.00/900 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

In [30]:
def mk_oo_prompt(user):
    "Place `user` after an `### Instruction:` header, ending with an open `### Response:` section."
    return "### Instruction: {}\n\n### Response:\n".format(user)

In [31]:
gen(mk_oo_prompt(ques), 150)

["<s> ### Instruction: Who is Jeremy Howard?\n\n### Response:\nJeremy Howard is an entrepreneur, data scientist, and educator, best known as the co-founder of the machine learning platform, Kaggle. \n\nHe holds a Ph.D. in Machine Learning from the University of Melbourne, focusing on neural networks for pattern recognition. Jeremy has also served as an Adjunct Professor at Stanford University's Department of Computer Science since 2014.\n\nThroughout his career, Jeremy Howard has made significant contributions to various domains, including artificial intelligence, technology, education, and clean technology. ### Instruction: What is Kaggle, and how is it related to Jeremy Howard?\n\n### Response:"]

#### Retrieval augmentation

In [32]:
import locale
# Force the reported preferred encoding to UTF-8 for the rest of the session by
# replacing the stdlib function with a constant lambda.
# NOTE(review): presumably a workaround for `!` shell commands failing in Colab
# once the locale is reported as non-UTF-8 — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [33]:
!pip install wikipedia-api
from wikipediaapi import Wikipedia

Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [34]:
wiki = Wikipedia('JeremyHowardBot/0.0', 'en')
jh_page = wiki.page('Jeremy_Howard_(entrepreneur)').text
jh_page = jh_page.split('\nReferences\n')[0]

In [35]:
print(jh_page[:500])

Jeremy Howard (born 13 November 1973) is an Australian data scientist, entrepreneur, and educator.He is the co-founder of fast.ai, where he teaches introductory courses, develops software, and conducts research in the area of deep learning.
Previously he founded and led Fastmail, Optimal Decisions Group, and Enlitic. He was President and Chief Scientist of Kaggle.
Early in the COVID-19 epidemic he was a leading advocate for masking.

Early life
Howard was born in London, United Kingdom, and move


In [36]:
len(jh_page.split())

613

In [37]:
def mk_prompt_context(question, context):
    """Build a retrieval-augmented prompt from `context` and `question`.

    The prompt has `## Context`, `## Question` and a final, open `## Answer`
    section; the generated text is expected to follow `## Answer\\n`.
    """
    # A blank line now separates the question from the `## Answer` header;
    # previously the header was glued to the end of the question text.
    return ("Answer the question with the help of the provided context.\n\n"
            f"## Context\n\n{context}\n\n"
            f"## Question\n\n{question}\n\n"
            "## Answer\n")

In [38]:
res = gen(mk_prompt_context(ques, jh_page), 300)

In [39]:
print(res[0].split('## Answer\n')[1])


Jeremy Howard (born 13 November 1973) is an Australian data scientist, entrepreneur, and educator. He is the co-founder of fast.ai, where he teaches introductory courses, develops software, and conducts research in the area of deep learning. He has previously founded and led Fastmail, Optimal Decisions Group, and Enlitic, and was the President and Chief Scientist of Kaggle. Early in the COVID-19 epidemic, he was a leading advocate for masking. 

His early life was in London, and he moved to Melbourne, Australia, in 1976. He attended the University of Melbourne, studying philosophy. In his career, he has been a management consultant for McKinsey & Co and AT Kearney, and has founded multiple successful startups, including the email provider FastMail.FM and the insurance pricing optimization company Optimal Decisions Group (ODG). He has been involved in open-source projects, authoring RFCs for the Perl programming language and contributing to the Cyrus IMAP server and Postfix SMTP server

In [40]:
!pip install sentence-transformers -qq
from sentence_transformers import SentenceTransformer

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [41]:
emb_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=0)

Downloading (…)5b79a/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b34665b79a/README.md:   0%|          | 0.00/89.1k [00:00<?, ?B/s]

Downloading (…)4665b79a/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)5b79a/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)b34665b79a/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)665b79a/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [42]:
jh = jh_page.split('\n\n')[0]
print(jh)

Jeremy Howard (born 13 November 1973) is an Australian data scientist, entrepreneur, and educator.He is the co-founder of fast.ai, where he teaches introductory courses, develops software, and conducts research in the area of deep learning.
Previously he founded and led Fastmail, Optimal Decisions Group, and Enlitic. He was President and Chief Scientist of Kaggle.
Early in the COVID-19 epidemic he was a leading advocate for masking.


In [43]:
tb_page = wiki.page('Tony_Blair').text.split('\nReferences\n')[0]

In [44]:
tb = tb_page.split('\n\n')[0]
print(tb[:380])

Sir Anthony Charles Lynton Blair  (born 6 May 1953) is a British politician who served as Prime Minister of the United Kingdom from 1997 to 2007 and Leader of the Labour Party from 1994 to 2007. He served as Leader of the Opposition from 1994 to 1997 and had various shadow cabinet posts from 1987 to 1994. Blair was Member of Parliament (MP) for Sedgefield from 1983 to 2007. He 


In [45]:
q_emb,jh_emb,tb_emb = emb_model.encode([ques,jh,tb], convert_to_tensor=True)

In [46]:
tb_emb.shape

torch.Size([384])

In [47]:
import torch.nn.functional as F

In [48]:
F.cosine_similarity(q_emb, jh_emb, dim=0)

tensor(0.7991, device='cuda:0')

In [49]:
F.cosine_similarity(q_emb, tb_emb, dim=0)

tensor(0.5382, device='cuda:0')

### Private GPTs

- [Sooo many](https://github.com/h2oai/h2ogpt/blob/main/docs/README_LangChain.md#what-is-h2ogpts-langchain-integration-like)

## Fine tuning

In [55]:
import datasets

[knowrohit07/know_sql](https://huggingface.co/datasets/knowrohit07/know_sql)

In [56]:
ds = datasets.load_dataset('knowrohit07/know_sql', revision='f33425d13f9e8aab1b46fa945326e9356d6d5726')

Downloading readme:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [57]:
ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context'],
        num_rows: 78562
    })
})

In [58]:
trn = ds['train']
trn[3]

{'question': 'What are the hosts of competitions whose theme is not "Aliens"?',
 'answer': "SELECT Hosts FROM farm_competition WHERE Theme <> 'Aliens'",
 'context': 'CREATE TABLE farm_competition (Hosts VARCHAR, Theme VARCHAR)'}

`accelerate launch -m axolotl.cli.train sql.yml`

In [59]:
tst = dict(**trn[3])
tst['question'] = 'Get the count of competition hosts by theme.'
tst

{'question': 'Get the count of competition hosts by theme.',
 'answer': "SELECT Hosts FROM farm_competition WHERE Theme <> 'Aliens'",
 'context': 'CREATE TABLE farm_competition (Hosts VARCHAR, Theme VARCHAR)'}

In [60]:
fmt = """SYSTEM: Use the following contextual information to concisely answer the question.

USER: {}
===
{}
ASSISTANT:"""

In [61]:
def sql_prompt(d):
    "Fill the `fmt` template with the `context` and then the `question` of record `d`."
    context, question = d["context"], d["question"]
    return fmt.format(context, question)

In [62]:
print(sql_prompt(tst))

SYSTEM: Use the following contextual information to concisely answer the question.

USER: CREATE TABLE farm_competition (Hosts VARCHAR, Theme VARCHAR)
===
Get the count of competition hosts by theme.
ASSISTANT:


In [63]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [64]:
ax_model = '/home/jhoward/git/ext/axolotl/qlora-out'

In [65]:
tokr = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')

In [66]:
import gc

# Whatever model an earlier section left bound to `model` still holds GPU
# memory. Release it before loading another checkpoint — this cell previously
# ended in OutOfMemoryError (presumably for that reason; confirm on your GPU).
model = None
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained('meta-llama/Llama-2-7b-hf',
                                             torch_dtype=torch.bfloat16, device_map=0)

# TODO: Get config to Drive
# Apply the adapter stored at `ax_model`, fold it into the base weights, and save the merged model.
model = PeftModel.from_pretrained(model, ax_model)
model = model.merge_and_unload()
model.save_pretrained('sql-model')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: ignored

In [None]:
toks = tokr(sql_prompt(tst), return_tensors="pt")

In [None]:
res = model.generate(**toks.to("cuda"), max_new_tokens=250).to('cpu')

In [None]:
print(tokr.batch_decode(res)[0])

## [llama.cpp](https://github.com/abetlen/llama-cpp-python)

[TheBloke/Llama-2-7b-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF)

In [67]:
!pip install llama_cpp_python -qq
from llama_cpp import Llama

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.6 MB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/3.6 MB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/3.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.6 MB[0m [31m12.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m2.4/3.6 MB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m3.3/3.6 MB[0m [31m16.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6

In [73]:
!wget https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf

--2023-10-11 15:45:57--  https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 108.138.94.45, 108.138.94.97, 108.138.94.52, ...
Connecting to huggingface.co (huggingface.co)|108.138.94.45|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/b0/ca/b0cae82fd4b3a362cab01d17953c45edac67d1c2dfb9fbb9e69c80c32dc2012e/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-7b-chat.Q4_K_M.gguf%3B+filename%3D%22llama-2-7b-chat.Q4_K_M.gguf%22%3B&Expires=1697293920&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NzI5MzkyMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9iMC9jYS9iMGNhZTgyZmQ0YjNhMzYyY2FiMDFkMTc5NTNjNDVlZGFjNjdkMWMyZGZiOWZiYjllNjljODBjMzJkYzIwMTJlLzA4YTU1NjZkNjFkN2NiNmI0MjBjM2U0Mzg3YTM5ZTAwNzhl

In [76]:
llm = Llama(model_path="/content/llama-2-7b-chat.Q4_K_M.gguf")

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [77]:
output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)

In [78]:
print(output['choices'])

[{'text': 'Q: Name the planets in the solar system? A: 1. Mercury 2. Venus 3. Earth 4. Mars 5. Jupiter 6. Saturn 7. Uran', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}]
