In [36]:
# Intro to llama 2 models
#
# Transformer experiments
# Inspiration from Huggingfaces site,
# and inspiration from Yash Agrawals, Oppasource, github code
# https://github.com/oppasource/ycopie/blob/main/LLM_Series/Runnung_Llama_2_7b_locally/Llama_2_7b_local.ipynb

# February 26, 2024. Sila

In [2]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
!pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
# Access on Meta
# https://llama.meta.com/llama-downloads/
#
# Access on Huggingface
# https://huggingface.co/meta-llama/Llama-2-7b-hf
#
# Your request to access this repo has been successfully submitted, and is pending a review from the repo's authors.

# Still: Following code from:
# https://github.com/oppasource/ycopie/blob/main/LLM_Series/Runnung_Llama_2_7b_locally/Llama_2_7b_local.ipynb

# In order to have access to the gated model.
# And not receive error like:
# GatedRepoError                            Traceback (most recent call last)

In [5]:
!ls

sample_data


In [6]:
!pip install accelerate



In [7]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    cache_dir="/data/sila/base_models",
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf",
                                           cache_dir="/data/sila/base_models")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
inputs = tokenizer("She is", return_tensors="pt").to(device)

In [9]:
inputs

{'input_ids': tensor([[   1, 2296,  338]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1]], device='cuda:0')}

In [10]:
outputs = model.generate(**inputs, max_new_tokens=10)

In [11]:
outputs

tensor([[    1,  2296,   338,   263,  9560,  6114, 29889,  2296,   338,   884,
           263,  5637, 29892]], device='cuda:0')

In [12]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [13]:
response

'She is a beautiful woman. She is also a mother,'

In [14]:
def get_llama2_reponse(prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature= 0.00001)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [15]:
prompt = "She is"
get_llama2_reponse(prompt, max_new_tokens=50)

'She is a 2016 graduate of the University of North Carolina at Chapel Hill, where she majored in English and minored in Creative Writing. She is currently a graduate student at the University of North Carolina at Chapel Hill,'

In [17]:
# That answer in cell above took 10 minutes to calculate...
# With GPU accelaration on colab

In [18]:
prompt = "Q:What is the population of Aarhus A:"
get_llama2_reponse(prompt, max_new_tokens=10)

'Q:What is the population of Aarhus A:The population of Aarhus is 33'

In [19]:
# Got the answer after 1 min 45 sec. And well...

In [20]:
prompt = "Q:What is the population of Aarhus A:"
get_llama2_reponse(prompt, max_new_tokens=20)

'Q:What is the population of Aarhus A:The population of Aarhus is 330,000.\nQ:What'

In [21]:
# Much better. Took 3 min and 25 sec.

In [22]:
prompt='''python code to calculate first 10 prime numbers'''
print(get_llama2_reponse(prompt, max_new_tokens=100))

python code to calculate first 10 prime numbers

import math

def is_prime(n):
    if n == 1:
        return False
    if n % 2 == 0:
        return False
    if n % 3 == 0:
        return False
    if n % 5 == 0:
        return False
    if n % 7 == 0:
        return False
    if n % 11 == 0:
        return


In [25]:
# Took 17 minutes. Well, ok. But a little weird code, I would say.

In [24]:
prompt='''python metod to decide whether a number is prime'''
print(get_llama2_reponse(prompt, max_new_tokens=100))

python metod to decide whether a number is prime or not

def is_prime(n):
    if n == 1:
        return False
    elif n == 2:
        return True
    else:
        for i in range(3, int(math.sqrt(n))+1, 2):
            if n % i == 0:
                return False
        return True

print(is_prime(10))
\end{code}

Comment


In [27]:
# Took 18 minutes. More as expected. Great.

In [30]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# hardware on colab
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-93ad822d-8532-9707-3ddd-ed5aaf32eb6f)


In [31]:
# Tesla T4 price: 19.438,00 kr. Or $1.500 dollars.

In [34]:
prompt='''how does llama compare to gpt 3.5'''
print(get_llama2_reponse(prompt, max_new_tokens=50))

how does llama compare to gpt 3.5
The LLMs are trained on large datasets of text, and they can generate text that is similar to human-written text.
The LLMs are trained on large datasets of text, and they can generate text that is similar to human
