In [2]:
# Check that a GPU is attached to this runtime: nvidia-smi prints the driver/CUDA versions and the visible devices
!nvidia-smi

# Print the version of the CUDA compiler (nvcc) installed in this runtime
!nvcc --version

Wed Apr  3 11:50:09 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Installing required libraries

In [3]:
# Set Arguments to run the model in GPU
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python numpy --force-reinstall --upgrade --no-cache-dir --verbose

# Install our libraries
!pip install huggingface_hub
!pip install llama-cpp-python
!pip install numpy


Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.58.tar.gz (37.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command pip subprocess to install build dependencies
  Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
  Collecting scikit-build-core[pyproject]>=0.5.1
    Downloading scikit_build_core-0.8.2-py3-none-any.whl (140 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 140.5/140.5 kB 1.6 MB/s eta 0:00:00
  Collecting exceptiongroup (from scikit-build-core[pyproject]>=0.5.1)
    Downloading exceptiongroup-1.2.0-py3-none-any.whl (16 kB)
  Collecting packaging>=20.9 (from scikit-build-core[pyproject]>=0.5.1)
    Downloading packaging-24.0-py3-none-any.whl (53 kB)
       ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.5/53.5 kB 5.0 MB/s eta 0:00:00
  Collecting tomli>=1.1

### Importing our libraries

In [4]:
# Import from our libraries
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

### Downloading the model

In [5]:
# Hugging Face repository and the specific GGUF file to fetch from it
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q8_0.gguf"

# Fetch the file and keep the local path returned by hf_hub_download
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

print(f"Path: {model_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


llama-2-13b-chat.Q8_0.gguf:   0%|          | 0.00/13.8G [00:00<?, ?B/s]

Path: /root/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/4458acc949de0a9914c3eab623904d4fe999050a/llama-2-13b-chat.Q8_0.gguf


### Load & Set Up the LLM Model

In [6]:
# Options passed to the Llama constructor, collected in one place for easy tuning.
# NOTE(review): n_ctx is not set here, so the library's default context size
# applies — confirm it is large enough for the expected prompt + response length.
llama_options = {
    "model_path": model_path,
    "n_gpu_layers": -1,
    "n_batch": 512,
    "n_threads": 2,
}

# Instantiate the model from the downloaded GGUF file
model = Llama(**llama_options)

print(model)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /root/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GGUF/snapshots/4458acc949de0a9914c3eab623904d4fe999050a/llama-2-13b-chat.Q8_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_co

<llama_cpp.llama.Llama object at 0x7d10d42702b0>


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'llama.embedding_length': '5120', 'llama.feed_forward_length': '13824', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'llama.attention.head_count': '40', 'tokenizer.ggml.bos_token_id': '1', 'llama.block_count': '40', 'llama.attention.head_count_kv': '40', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '7'}
Using fallback chat format: None


### Prompting the Model & Getting Output

In [15]:
# Ask the user for a prompt (input() already returns a str, so no cast is needed)
prompt = input("Write a Prompt: ")
prompt_template=f'''SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully.

USER: {prompt}

ASSISTANT:
'''

# Get a completion from the model.
# max_tokens is only an upper bound on the number of generated tokens.
# NOTE(review): generation presumably also stops when the model's context window
# is full — confirm against the n_ctx used when the model was created.
response = model(
    prompt=prompt_template,
    max_tokens=10000,
    temperature=0.5,
    top_p=0.95,
    top_k=150,
)

# The completion result holds a list of choices; only the first one is used.
choice = response["choices"][0]

# Display the output of our LLM model
print("\n\nResponse:")
print(choice["text"])

# Tell the reader when the answer was cut off by the token limit rather than
# finished by the model, instead of silently printing a truncated response.
if choice.get("finish_reason") == "length":
    print("\n[Note: the response was truncated because the token limit was reached.]")

Write a Prompt: short USA history?


Llama.generate: prefix-match hit

llama_print_timings:        load time =     798.12 ms
llama_print_timings:      sample time =     292.04 ms /   475 runs   (    0.61 ms per token,  1626.47 tokens per second)
llama_print_timings: prompt eval time =     327.44 ms /    10 tokens (   32.74 ms per token,    30.54 tokens per second)
llama_print_timings:        eval time =   33376.42 ms /   474 runs   (   70.41 ms per token,    14.20 tokens per second)
llama_print_timings:       total time =   35555.33 ms /   484 tokens




Response:
Certainly! Here's a brief overview of the history of the United States:

1. Pre-Columbian Era (before 1492): Indigenous peoples inhabited the land that is now the United States for thousands of years. They developed complex societies, cultures, and traditions.
2. European Exploration and Colonization (1492-1776): European explorers, led by Christopher Columbus, arrived in the Americas in 1492. Over time, European powers such as Spain, England, France, and the Netherlands established colonies on the eastern coast of North America. These colonies developed into 13 independent states that would eventually form the United States of America.
3. American Revolution (1775-1783): Tensions between the colonies and Great Britain escalated into the American Revolutionary War, which ended with the signing of the Treaty of Paris in 1783. This treaty recognized the independence of the United States.
4. Early National Period (1783-1828): Following independence, the new nation faced chall