In [1]:
!pip -q install transformers
!pip -q install sentencepiece
!pip -q install accelerate

In [2]:
import torch
# Show the cuDNN version reported by this PyTorch installation.
print(
    "cuDNN version: ",
    torch.backends.cudnn.version(),
)


cuDNN version:  8500


In [2]:
!pip list

Package                       Version
----------------------------- ----------
accelerate                    0.23.0
asttokens                     2.4.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.5
boltons                       23.0.0
brotlipy                      0.7.0
certifi                       2023.7.22
cffi                          1.15.1
charset-normalizer            2.0.4
cmake                         3.27.6
comm                          0.1.4
conda                         23.7.4
conda-content-trust           0.1.3
conda-libmamba-solver         23.5.0
conda-package-handling        2.1.0
conda_package_streaming       0.8.0
cryptography                  39.0.1
debugpy                       1.6.7
decorator                     5.1.1
exceptiongroup                1.1.3
executing                     1.2.0
filelock                      3.12.4
fsspec                        2023.9.2
huggingface-hub               0.16.4
idna                          3.4
importl

In [3]:
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Hugging Face identifier of the checkpoint used by both the tokenizer and the model.
model_path = 'psmathur/orca_mini_3b'

# Tokenizer matching the checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_path)

# Load the weights in float16 and let `device_map='auto'` choose their placement.
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map='auto',
)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Loading checkpoint shards: 100%|██████████| 3/3 [00:20<00:00,  6.87s/it]


In [4]:
def generate_text(system, instruction, input=None, max_new_tokens=1024,
                  temperature=0.7, top_p=1.0, top_k=50):
    """Sample a completion for an orca-mini style prompt.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        system: Text placed under the ``### System:`` header.
        instruction: Text placed under the ``### User:`` header.
        input: Optional extra context; when truthy it is added under an
            ``### Input:`` header. (The name shadows the builtin but is kept
            so existing keyword callers keep working.)
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k cutoff forwarded to ``model.generate``.

    Returns:
        The decoded completion (prompt tokens stripped) prefixed with
        ``'[!] Response: '``.
    """
    # Assemble the prompt section by section; the "Input" section is optional.
    sections = [f"### System:\n{system}", f"### User:\n{instruction}"]
    if input:
        sections.append(f"### Input:\n{input}")
    prompt = "\n\n".join(sections) + "\n\n### Response:\n"

    # Add a batch dimension and move the ids to the device the model reports,
    # instead of assuming a CUDA device is present.
    input_ids = torch.LongTensor(tokenizer.encode(prompt)).unsqueeze(0)
    input_ids = input_ids.to(model.device)
    prompt_len = input_ids.shape[1]

    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            # Same budget as the former `max_length = prompt_len + 1024`.
            max_new_tokens=max_new_tokens,
            use_cache=True,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            top_k=top_k,
        )

    # The returned sequence starts with the prompt tokens; keep only what follows.
    completion_ids = generated[0][prompt_len:]
    text = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return f'[!] Response: {text}'



In [6]:
# Sample test instruction, in the style used by YouTuber Sam Witteveen
# (https://www.youtube.com/@samwitteveenai).
system = 'You are AI helper'
instruction = 'So is namami gange successful ? '

# No extra "Input" context is supplied, so only the system and user sections are sent.
print(
    generate_text(system=system, instruction=instruction)
)

[!] Response: Namami Gange is a government initiative aimed at cleaning the holy river Ganges and its tributaries in India. The initiative involves various stakeholders such as the government, non-governmental organizations, and local communities. As of now, the initiative has been successful in implementing various measures such as building sewage treatment plants, constructing riverfront parks, and promoting water conservation practices to reduce the pollution level of the Ganges river. However, there still remains a long way to go in terms of achieving the ultimate goal of clean and sustainable Ganges river.


In [7]:
!pip install ctransformers[cuda]>=0.2.24

In [10]:
from ctransformers import AutoModelForCausalLM

# `gpu_layers` is the number of layers to offload to the GPU; use 0 when no GPU
# acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q5_K_M.gguf",
    model_type="mistral",
    gpu_layers=150,
)



Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 10922.67it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 14315.03it/s]


In [11]:
# Send a plain-text question to `llm` and print the completion it returns.
print(
    llm("what is namami gange project objective ? answer :")
)

 The Namami Ganga Project is a national program launched by the Government of India in 2015 with the aim of cleaning and rejuvenating the River Ganges, also known as the "Mother River" of India. The project's primary objectives are:

1. To restore the ecological balance of the river and its tributaries by removing pollution and waste.
2. To promote sustainable development along the banks of the river through the creation of new job opportunities, economic growth, and improved infrastructure.
3. To enhance the cultural and spiritual significance of the River Ganges by promoting cleanliness and respect for its waters.
4. To foster a sense of national pride and identity among Indians by revitalizing this important natural resource.
5. To promote international cooperation and partnership in conservation efforts related to the river.
