In [None]:
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################

# Llama-2

The following example will show how to run `Llama-2` with `MIGraphX`.

Install the required dependencies.

In [None]:
# Install dependencies: accelerate, huggingface_hub (with the "cli" extra),
# optimum (with the "onnxruntime" extra) and transformers.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install accelerate huggingface_hub[cli] optimum[onnxruntime] transformers

We will use `optimum` to generate the ONNX files.
But first, we need to log in to Hugging Face to get access to the model.

In [None]:
# Replace YOUR_TOKEN with your own access token before running this cell.
# Please be careful and don't publish your token anywhere: remove it from this
# cell again before sharing or committing the notebook.
!huggingface-cli login --token YOUR_TOKEN # from https://huggingface.co/settings/tokens

Now we can export the models.

In [None]:
# Export meta-llama/Llama-2-7b-chat-hf to ONNX for the text-generation task.
# The output directory (models/llama-2-7b-chat-hf) is the one the model-loading
# helper further down reads from, so keep the two in sync.
!optimum-cli export onnx --model meta-llama/Llama-2-7b-chat-hf models/llama-2-7b-chat-hf --task text-generation --framework pt --library transformers --no-post-process

Next, it is time to load these models with Python.

First, we make sure that the MIGraphX module can be found on the Python path.

In [None]:
import sys
# Directory expected to contain the MIGraphX Python module; change it if your
# MIGraphX lives somewhere else (for example in a local build tree).
mgx_lib_path = "/opt/rocm/lib/" # or "/code/AMDMIGraphX/build/lib/"
# Append only when missing, so re-running this cell does not add duplicates.
if mgx_lib_path not in sys.path:
    sys.path.append(mgx_lib_path)
# Must come after the sys.path update above.
import migraphx as mgx

Next, a helper method to load and cache the models.

This will use the `models/llama-2-7b-chat-hf` path. If you changed it, make sure to update it here as well.

In [None]:
import os
# helper for model loading
def load_mgx_model(max_seq_len, shapes, model_dir="models/llama-2-7b-chat-hf"):
    """Load the MIGraphX program for the exported model, compiling it if needed.

    A previously compiled program stored as ``<model_dir>/model-<max_seq_len>.mxr``
    is loaded directly when it exists. Otherwise ``<model_dir>/model.onnx`` is
    parsed with the given input shapes, compiled for the GPU target and saved
    to that ``.mxr`` path so that later calls can skip the compilation.

    Args:
        max_seq_len: Sequence length the program is built for. Within this
            function it is only used for the cache file name and log output.
        shapes: Mapping of input name to dimensions, forwarded to
            ``mgx.parse_onnx`` as ``map_input_dims``.
        model_dir: Directory that holds ``model.onnx`` and the ``.mxr`` caches.

    Returns:
        The loaded or freshly compiled MIGraphX program.

    Raises:
        FileNotFoundError: If neither the cached ``.mxr`` file nor the
            ``.onnx`` file exists.
    """
    file = f"{model_dir}/model"
    mxr_path = f"{file}-{max_seq_len}.mxr"
    onnx_path = f"{file}.onnx"
    print(f"Loading {max_seq_len} seq-len version model from {file}")

    if os.path.isfile(mxr_path):
        print("Found mxr, loading it...")
        return mgx.load(mxr_path, format="msgpack")

    if not os.path.isfile(onnx_path):
        # Raise a descriptive error instead of calling sys.exit(1): inside a
        # notebook SystemExit hides what went wrong and cannot be handled
        # sensibly by the calling cell.
        raise FileNotFoundError(
            f"No model found at {mxr_path} or {onnx_path}. "
            "Please download it and re-try.")

    print("Parsing from onnx file...")
    model = mgx.parse_onnx(onnx_path, map_input_dims=shapes)
    model.compile(mgx.get_target("gpu"))
    # Cache the compiled program so the next load is fast.
    print("Saving model to mxr file...")
    mgx.save(model, mxr_path, format="msgpack")
    return model

With that, we can load the models. This could take several minutes.

We set the maximum sequence length at load time. If you change it, please reload the model as well.

In [None]:
# Sequence length the decoder is loaded for; every model input below gets the
# same [1, max_seq_len] dimensions.
max_seq_len = 1024
decoder_model = load_mgx_model(
    max_seq_len,
    {
        input_name: [1, max_seq_len]
        for input_name in ("input_ids", "attention_mask", "position_ids")
    },
)

Import the remaining packages.

In [None]:
from transformers import LlamaTokenizer
import numpy as np

Time to load the tokenizer from the original source.

In [None]:
# Hugging Face repository id of the model; the same id was used for the ONNX
# export earlier in this notebook.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Tokenizer shared by the tokenize/decode helpers defined below.
tokenizer = LlamaTokenizer.from_pretrained(model_id)

Next, we will define all the steps one by one, to make the last step short and simple.

The first step will be to tokenize the user prompt.

In [None]:
def tokenize(prompt):
    """Encode ``prompt`` with the notebook's tokenizer and return its ``input_ids``.

    The tokenizer is asked for NumPy tensors (``return_tensors="np"``).
    """
    encoded = tokenizer(prompt, return_tensors="np")
    return encoded.input_ids

The next step will be to convert it to match the model input.

We will generate the attention mask and positions as well.

In [None]:
def get_input_features_for_input_ids(input_ids):
    """Pad the prompt tokens to ``max_seq_len`` and build the other model inputs.

    Args:
        input_ids: 2-D array-like holding a single row of token ids,
            i.e. shape ``(1, n)`` with ``n <= max_seq_len``.

    Returns:
        A tuple ``(input_ids, attention_mask, position_ids)`` of ``int64``
        arrays, each with shape ``(1, max_seq_len)``:
        ``input_ids`` is the prompt followed by zero padding,
        ``attention_mask`` is 1 for prompt positions and 0 for padding, and
        ``position_ids`` is ``0 .. max_seq_len - 1``.

    Raises:
        ValueError: If the prompt has more tokens than ``max_seq_len``.
    """
    input_ids_len = len(input_ids[0])
    padding_len = max_seq_len - input_ids_len
    if padding_len < 0:
        # Fail with an actionable message instead of numpy's
        # "negative dimensions are not allowed".
        raise ValueError(
            f"Prompt has {input_ids_len} tokens, which exceeds "
            f"max_seq_len={max_seq_len}")

    # Build the padding as int64 right away to avoid a float64 round trip.
    padding = np.zeros((1, padding_len), dtype=np.int64)
    input_ids = np.hstack([input_ids, padding]).astype(np.int64)

    # 0 masked | 1 un-masked
    attention_mask = np.zeros((1, max_seq_len), dtype=np.int64)
    attention_mask[0, :input_ids_len] = 1

    position_ids = np.arange(max_seq_len, dtype=np.int64)[np.newaxis]

    return (input_ids, attention_mask, position_ids)

We will use these in the decoding step.

In [None]:
def decode_step(input_ids, attention_mask, position_ids):
    """Run the decoder once and return its first output as a NumPy array.

    The three arguments are passed to ``decoder_model.run`` under the input
    names ``input_ids``, ``attention_mask`` and ``position_ids``.
    """
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "position_ids": position_ids,
    }
    outputs = decoder_model.run(model_inputs)
    # Only the first output is used (presumably the logits; confirm against
    # the exported model's outputs).
    return np.array(outputs[0])

The generated tokens will be decoded with the tokenizer.

In [None]:
def decode_tokens(generated_tokens):
    """Turn a sequence of token ids back into text, skipping special tokens."""
    # tokenizer.decode already returns one string, so joining its characters
    # again is unnecessary.
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

Finally the text generation part.

With each decoding step, we will get the scores (logits) for the next token. We greedily pick the best match, add it to the decoded tokens, and unmask it.

If the token is the end-of-sequence token, the generation is finished.

In [None]:
from IPython.display import clear_output

def generate(input_ids):
    """Greedily extend the prompt token by token, printing the text so far.

    The prompt is padded to ``max_seq_len``. At each step the decoder is run
    on the whole buffer, the highest-scoring token at the current timestep is
    written to the next position, and that position is un-masked. The cell
    output is cleared and the decoded text re-printed after every token.
    Generation stops at the end-of-sequence token or when the buffer is full.

    Args:
        input_ids: Token ids of the prompt with shape ``(1, n)``, as returned
            by ``tokenize``.

    Returns:
        None. The generated text is only printed.
    """
    start_timestep = len(input_ids[0]) - 1
    input_ids, attention_mask, position_ids = get_input_features_for_input_ids(
        input_ids)

    # Every step writes its token to index `timestep + 1`, so the last usable
    # timestep is `max_seq_len - 2`. Iterating up to `max_seq_len - 1` would
    # index one past the end of the buffer when no EOS token shows up.
    for timestep in range(start_timestep, max_seq_len - 1):
        # get logits for the current timestep
        # (assumes decode_step returns an array indexed as
        # [batch][position][vocab]; confirm against the model output)
        logits = decode_step(input_ids, attention_mask, position_ids)
        # greedily get the highest probable token
        new_token = np.argmax(logits[0][timestep])

        # add it to the tokens and unmask it
        input_ids[0][timestep + 1] = new_token
        attention_mask[0][timestep + 1] = 1

        # re-decode everything up to and including the new token
        decoded_tokens = decode_tokens(input_ids[0][:timestep + 2])
        clear_output(wait=True)
        print(decoded_tokens)

        if new_token == tokenizer.eos_token_id:
            break

And now, to put everything together and run the whole pipeline:

In [None]:
# Example prompt; replace it with your own text.
prompt = "Where is Szeged?"
# Convert the prompt into token ids for the model.
input_ids = tokenize(prompt)
# Prints the generated text, refreshing the output after every new token.
generate(input_ids)