In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, TextStreamer
from optimum.intel import OVModelForCausalLM
import openvino as ov
import os
import nncf

# Hugging Face Hub id of the model to convert.
# Swap in the commented line below to use the base (non-instruct) variant.
model_id = "stabilityai/japanese-stablelm-instruct-gamma-7b"
# model_id = "stabilityai/japanese-stablelm-base-gamma-7b"

# The id has the form "<vendor>/<name>"; the name part is reused by later
# cells as the local directory that holds the converted model.
model_vendor, model_name = model_id.split("/")

# Device name handed to OVModelForCausalLM when the model is compiled.
device = "GPU"

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino




### Convert to an OpenVINO model, quantize the weights to INT4, and save it locally

In [2]:
# Export + INT4 weight compression is slow, so it is skipped when a converted
# model is already on disk. The guard checks for the saved IR file itself, not
# just its directory: if a previous run died while saving, an empty folder must
# not cause every later run to skip the conversion.
int4_model_dir = f'{model_name}/INT4'
int4_model_xml = f'{int4_model_dir}/openvino_model.xml'

if not os.path.exists(int4_model_xml):
    # Export the PyTorch checkpoint to OpenVINO without compiling it or
    # applying the default 8-bit weight compression.
    ov_model = OVModelForCausalLM.from_pretrained(
        model_id,
        export=True,
        compile=False,
        load_in_8bit=False,
        trust_remote_code=True,
    )
    # ratio=0.8: the NNCF statistics printed by this cell report 80% of the
    # ratio-defining layers at 4 bits and the remaining 20% at 8 bits.
    # NOTE(review): `_original_model` is a private attribute of the
    # optimum-intel wrapper - confirm it still exists when upgrading.
    compressed_model = nncf.compress_weights(
        ov_model.half()._original_model,
        mode=nncf.CompressWeightsMode.INT4_ASYM,
        group_size=128,
        ratio=0.8,
    )
    # exist_ok=True: the directory may be left over from an interrupted run.
    os.makedirs(int4_model_dir, exist_ok=True)
    ov.save_model(compressed_model, int4_model_xml)

This architecture : mistral was not validated, only :marian, gpt2, gpt-neo, gpt-bigcode, llama, opt, gpt-neox, blenderbot, bart, pegasus, codegen, bloom, blenderbot-small architectures were validated, use at your own risk.
Framework not specified. Using pt to export the model.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.2.1+cpu
Overriding 1 configuration item(s)
	- use_cache -> True
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
Exception ignored in: <finalize object at 0x20e83323c00; dead>
Traceback (most recent call last):
  File "C:\Python310\lib\weakref.py", line 591, in __call__
    return info.func(*info.args, **(info.kwargs or {}))
  File "C:\Python310\lib\tempfile.py", line 859, in _cleanup
    cls._rmtree(name, ignore_errors=ignore_errors)
  File "C:\Python310\lib\tempfile.py", line 855, in _rmtree
    _shutil.rmtree(name, onerror=onerror)
  File "C:\Python310\lib\shutil.py", line 749, in rmtree
    return _rmtree_unsafe(path, onerror)
  File "C:\Python310\lib\shutil.py", line 619, in _rmtree_unsafe
    o

Output()

INFO:nncf:Statistics of the bitwidth distribution:
+--------------+---------------------------+-----------------------------------+
| Num bits (N) | % all parameters (layers) |    % ratio-defining parameters    |
|              |                           |             (layers)              |
| 8            | 23% (85 / 226)            | 20% (83 / 224)                    |
+--------------+---------------------------+-----------------------------------+
| 4            | 77% (141 / 226)           | 80% (141 / 224)                   |
+--------------+---------------------------+-----------------------------------+


Output()

### Compile the OV model

In [3]:
# The tokenizer is loaded from the original Hub repository.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the INT4 model saved by the conversion cell and compile it for `device`.
# The model config is also taken from the Hub repository and passed explicitly.
ov_model = OVModelForCausalLM.from_pretrained(
    model_id=f"{model_name}/INT4",
    config=AutoConfig.from_pretrained(model_id),
    ov_config={
        "PERFORMANCE_HINT": "LATENCY",
        "NUM_STREAMS": "1",
        "CACHE_DIR": "./cache",
    },
    device=device,
)

Compiling the model to GPU ...


### Test the model

In [4]:
def build_prompt(user_query, inputs="", sep="\n\n### "):
    """Assemble the instruction-style prompt for a single request.

    The prompt is a fixed Japanese system message followed by one section per
    role, each introduced by ``sep``: the instruction (指示), an optional
    input (入力) and an empty response header (応答) for the model to complete.

    Args:
        user_query: The instruction text.
        inputs: Optional extra input; the 入力 section is omitted when this
            is empty.
        sep: Separator written before every section header.

    Returns:
        The complete prompt string.
    """
    header = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"

    sections = [("指示", ": \n" + user_query)]
    if inputs:
        sections.append(("入力", ": \n" + inputs))
    sections.append(("応答", ": "))

    return header + "".join(sep + role + body for role, body in sections)

# Single-turn test: read a question, build a prompt with no additional input,
# and stream the generated answer.
query = input('質問をいれてください:')
user_inputs = dict(user_query=query, inputs="")
prompt = build_prompt(**user_inputs)

print(f'** Prompt:\n{prompt}\n-------------------------')

# skip_prompt=True keeps the prompt itself out of the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
input_tokens = tokenizer(prompt, add_special_tokens=False, return_tensors='pt')

response = ov_model.generate(
    **input_tokens,
    streamer=streamer,
    max_new_tokens=300,
    num_return_sequences=1,
    # The EOS token is used for padding as well.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    # Sampling settings.
    do_sample=True,
    temperature=1.0,
    top_k=5,
    top_p=0.90,
    repetition_penalty=1.2,
)

質問をいれてください: 日本で一番速い電車の名前はなんですか？


** Prompt:
以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。

### 指示: 
日本で一番速い電車の名前はなんですか？

### 応答: 
-------------------------

それは新幹線だよ！


### Run the model in a Gradio chat UI

In [6]:
from threading import Event, Thread
from uuid import uuid4
from typing import List, Tuple
import torch
import gradio as gr
from transformers import (
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

# System preamble (Japanese) asking the assistant to be helpful, safe and
# honest. The backslashes after the opening quotes and before the closing
# quotes keep the literal free of a leading and a trailing newline.
# This value was previously overwritten with the placeholder string
# "start_message", which discarded the real preamble.
start_message = """\
設定: あなたは親切で、礼儀正しく、誠実なアシスタントです。 常に安全を保ちながら、できるだけ役立つように答えてください。 回答には、有害、非倫理的、人種差別的、性差別的、有毒、危険、または違法なコンテンツを含めてはいけません。 回答は社会的に偏見がなく、本質的に前向きなものであることを確認してください。
質問が意味をなさない場合、または事実に一貫性がない場合は、正しくないことに答えるのではなく、その理由を説明してください。 質問の答えがわからない場合は、誤った情報を共有しないでください。\
"""

# `model_name` is deliberately NOT rebound to `model_id` here: the conversion
# and compile cells derive the local "<model_name>/INT4" directory from it, so
# overwriting it would make re-running those cells point at a different path.

# Format applied to every finished turn when the history is rendered as text.
# It mirrors `current_message_template`, plus a trailing newline so that
# consecutive turns end up on separate lines.
history_template = "ユーザー: {user}\nシステム: {assistant}\n"
# Format applied to the turn currently being answered.
current_message_template = "ユーザー: {user}\nシステム: {assistant}"
# Extra keyword arguments for the tokenizer call that encodes rendered text.
tokenizer_kwargs = {"add_special_tokens": False}


# Sample questions offered in the UI. Each example is wrapped in its own
# one-element list (one row per example).
examples = [
    [question]
    for question in (
        "OpenVINOとは何ですか?",
        "沖縄県の名産品をいくつかおしえてください。",
        "明治時代の作家を５人教えてください。",
        "東北地方の県をすべて教えてください。",
        "スト２のリュウの必殺技の名前は？",
        "90年代に日本で売れたアーティストを５組教えてください。",
        "俳句のルールを教えてください。",
    )
]

# NOTE(review): the generate() calls in this notebook pass max_new_tokens=300
# directly and do not read this value.
max_new_tokens = 256

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is a stop id."""

    def __init__(self, token_ids):
        # Ids that are compared against the most recently generated token.
        self.token_ids = token_ids

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True if the last token of the first sequence is a stop id.

        Only ``input_ids[0]`` is inspected; ``scores`` and any extra keyword
        arguments are ignored.
        """
        return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)


def convert_history_to_token(history: List[Tuple[str, str]]):
    """Turn the chat history into model input token ids.

    Two prompt formats are supported, selected by the optional module-level
    ``history_template``:

    * not defined or ``None`` - the turns are converted to role/content
      messages (with ``start_message`` as the system message) and encoded
      with ``tokenizer.apply_chat_template``;
    * a format string - ``start_message`` is followed by every finished turn
      rendered with ``history_template`` and by the latest turn rendered with
      ``current_message_template``; the text is then encoded with
      ``tokenizer`` using ``tokenizer_kwargs``.

    Args:
        history: ``(user_message, assistant_message)`` pairs, oldest first.
            The last pair is the turn that is currently being answered.

    Returns:
        The result of ``apply_chat_template`` in the first case, or the
        ``input_ids`` of the encoded text in the second.

    Raises:
        IndexError: If ``history`` is empty while a template is in use.
    """
    # Looked up dynamically so that a notebook which never defines
    # `history_template` takes the chat-template path instead of failing
    # with a NameError.
    template = globals().get("history_template")

    if template is None:
        messages = [{"role": "system", "content": start_message}]
        last_idx = len(history) - 1
        for idx, (user_msg, model_msg) in enumerate(history):
            # The pending turn has no answer yet: add the question and stop.
            if idx == last_idx and not model_msg:
                messages.append({"role": "user", "content": user_msg})
                break
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if model_msg:
                messages.append({"role": "assistant", "content": model_msg})
        # Uses the notebook's `tokenizer`; the previous code referenced an
        # undefined name `tok` here.
        return tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
        )

    # Every turn except the last one is rendered with the history template.
    finished_turns = "".join(
        template.format(num=turn_idx, user=turn[0], assistant=turn[1])
        for turn_idx, turn in enumerate(history[:-1])
    )
    # The last turn is rendered with the current-message template.
    current_turn = current_message_template.format(
        num=len(history) + 1,
        user=history[-1][0],
        assistant=history[-1][1],
    )
    text = start_message + finished_turns + current_turn
    return tokenizer(text, return_tensors="pt", **tokenizer_kwargs).input_ids


def user(message, history):
    """Append a new user turn to the chat history.

    Args:
        message: Text submitted by the user.
        history: Current list of ``[user_message, assistant_message]`` turns.

    Returns:
        A tuple of an empty string and a new history list with
        ``[message, ""]`` appended. ``history`` itself is not modified.
    """
    pending_turn = [message, ""]
    updated_history = history + [pending_turn]
    return "", updated_history


def bot(history, temperature, top_p, top_k, repetition_penalty, conversation_id):
    """Stream the model's reply to the latest user message into ``history``.

    Only the last user message is sent to the model; earlier turns are not
    part of the prompt. Generation runs in a worker thread while this
    generator forwards the streamed text.

    Args:
        history: List of ``[user_message, assistant_message]`` turns. The
            assistant slot of the last turn is filled in place.
        temperature: Sampling temperature. A value of 0 switches to
            non-sampling generation, because sampling needs a strictly
            positive temperature.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k shortlist size (used only when sampling).
        repetition_penalty: Repetition penalty passed to ``generate``.
        conversation_id: Accepted for the UI wiring; not used here.

    Yields:
        ``history`` (the same list object) after each streamed chunk.
    """
    def build_prompt(user_query, inputs="", sep="\n\n### "):
        """Build the instruction prompt without a system preamble."""
        sections = [("指示", ": \n" + user_query)]
        if inputs:
            sections.append(("入力", ": \n" + inputs))
        sections.append(("応答", ": "))
        return "".join(sep + role + body for role, body in sections)

    # Infer with prompt without any additional input
    prompt = build_prompt(user_query=history[-1][0], inputs="")

    input_tokens = tokenizer(prompt, return_tensors='pt', add_special_tokens=False)
    streamer = TextIteratorStreamer(
        tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True
    )

    # The Temperature slider can be set to 0, which is not a valid sampling
    # temperature. Sample only for positive values.
    do_sample = temperature > 0.0
    generate_kwargs = dict(
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=300,
        num_return_sequences=1,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        streamer=streamer,
    )
    if do_sample:
        generate_kwargs.update(
            temperature=temperature,
            top_k=int(top_k),  # the slider may deliver a float
            top_p=top_p,
        )

    def run_generation():
        try:
            ov_model.generate(**input_tokens, **generate_kwargs)
        except Exception:
            # Close the stream so the consumer loop below ends right away
            # instead of waiting for the streamer timeout, then let the
            # error surface from the worker thread.
            streamer.end()
            raise

    def text_processor(partial_text, new_text):
        # Drop the "システム:" marker if the model emits it.
        return partial_text + new_text.replace("システム:", "")

    worker = Thread(target=run_generation)
    worker.start()

    partial_text = ""
    for new_text in streamer:
        partial_text = text_processor(partial_text, new_text)
        history[-1][1] = partial_text
        yield history

    # The stream is exhausted, so generation is finished (or has failed).
    worker.join()


def get_uuid():
    """Return a freshly generated random UUID (version 4) as a string."""
    identifier = uuid4()
    return str(identifier)


# Chat UI: a chatbot pane, a message box with Submit/Clear buttons, a
# collapsible panel of generation settings and a list of example questions.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    # Per-session state initialised by calling get_uuid.
    conversation_id = gr.State(get_uuid)
    gr.Markdown(
        f"""<h1><center>OpenVINO {model_id} Chatbot</center></h1>""")
    chatbot = gr.Chatbot(height=500)
    with gr.Row():
        with gr.Column():
            msg = gr.Textbox(
                label="Chat Message Box",
                placeholder="質問を書いて 'Submit' ボタンを押してください",
                show_label=False,
                container=False,
            )
        with gr.Column():
            with gr.Row():
                submit = gr.Button("Submit")
                clear = gr.Button("Clear")
    with gr.Row():
        with gr.Accordion("Advanced Options:", open=False):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        temperature = gr.Slider(
                            label="Temperature",
                            value=0.1,
                            minimum=0.0,
                            maximum=1.0,
                            step=0.1,
                            interactive=True,
                            info="Higher values produce more diverse outputs",
                        )
                with gr.Column():
                    with gr.Row():
                        top_p = gr.Slider(
                            label="Top-p (nucleus sampling)",
                            value=1.0,
                            minimum=0.0,
                            maximum=1,
                            step=0.01,
                            interactive=True,
                            info=(
                                "Sample from the smallest possible set of tokens whose cumulative probability "
                                "exceeds top_p. Set to 1 to disable and sample from all tokens."
                            ),
                        )
                with gr.Column():
                    with gr.Row():
                        top_k = gr.Slider(
                            label="Top-k",
                            value=50,
                            minimum=0.0,
                            maximum=200,
                            step=1,
                            interactive=True,
                            info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",
                        )
                with gr.Column():
                    with gr.Row():
                        repetition_penalty = gr.Slider(
                            label="Repetition Penalty",
                            value=1.2,
                            minimum=1.0,
                            maximum=2.0,
                            step=0.1,
                            interactive=True,
                            info="Penalize repetition — 1.0 to disable.",
                        )
    gr.Examples(
        examples, inputs=msg, label="以下の例文をクリックして 'Submit' ボタンを押してください"
    )

    # Components passed to `bot`, in the order of its parameters.
    bot_inputs = [
        chatbot,
        temperature,
        top_p,
        top_k,
        repetition_penalty,
        conversation_id,
    ]

    def _wire_chat(trigger):
        """Attach the two-step chat chain to an event trigger.

        Step 1 (`user`, unqueued) moves the typed message into the chat
        history and clears the textbox; step 2 (`bot`, queued) streams the
        reply into the chatbot.
        """
        return trigger(
            fn=user,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot],
            queue=False,
        ).then(
            fn=bot,
            inputs=bot_inputs,
            outputs=chatbot,
            queue=True,
        )

    # Pressing Enter in the textbox and clicking Submit behave identically.
    submit_event = _wire_chat(msg.submit)
    submit_click_event = _wire_chat(submit.click)

    # Clear resets the chatbot to an empty conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


