In [None]:
""" 必要なPythonライブラリをインストール """
# https://qiita.com/bostonchou/items/bf4a34dcbaf45828f886
!pip install bitsandbytes==0.43.0
!pip install datasets==2.10.1
!pip install transformers==4.38.2
!pip install peft==0.9.0
!pip install sentencepiece==0.1.99
!pip install -U accelerate==0.28.0
!pip install colorama==0.4.6

# 一般的な処理用
import os
import sys
import json
import warnings
import logging
warnings.filterwarnings("ignore")

# 機械学習・深層学習用
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset, load_from_disk
import transformers, datasets
from peft import PeftModel
from colorama import *

# PEFT (Parameter-Efficient Fine-Tuning) 用
from tqdm import tqdm
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import GenerationConfig
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training
)


In [None]:
""" Alpaca日本語データセットをクローン """
!git clone https://github.com/shi3z/alpaca_ja.git


In [None]:
# Fix the random seed and put cuDNN into deterministic mode so that repeated
# runs of the notebook draw the same random numbers.
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Seed every visible GPU as well when CUDA is usable.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


In [None]:
# Build one tokenized training example
def generate_training_data(data_point):
    """
    Convert one instruction-tuning record into token-level training features.

    Relies on the module-level names `tokenizer` and `CUTOFF_LEN`.

    (1) Arguments:
        - data_point: dict with the str fields "instruction", "input" and "output"

    (2) Returns:
        - a dict of three equally long lists (CUTOFF_LEN entries each):
            "input_ids":      token ids of prompt + answer, padded/truncated by the tokenizer
            "labels":         the answer token ids; every prompt position and every
                              padding position holds -100 (the ignore index of the
                              Hugging Face causal-LM loss) so only the answer is learned
            "attention_mask": the tokenizer's mask (1 = real token, 0 = padding)

    (3) Example:
        - Given a dict data_point_1 with the three fields above:
            generate_training_data(data_point_1)

    NOTE(review): the [INST]/<<SYS>> template and the literal "</s>" follow the
    Llama-2 chat convention; whether the tokenizer in use maps "</s>" to its EOS
    token is not visible from this function - confirm against the loaded model.
    """
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
You are a helpful assistant. あなたは役に立つアシスタント。
<</SYS>>

{data_point["instruction"]}
{data_point["input"]}
[/INST]"""
    # Measure the prompt WITHOUT padding. Padding to max_length here would make
    # the measured length equal CUTOFF_LEN for every sample, which in turn masks
    # every label with -100 and leaves nothing to train on.
    prompt_ids = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN + 1,
    )["input_ids"]
    # A tokenizer configured to append EOS adds one trailing token that is not
    # part of the prompt prefix inside the full sequence; do not count it.
    if prompt_ids and prompt_ids[-1] == tokenizer.eos_token_id:
        prompt_ids = prompt_ids[:-1]
    len_user_prompt_tokens = len(prompt_ids)

    # Tokenize prompt + answer, padded to a fixed length; the final position is
    # dropped so that every returned list has exactly CUTOFF_LEN entries.
    encoded = tokenizer(
        prompt + " " + data_point["output"] + "</s>",
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )
    full_tokens = encoded["input_ids"][:-1]
    attention_mask = encoded["attention_mask"][:-1]

    # Keep a label only for real (non-padding) tokens that come after the prompt.
    # Counting real tokens via the attention mask works for either padding side
    # and does not depend on the pad id (which may equal the EOS id).
    labels = []
    real_seen = 0
    for token_id, is_real in zip(full_tokens, attention_mask):
        if is_real and real_seen >= len_user_prompt_tokens:
            labels.append(token_id)
        else:
            labels.append(-100)
        real_seen += 1 if is_real else 0

    return {
        "input_ids": full_tokens,
        "labels": labels,
        "attention_mask": attention_mask,
    }

# Generate a response for one instruction and return it
def evaluate(instruction, generation_config, max_len, input="", verbose=True):
    """
    Run the module-level `model` on a chat-style prompt and return its reply.

    Relies on the module-level names `tokenizer` and `model`.

    (1) Arguments:
        - instruction: str, description of what the model should do
        - generation_config: transformers.GenerationConfig with the decoding parameters
        - max_len: int, maximum number of newly generated tokens
        - input: str, extra text the instruction refers to, default "" (no input)
        - verbose: bool, whether to print each reply, default True

    (2) Returns:
        - output: str, the cleaned reply of the last returned sequence
          ("" if the model returned no sequence)

    (3) Example:
        - Instruction "ABC", input "DEF", reply limited to 128 new tokens:
            evaluate(instruction="ABC", generation_config=generation_config, max_len=128, input="DEF")

    NOTE(review): the system prompt below differs from the one used in
    generate_training_data - confirm that this is intentional.
    """
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
You are a helpful assistant and good at conversation.あなたは役に立つアシスタント、日常会話をするのが得意なアシスタントです。
<</SYS>>

{instruction}
{input}
[/INST]"""
    # Tokenize the prompt and move it to wherever the model lives, instead of
    # assuming a CUDA device.
    inputs = tokenizer(prompt, return_tensors="pt")
    device = model.device
    input_ids = inputs["input_ids"].to(device)
    # Pass the mask explicitly: the pad token may be the same as EOS, so it
    # cannot be inferred reliably from the ids.
    attention_mask = inputs["attention_mask"].to(device)
    prompt_len = input_ids.shape[1]

    # Generate; per-step scores are not requested because nothing reads them.
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
        return_dict_in_generate=True,
        max_new_tokens=max_len,
    )

    # Decode only the newly generated tokens. Splitting the full text on
    # "[/INST]" picks the wrong segment when the user text contains that marker
    # and raises IndexError when the marker is missing.
    output = ""
    for s in generation_output.sequences:
        output = tokenizer.decode(s[prompt_len:], skip_special_tokens=True)
        # As before, cut the reply at a marker the model may emit itself, then
        # strip leftover special-token text and role prefixes.
        output = output.split("[/INST]")[0]
        output = output.replace("</s>", "").replace("<s>", "").replace("Assistant:", "").replace("Assistant", "").strip()
        if verbose:
            print(output)

    return output


In [None]:
"""メタのLlama3-8B-Instructバージョンを使用 """
# Hugging Face read token for the gated Llama-3 checkpoint. Prefer the HF_TOKEN
# environment variable so the secret is not stored in the notebook; the
# placeholder is only a fallback and must otherwise be replaced by your own token.
huggingface_token = os.environ.get("HF_TOKEN") or "hf_xxxxx"

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

cache_dir = "./cache"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the pretrained language model in quantized form.
# `token` replaces the deprecated `use_auth_token` argument.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=nf4_config,
    low_cpu_mem_usage=True,
    token=huggingface_token,
)

# Load the tokenizer. Only errors from transformers are logged from here on.
# No quantization config is passed: it applies to model weights, not tokenizers.
logging.getLogger('transformers').setLevel(logging.ERROR)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=True,
    cache_dir=cache_dir,
    token=huggingface_token,
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Decoding parameters used at inference time.
max_len = 128
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.1,
    num_beams=1,
    top_p=0.3,
    no_repeat_ngram_size=3,
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence token
    eos_token_id=tokenizer.eos_token_id,  # stop generation at end-of-sequence
    max_length=max_len,  # overall length limit, kept equal to max_len
)




In [None]:
# Everyday-conversation prompts (Japanese) used to probe the model before fine-tuning
test_japanese_list = ['こんにちは、元気ですか？', 'お名前は何ですか？', '今日はどんなことがありましたか？']

# Query the model once per prompt and keep an "input / output" transcript for each
demo_before_finetune = [
    f'モデル入力:\n以下は日本語の会話です。会話の続きを生成してください。{japanese}\n\nモデル出力:\n'
    + evaluate('以下は日本語の会話です。会話の続きを生成してください。', generation_config, max_len, japanese, verbose=False)
    for japanese in test_japanese_list
]

# Print every transcript, numbered from 1 and separated by a horizontal rule
for number, transcript in enumerate(demo_before_finetune, start=1):
    print(f"Example {number}:")
    print(transcript)
    print("-" * 80)
