In [None]:
# Environment setup: install the libraries used by the rest of the notebook.
# `-q` reduces pip output; `-U` upgrades to the newest available release.
# NOTE(review): only transformers is pinned; the other installs (including the
# two from GitHub main) can change between runs — consider pinning versions.
!pip install -q -U bitsandbytes
!pip install -q transformers==4.34.0
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q trl
!pip install -q wandb

## Импорты и гиперпараметры

In [None]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    LoraConfig,
    TaskType,
)
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

import torch
import os

from tqdm import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import transformers
from transformers import logging
import numpy as np
import json

In [None]:
# Training hyperparameters shared by every run in this notebook.
MAX_EPOCHS = 10  # upper bound on epochs; early stopping may end a run sooner
TRAIN_SEED = 102020  # used for both `seed` and `data_seed` of TrainingArguments

LEARNING_RATE = 2e-4
GRADIENT_ACCUMULATION_STEPS = 4
BATCH_SIZE = 1  # per-device batch size for training and evaluation

# Logging, evaluation and checkpointing all run every EVAL_STEPS optimizer steps.
EVAL_STEPS = 300
# EarlyStoppingCallback measures patience in *evaluation calls*, not in
# optimizer steps. The previous value (3 * EVAL_STEPS = 900 evaluations) could
# never be reached, so early stopping was effectively disabled. Stop after
# three consecutive evaluations without improvement instead.
PATIENCE = 3

## Функции для обучения

In [None]:
def get_model(model_name: str):
    """Load ``model_name`` as a 4-bit quantized causal LM ready for k-bit training.

    The weights are loaded with a bitsandbytes config (4-bit NF4, double
    quantization, bfloat16 compute dtype) and ``device_map="auto"``. Gradient
    checkpointing is enabled before the model is passed through
    ``prepare_model_for_kbit_training``.

    Args:
        model_name: Model identifier or path accepted by ``from_pretrained``.

    Returns:
        The object returned by ``prepare_model_for_kbit_training``.
    """
    quantized_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        ),
    )
    # Checkpointing is switched on first, then the PEFT preparation step runs.
    quantized_lm.gradient_checkpointing_enable()
    return prepare_model_for_kbit_training(quantized_lm)


def get_tokenizer(model_name: str):
    """Return the tokenizer that ``AutoTokenizer`` loads for ``model_name``."""
    return AutoTokenizer.from_pretrained(model_name)


def get_lora_config(**lora_kwargs):
    """Build a ``LoraConfig`` from the notebook defaults plus caller overrides.

    Args:
        **lora_kwargs: Keyword arguments forwarded to ``LoraConfig``; any key
            given here replaces the default of the same name.

    Returns:
        The resulting ``LoraConfig`` instance.
    """
    # NOTE(review): "out_proj", "fc_in", "fc_out" and "wte" look like
    # GPT-J-style module names — confirm they exist in the model being tuned,
    # otherwise only the q/k/v projections would receive adapters.
    adapted_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "out_proj",
        "fc_in",
        "fc_out",
        "wte",
    ]
    settings = {
        "task_type": "CAUSAL_LM",
        "lora_dropout": 0.1,
        "bias": "none",
        "target_modules": adapted_modules,
        **lora_kwargs,  # caller-supplied values win over the defaults above
    }
    return LoraConfig(**settings)


def prepare_dataset(messages, tokenizer, input_chat_template, output_chat_template):
    """Tokenize dialogues into a ``Dataset`` with prompt-masked labels.

    For every dialogue, all turns except the last are rendered with
    ``input_chat_template`` (the prompt) and the last turn is rendered with
    ``output_chat_template`` (the target). ``input_ids`` is prompt + target;
    ``labels`` has -100 at every prompt position (the value PyTorch's
    cross-entropy ignores by default) followed by the target token ids.

    Args:
        messages: Iterable of dialogues; each dialogue is a non-empty list of
            chat messages accepted by ``tokenizer.apply_chat_template``.
        tokenizer: Tokenizer providing ``apply_chat_template``.
        input_chat_template: Jinja template used for the prompt turns.
        output_chat_template: Jinja template used for the final turn.

    Returns:
        ``Dataset`` with the columns ``input_ids`` and ``labels``.
    """
    previous_verbosity = logging.get_verbosity()
    # Silence transformers logging while templating the dialogues.
    logging.set_verbosity(transformers.logging.FATAL)
    try:
        input_ids = []
        labels = []

        for msg in messages:
            # The earlier extra `apply_chat_template(msg)` call was dropped:
            # its result was never used and it tokenized every sample twice.
            prompt_ids = tokenizer.apply_chat_template(
                msg[:-1], chat_template=input_chat_template
            )
            target_ids = tokenizer.apply_chat_template(
                [msg[-1]], chat_template=output_chat_template
            )

            input_ids.append(prompt_ids + target_ids)
            labels.append([-100] * len(prompt_ids) + target_ids)

        return Dataset.from_dict({"input_ids": input_ids, "labels": labels})
    finally:
        # Restore the caller's verbosity even if templating raises.
        logging.set_verbosity(previous_verbosity)

In [None]:
from datetime import datetime


def train(model, tokenizer, train_ds, val_ds, output_dir, save_path, run_name):
    """Fine-tune ``model`` with ``transformers.Trainer`` and save the result.

    Args:
        model: Model to train (the notebook passes a PEFT-wrapped model).
        tokenizer: Tokenizer used by the data collator to pad batches.
        train_ds: Training dataset; expected to carry ``input_ids`` and
            ``labels`` columns as built by ``prepare_dataset``.
        val_ds: Evaluation dataset with the same columns.
        output_dir: Parent directory for Trainer checkpoints; checkpoints go
            to ``output_dir/run_name``.
        save_path: Parent directory for the final export; the model is saved
            to ``save_path/run_name``.
        run_name: Name of the run; also used (with a timestamp suffix) as the
            reporting run name.

    Returns:
        The ``model`` object that was passed in, after training.
    """
    # Whole optimizer steps in one pass over the training data. Integer
    # division keeps `warmup_steps` an int, as TrainingArguments declares it.
    steps_per_epoch = len(train_ds) // (GRADIENT_ACCUMULATION_STEPS * BATCH_SIZE)

    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        callbacks=[
            # Patience is counted in evaluation calls, not optimizer steps.
            transformers.EarlyStoppingCallback(early_stopping_patience=PATIENCE)
        ],
        args=transformers.TrainingArguments(
            output_dir=os.path.join(output_dir, run_name),
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=steps_per_epoch,
            num_train_epochs=MAX_EPOCHS,
            learning_rate=LEARNING_RATE,
            # NOTE(review): fp16 is used here while get_model sets a bfloat16
            # compute dtype for the quantized layers — confirm this is intended.
            fp16=True,
            optim="paged_adamw_8bit",
            evaluation_strategy="steps",
            logging_strategy="steps",
            save_strategy="steps",
            report_to="wandb",
            run_name=f"{run_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}",
            logging_steps=EVAL_STEPS,
            eval_steps=EVAL_STEPS,
            save_steps=EVAL_STEPS,
            seed=TRAIN_SEED,
            data_seed=TRAIN_SEED,
            metric_for_best_model="eval_loss",
            load_best_model_at_end=True,
        ),
        # DataCollatorForLanguageModeling(mlm=False) rebuilds `labels` from
        # `input_ids`, which throws away the -100 prompt mask stored in the
        # dataset. DataCollatorForSeq2Seq keeps the provided labels and pads
        # them with -100, so the loss is computed on the response tokens only.
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, padding=True, label_pad_token_id=-100
        ),
    )
    # Disable the generation KV cache for training.
    model.config.use_cache = False
    trainer.train()
    model.save_pretrained(os.path.join(save_path, run_name))
    return model

## Обучение моделей

In [None]:
from google.colab import userdata

In [None]:
import wandb

# Fetch the W&B API key from Colab's `userdata` store instead of hardcoding it,
# then authenticate this session.
API_KEY = userdata.get("wandb_api_key")
wandb.login(key=API_KEY)

[34m[1mwandb[0m: Currently logged in as: [33malpestova1818[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
from google.colab import drive

# Mount Google Drive; the data and output paths below live under /content/drive.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Base folder on the mounted Drive; the trailing slash matters because paths
# below are built by plain string concatenation.
ROOT_FOLDER = "/content/drive/MyDrive/llm_data/"

In [None]:
# Base model to fine-tune and the JSON file holding the training dialogues.
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DATA_PATH = ROOT_FOLDER + "glaive_function_sampled.json"
# Trainer checkpoints (OUTPUT_DIR) and final exports (SAVE_PATH) share a folder.
SAVE_PATH = ROOT_FOLDER + "output"
OUTPUT_DIR = ROOT_FOLDER + "output"

# Fraction of the source dialogues held out for validation.
VALID_SIZE = 0.2
# NOTE(review): SEED is not referenced anywhere else in the visible notebook.
SEED = 42

input_chat_template = "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n   {%- else %}\n  {%- if message['role'] == 'function' %}\n{{'### Function response:\\n' + message['content'] + '\\n'}}\n     {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n  {%- endif %}\n      {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n"
output_chat_template = "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n  {{message['content'] + '\\n<|EOT|>\\n'}} \n{%- endfor %}\n \n"

tokenizer = get_tokenizer(MODEL_NAME)

# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# runtime's locale default.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    messages = json.load(f)

In [None]:
def prepare_msg_list_for_train(messages):
    """Expand dialogues into one training example per assistant turn.

    Each example is the dialogue prefix that ends with (and includes) an
    assistant message, so a dialogue with k assistant turns yields k examples.

    Args:
        messages: Iterable of dialogues, each a list of dicts with a "role" key.

    Returns:
        List of dialogue prefixes, in dialogue order and then turn order.
    """
    return [
        dialogue[: position + 1]
        for dialogue in messages
        for position, turn in enumerate(dialogue)
        if turn["role"] == "assistant"
    ]

In [None]:
def contains_fc(msg):
    """Return True if any message in the dialogue ``msg`` has role "function"."""
    return any(turn["role"] == "function" for turn in msg)


# Share of dialogues containing a function response, for the first 800
# dialogues and for the remainder. The earlier raw-count tuple was removed: a
# cell only displays its last expression, so that value was computed and
# discarded. Denominators come from the actual slice sizes (guarded against an
# empty slice) instead of the hard-coded 800 / 200.
head_dialogues, tail_dialogues = messages[:800], messages[800:]
(
    sum(contains_fc(msg) for msg in head_dialogues) / max(len(head_dialogues), 1),
    sum(contains_fc(msg) for msg in tail_dialogues) / max(len(tail_dialogues), 1),
)

(0.70625, 0.675)

In [None]:
# Number of source dialogues held out for validation (fraction truncated to int).
val_num = int(len(messages) * VALID_SIZE)

In [None]:
# Split on an explicit index. Negative slicing is wrong when val_num is 0:
# messages[:-0] is empty and messages[-0:] is the whole list, which would swap
# the train and validation sets.
split_index = len(messages) - val_num
train_messages = prepare_msg_list_for_train(messages[:split_index])
val_messages = prepare_msg_list_for_train(messages[split_index:])

In [None]:
len(messages), len(train_messages), len(val_messages)

(1000, 2623, 674)

In [None]:
# Tokenize both splits with the same tokenizer and chat templates
# (training split first, then validation).
train_dataset, val_dataset = (
    prepare_dataset(split, tokenizer, input_chat_template, output_chat_template)
    for split in (train_messages, val_messages)
)

In [None]:
import gc

logging.set_verbosity(transformers.logging.INFO)

RANK_LIST = [8, 16]
LORA_ALPHA_LIST = [16, 32]

# Grid search over (lora_alpha, rank): every combination gets a freshly loaded
# base model wrapped with its own LoRA adapter.
for lora_alpha in LORA_ALPHA_LIST:
    for rank in RANK_LIST:
        print("start", lora_alpha, rank)
        lora_config = get_lora_config(lora_alpha=lora_alpha, r=rank)
        model = get_model(MODEL_NAME)
        model = get_peft_model(model, lora_config)

        run_name = f"deepsick_r={rank}_alpha={lora_alpha}"

        try:
            train(
                model=model,
                tokenizer=tokenizer,
                train_ds=train_dataset,
                val_ds=val_dataset,
                output_dir=OUTPUT_DIR,
                save_path=SAVE_PATH,
                run_name=run_name,
            )
        finally:
            # The Trainer's W&B integration only starts a new run when none is
            # active; close the current one so each configuration is logged
            # as its own run.
            wandb.finish()
            # Release this configuration's model before the next one loads so
            # two models never sit in GPU memory at the same time.
            del model
            gc.collect()
            torch.cuda.empty_cache()

## Saving model

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

In [None]:
# Rebuild the fp16 base model, attach the chosen LoRA checkpoint, and fold the
# adapter into the base weights to get a standalone model.
OUTPUT_DIR = "/content/drive/MyDrive/llm_data/output"
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"

# NOTE(review): the run (r=8, alpha=16) and step (1200) are hard-coded —
# presumably the best checkpoint from the grid search; confirm before reuse.
peft_model_path = OUTPUT_DIR + "/deepsick_r=8_alpha=16/checkpoint-1200"

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)

# Load the adapter on top of the base model and merge it in one expression.
merged_model = PeftModel.from_pretrained(model, peft_model_path).merge_and_unload()

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
repo_id = "deepseek-coder-1.3b-function-calling-v1"

In [None]:
# Upload the merged model to the Hub repository named by `repo_id`.
# NOTE(review): `variant="fp16"` presumably affects the uploaded weight file
# names, so loaders may need to pass the same variant — confirm.
merged_model.push_to_hub(repo_id, variant="fp16")

### Saving tokenizer and vocab.json

In [3]:
from transformers import AutoTokenizer

# Publish the base model's tokenizer, loaded unchanged, to the fine-tuned
# model's Hub repository.
repo_id = "pestova/deepseek-coder-1.3b-function-calling-v1"
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")
tokenizer.push_to_hub(repo_id)

CommitInfo(commit_url='https://huggingface.co/pestova/deepseek-coder-1.3b-function-calling-v1/commit/374dd9c268a6d776ad50843a23ce0ad858edb2c8', commit_message='Upload tokenizer', commit_description='', oid='374dd9c268a6d776ad50843a23ce0ad858edb2c8', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
import json

# Write vocab.json once. The earlier dump of `tokenizer.vocab` was removed
# because the file was immediately overwritten by the extended vocabulary.
vocab = tokenizer.get_vocab()
# Add placeholder tokens for ids 32022..32255.
# NOTE(review): presumably this pads the vocabulary up to the model's
# embedding size (32256) — confirm both bounds against the model config.
vocab.update({f"<[pad{i}]>": i for i in range(32022, 32256)})

with open("vocab.json", "w") as f:
    json.dump(vocab, f)


from huggingface_hub import HfApi

# Upload the locally written vocab.json to the root of the model repository.
api = HfApi()
api.upload_file(
    path_or_fileobj="vocab.json",
    path_in_repo="vocab.json",
    repo_id="pestova/deepseek-coder-1.3b-function-calling-v1",
)

'https://huggingface.co/pestova/deepseek-coder-1.3b-function-calling-v1/blob/main/vocab.json'