# Подготовка обучающего датасета

In [4]:
import pandas as pd

data = pd.read_csv('Data/generated_data.csv')
pd.concat([data.iloc[:800], data.iloc[1000:1800]]).to_csv('Data/train.csv', index=False)
pd.concat([data.iloc[800:1000], data.iloc[1800:2000]]).to_csv('Data/test.csv', index=False)

In [5]:
tickets_template = """
    - departure_city
        Описание: город отправления на самолете.
        Тип данных: str
    - arrival_city
        Описание: город прибытия на самолете.
        Тип данных: str
    - departure_date
        Описание: дата отправления из города отправления.
        Тип данных: date в формате day-month-year 
    - return_date
        Описание: дата возвращения из города прибытия.
        Тип данных:  date в формате day-month-year
"""

# Шаблон атрибутов для бронирования отелей
booking_template = """
    - city
        Описание: город расположения отеля.
        Тип данных: str
    - hotel
        Описание: название отеля.
        Тип данных: str
    - date
        Описание: дата заселения в отель.
        Тип данных: date в формате day-month-year
    - guests
        Описание: количество гостей для проживания.
        Тип данных: int
    - days
        Описание: количество дней пребывания в отеле.
        Тип данных: int
"""

# маппинг классов с шаблонами
label_mapping = {
    'отель': booking_template,
    'самолет': tickets_template
}

In [6]:
def create_prompt(sample, label_mapping):
    prompt = f"""Ты должен найти в тексте определенные атрибуты и сохранить их в формате JSON. Не используй комментарии.

Используй следующий шаблон:

Текст: сообщение пользователя в чате.
JSON: объект по всем стандартам JSON.{label_mapping[sample.label]}
Начинай!

Текст: {sample.text}
Ответ: {sample.json}"""

    return prompt
    
train_df = pd.concat([data.iloc[:800], data.iloc[1000:1800]])
train_df['Prompt'] = train_df.apply(lambda x: create_prompt(x, label_mapping), axis=1)
train_df.to_csv('Data/train.csv', index=False)

# Train

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

model_name = 'mistralai/Mistral-7B-v0.1'
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='cuda:0')
tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          padding_side="right",
                                          add_eos_token=True,
                                          add_bos_token=True)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from datasets import load_dataset
train_dataset = load_dataset('csv', data_files='Data/train.csv', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
# Выбор слоев для Lora
for n, m in model.named_modules():
    print(n)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.

In [11]:
from peft import LoraConfig, get_peft_model

model.gradient_checkpointing_enable()
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "lm_head",
        "model.layers.31.self_attn.q_proj",
        "model.layers.31.self_attn.k_proj",
        "model.layers.31.self_attn.v_proj",
        "model.layers.31.self_attn.o_proj",
        "model.layers.31.mlp.gate_proj",
        "model.layers.31.mlp.up_proj",
        "model.layers.31.mlp.down_proj",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

In [12]:
print_trainable_parameters(model)

trainable params: 1888256 || all params: 7243620352 || trainable%: 0.026067848786120368


In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-30): 31 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
              (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
              (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
            )
            (mlp): MistralMLP(
              (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
              (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
              (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
              (act_fn): SiLU()
            )
        

In [14]:
# Выбор токенов для DataCollatorForCompletionOnlyLM
print(tokenizer(['\nОтвет:']))
print(tokenizer.decode([13, 28874, 28786, 8496, 28747]))

{'input_ids': [[1, 28705, 13, 28874, 28786, 8496, 28747, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1]]}

Ответ:


In [15]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import transformers

project = "mistral_v0.1"
base_model_name = "NER"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

training_arguments = transformers.TrainingArguments(
    output_dir=output_dir,
    warmup_steps=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_steps=200,
    learning_rate=2.5e-5, # Want a small lr for finetuning
    optim="adamw_bnb_8bit",
    logging_steps=5,              # When to start reporting loss
    logging_dir="./logs",        # Directory for storing logs
    save_strategy="steps",       
    save_steps=200 # Save the model checkpoint every logging step
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="Prompt",
    args=training_arguments,
    peft_config=config,
    tokenizer=tokenizer,
    data_collator=DataCollatorForCompletionOnlyLM([13, 28874, 28786, 8496, 28747], tokenizer=tokenizer)
)

model.config.use_cache = False
trainer.train()



Map:   0%|          | 0/1600 [00:00<?, ? examples/s]



Step,Training Loss
5,0.2636
10,0.1534
15,0.1998
20,0.1495
25,0.1218
30,0.2108
35,0.1012
40,0.1319
45,0.203
50,0.158


Checkpoint destination directory ./NER-mistral_v0.1/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=200, training_loss=0.10826172441244125, metrics={'train_runtime': 788.0474, 'train_samples_per_second': 1.015, 'train_steps_per_second': 0.254, 'total_flos': 1.3428548188962816e+16, 'train_loss': 0.10826172441244125, 'epoch': 0.5})