In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from trl import SFTTrainer, setup_chat_format
import pandas as pd

In [2]:
df = pd.read_csv('dataset/translate_many_category_sentiment.csv')

In [3]:
#model_name = "NousResearch/Meta-Llama-3.1-8B-Instruct"
#new_model = "Llama3.1-ruFinGPT"

#model_name = "t-bank-ai/T-lite-instruct-0.1"
#new_model = "T-lite-ruFinGPT"

model_name = "IlyaGusev/saiga_llama3_8b"
new_model = "saiga_llama3_8b-ruFinGPT"

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation="eager"
)

NameError: name 'base_model' is not defined

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = setup_chat_format(model, tokenizer)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [None]:

"""
def format_chat_template(row):
    row = row[1]
    row_json = [
        {"role": "system", "content": row["instruction"]},
        {"role": "user", "content": row["input"]},
        {"role": "assistant", "content": row["output"]},         
    ]

    return tokenizer.apply_chat_template(row_json, tokenize=False)
"""

def format_chat_template(row):
    row = row[1]
    row_json = [
        {"role": "system", "content": row["instruction"]},
        {"role": "user", "content": row["input"]},
        {"role": "assistant", "content": row["output"]},         
    ]
    res_text = "<|begin_of_text|>"
    for item in row_json:
        res_text += f"<|start_header_id|>{item['role']}<|end_header_id|>{item['content']}<|eot_id|>"
        
    return res_text

df['text'] = [format_chat_template(row) for row in df.iterrows()]
df['text'][2]

In [None]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)
trainer.train()

In [15]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▁
eval/runtime,█▁▁▃
eval/samples_per_second,▁██▆
eval/steps_per_second,▁██▆
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▃▃▂▃▄▃▄▃▄▃▃▁█▂▂▂▃▄▄▂▃▃▁▃▅▄▃▂▂▇▃▂▃▄▂▂▃▃▃▅
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,▆▃▃▄▅▅█▃▆▆█▁▅▂▂▃▃▃▄▃▄█▁▂▆▇▁▃▃▆▃▂▂▄▃▅▄▃▂▅

0,1
eval/loss,0.67904
eval/runtime,275.664
eval/samples_per_second,5.873
eval/steps_per_second,5.873
total_flos,9.076772549758157e+16
train/epoch,0.99993
train/global_step,7282.0
train/grad_norm,1.61203
train/learning_rate,0.0
train/loss,0.5802


In [16]:
#trainer.model.save_pretrained(new_model)
trainer.save_model()



In [17]:
messages = [
    {
        "role": "system", 
        "content": "Какая тональность у этой новости? Пожалуйста, выберите только один вариант ответа из {сильно негативно/умеренно негативно/слабо негативно/нейтрально/слабо позитивно/умеренно позитивно/сильно позитивно}."
    },   
    {
        "role": "user",
        "content": "Чистая прибыль РусГидро по РСБУ за 1 полугодие выросла на 17%"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, 
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, 
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=128, 
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Both `max_new_tokens` (=1536) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



умеренно позитивно
