In [2]:
import os
os.environ["WANDB_PROJECT"]="instruct_tuninng_mistral_sentiment"

from enum import Enum
from functools import partial
import pandas as pd
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed
from datasets import load_dataset
from trl import SFTTrainer
from peft import get_peft_model, LoraConfig, TaskType

seed = 42
set_seed(seed)

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
dataset_name = "FinGPT/fingpt-sentiment-train"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# template = """{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"""
# tokenizer.chat_template = template



In [4]:
tokenizer.chat_template

"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
def filter_output(example):
    return example['output'] in ['positive', 'negative', 'neutral']

In [7]:
def add_messages_col(example):
    messages = [
        {'role': 'user', 'content': example['instruction'] + ' '+ example['input']},
        {'role': 'assistant', 'content':example['output']}
    ]
    example['messages'] = messages
    return example

In [8]:
def preprocess(samples):
    batch = []
    for conversation in samples["messages"]:
        batch.append(tokenizer.apply_chat_template(conversation, tokenize=False))
    return {"content": batch}

In [9]:
dataset = load_dataset(dataset_name)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 76772
    })
})

In [11]:
dataset = dataset.filter(filter_output)

In [12]:
dataset['train']

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 62552
})

In [13]:
dataset = dataset.map(add_messages_col)

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction', 'messages'],
        num_rows: 62552
    })
})

In [15]:
dataset['train']['messages']

[[{'content': 'What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. Teollisuuden Voima Oyj , the Finnish utility known as TVO , said it shortlisted Mitsubishi Heavy s EU-APWR model along with reactors from Areva , Toshiba Corp. , GE Hitachi Nuclear Energy and Korea Hydro & Nuclear Power Co. .',
   'role': 'user'},
  {'content': 'neutral', 'role': 'assistant'}],
 [{'content': 'What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. Sanofi poaches AstraZeneca scientist as new research head',
   'role': 'user'},
  {'content': 'neutral', 'role': 'assistant'}],
 [{'content': 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}. $brcm raises revenue forecast',
   'role': 'user'},
  {'content': 'positive', 'role': 'assistant'}],
 [{'content': 'What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. The Finnish company Stockmann has sign

In [16]:

dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)
dataset = dataset["train"].train_test_split(0.1)
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 56296
    })
    test: Dataset({
        features: ['content'],
        num_rows: 6256
    })
})
{'content': '<s>[INST] What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. Via the move , the company aims annual savings of some EUR 3 million USD 4.3 m , the main part of which are expected to be realized this year . [/INST]positive</s>'}


In [17]:
dataset['train'][9]

{'content': '<s>[INST] What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}. Fed Preparing to Purchase New Small Business Payroll Loans https://t.co/iz5xKoWFwC [/INST]neutral</s>'}

In [18]:
peft_config = LoraConfig(r=8,
                         lora_alpha=16,
                         lora_dropout=0.1,
                         target_modules=["gate_proj","q_proj","lm_head","o_proj","k_proj","down_proj","up_proj","v_proj"],
                         task_type=TaskType.CAUSAL_LM)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                         trust_remote_code = True)
len(tokenizer)

32000

In [20]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'


In [21]:
print(
    f"pad_token: {tokenizer.pad_token}\n, bos_token: {tokenizer.bos_token} \n, eos_token: {tokenizer.eos_token} \n, padding_side: {tokenizer.padding_side} \n, special_tokens: {tokenizer.additional_special_tokens}" )

pad_token: </s>
, bos_token: <s> 
, eos_token: </s> 
, padding_side: right 
, special_tokens: []


In [22]:
len(tokenizer)

32000

In [23]:

model = AutoModelForCausalLM.from_pretrained(model_name)
# model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, peft_config)
print(f'trainable parameters: {model.print_trainable_parameters()} ')

# cast non-trainable params in fp16
# for p in model.parameters():
#     if not p.requires_grad:
#         p.data = p.to(torch.float16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 21,260,288 || all params: 7,262,992,384 || trainable%: 0.2927207805812288
trainable parameters: None 


In [24]:
model.to(torch.bfloat16)
model.cuda()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_feat

In [25]:
print(f' {model.print_trainable_parameters()} ')

trainable params: 21,260,288 || all params: 7,262,992,384 || trainable%: 0.2927207805812288
 None 


In [114]:
model.eval()
messages = [
    {"role": "user", "content": "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. Via the move , the company aims annual savings of some EUR 3 million USD 4.3 m , the main part of which are expected to be realized this year . "},
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(text, return_tensors="pt")#, add_special_tokens=False)
inputs = {k: v.to("cuda") for k,v in inputs.items()}
outputs = model.generate(**inputs, 
                         max_new_tokens=128, 
                         do_sample=True, 
                         top_p=0.95, 
                         temperature=0.2, 
                         repetition_penalty=1.1, 
                         eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s><s> [INST] What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. Via the move , the company aims annual savings of some EUR 3 million USD 4.3 m , the main part of which are expected to be realized this year .  [/INST] The given text is neutral as it only provides information about the company's expected savings without expressing any emotion or opinion.</s>


In [26]:
output_dir = "mistral_sent"
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
logging_steps = 5
learning_rate = 5e-4
max_grad_norm = 1.0
max_steps = 250
num_train_epochs=5
warmup_ratio = 0.1
lr_scheduler_type = "cosine"
max_seq_length = 2048

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    save_strategy="no",
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    weight_decay=0.1,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    fp16=False,
    bf16=True,
    report_to=[ "wandb"],
    # hub_private_repo=True,
    # push_to_hub=True,
    num_train_epochs=num_train_epochs,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False}
)


In [27]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    packing=True,
    dataset_text_field="content",
    max_seq_length=max_seq_length,
)

In [28]:
# dataset['train']['content']

In [28]:
# model.train()
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mapollonlp9[0m ([33mapollo-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
0,0.6072,0.580244
2,0.309,0.368824
4,0.2066,0.337607


TrainOutput(global_step=1120, training_loss=0.46971641512853757, metrics={'train_runtime': 8926.3217, 'train_samples_per_second': 1.006, 'train_steps_per_second': 0.125, 'total_flos': 7.852278576001843e+17, 'train_loss': 0.46971641512853757, 'epoch': 4.988864142538976})

In [30]:
trainer.save_model()



In [29]:
!nvidia-smi

Tue May 21 17:16:25 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.78                 Driver Version: 551.78         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090      WDDM  |   00000000:01:00.0  On |                  Off |
| 32%   42C    P8             20W /  450W |   20160MiB /  24564MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch

peft_model_id = "mistral_sent"
device = "cuda"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
model.to(torch.bfloat16)
model.cuda()
model.eval()
user_input = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. The food is very delicious."
messages = [
    {"role": "user", "content": user_input},
]
text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = tokenizer(text, return_tensors="pt")#, add_special_tokens=False)
inputs = {k: v.to("cuda") for k,v in inputs.items()}
outputs = model.generate(**inputs, 
                         max_new_tokens=128, 
                         do_sample=True, 
                         top_p=0.95, 
                         temperature=0.2, 
                         repetition_penalty=1.1, 
                         eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))
result = tokenizer.decode(outputs[0])
result = result.split('[/INST]')[-1].replace('</s>','')
print('Sentiment is:',result)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


<s><s> [INST] What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}. The food is very delicious. [/INST]positive</s>
Sentiment is: positive
