In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
os.environ['TRANSFORMERS_CACHE'] = '/data/'

In [2]:
from datasets import load_dataset

dataset = load_dataset("CyberNative/Code_Vulnerability_Security_DPO")

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
path = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, load_in_4bit=False, device_map="auto")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['lang', 'vulnerability', 'system', 'question', 'chosen', 'rejected'],
        num_rows: 4656
    })
})

In [24]:
tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
tokenizer.apply_chat_template

<bound method PreTrainedTokenizerBase.apply_chat_template of LlamaTokenizerFast(name_or_path='teknium/OpenHermes-2.5-Mistral-7B', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|im_end|>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}>

In [31]:
dataset.column_names['train']

['lang', 'vulnerability', 'system', 'question', 'chosen', 'rejected']

In [28]:
len(dataset['train'])

4656

In [4]:
def chatml_format(example):
    # Format system
    if len(example['system']) > 0:
        message = {"role": "system", "content": example['system']}
        system = tokenizer.apply_chat_template([message], tokenize=False)
    else:
        system = ""

    # Format instruction
    message = {"role": "user", "content": example['question']}
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Format chosen answer
    chosen = example['chosen'] + "<|im_end|>\n"

    # Format rejected answer
    rejected = example['rejected'] + "<|im_end|>\n"

    return {
        "prompt": system + prompt,
        "chosen": chosen,
        "rejected": rejected,
    }


In [5]:
original_columns = dataset.column_names['train']

# Tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Format dataset
dataset = dataset.map(
    chatml_format,
    remove_columns=original_columns
)

Map:   0%|          | 0/4656 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'prompt'],
        num_rows: 4656
    })
})

In [7]:
dataset['train'][0]

{'chosen': '```c++\n#include <cstring>\n\nvoid copyString(char* dest, const char* src) {\n    while ((*dest++ = *src++) != \'\\0\');\n}\n\nint main() {\n    char source[10] = "Hello!";\n    char destination[10]; // Make sure the destination array is large enough to hold the source string\n    copyString(destination, source);\n    return 0;\n}\n```<|im_end|>\n',
 'rejected': '```c++\n#include <cstring>\n\nvoid copyString(char* dest, const char* src) {\n    while (*src != \'\\0\') {\n        *dest = *src;\n        dest++;妆;\n        src++;\n    }\n}\n\nint main() {\n    char source[10] = "Hello!";\n    char destination[5];\n    copyString(destination, source);\n    return 0;\n}\n```<|im_end|>\n',
 'prompt': "<s>[INST] Write a c++ code that defines a function named 'copyString' which takes two character pointers as arguments. The first pointer points to the source string and the second one points to the destination string. This function should copy the content of the source string into th

In [8]:
import os
import gc
import torch

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import DPOTrainer
import bitsandbytes as bnb


peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['k_proj', 'gate_proj', 'v_proj', 'up_proj', 'q_proj', 'o_proj', 'down_proj']
)

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.float16,
    load_in_4bit=True
)
model.config.use_cache = False

# Reference model
ref_model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.float16,
    load_in_4bit=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
model.resize_token_embeddings(len(tokenizer))


Embedding(32000, 4096)

In [11]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    max_steps=200,
    save_strategy="no",
    logging_steps=1,
    output_dir="new_model",
    optim="paged_adamw_32bit",
    warmup_steps=100,
    bf16=True,
    #report_to="wandb",
)

# Create DPO trainer
dpo_trainer = DPOTrainer(
    model,
    ref_model = None,
    args=training_args,
    train_dataset=dataset['train'],
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
    #force_use_ref_model=True
)

# Fine-tune model with DPO
dpo_trainer.train()



Map:   0%|          | 0/4656 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msrufus09[0m ([33mimagebio[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,0.6929
2,0.6946
3,0.6913
4,0.6849
5,0.6774
6,0.6689
7,0.654
8,0.6223
9,0.6056
10,0.5794


TrainOutput(global_step=200, training_loss=0.09510251273923132, metrics={'train_runtime': 8645.8033, 'train_samples_per_second': 1.48, 'train_steps_per_second': 0.023, 'total_flos': 0.0, 'train_loss': 0.09510251273923132, 'epoch': 2.7491408934707904})

In [13]:
dpo_trainer.save_model("mistralai/Mistral-7B-Instruct-v0.2-DPO")



In [52]:
message = [{"role": "user", "content": "What is the code to print a palindrome?"}]
input_ids = tokenizer.apply_chat_template(message, truncation=True, add_generation_prompt=True, return_tensors="pt").to(dtype=torch.bfloat16, device="cuda")


In [56]:
model1 = model1.to(dtype=torch.bfloat16, device="cuda")

In [57]:
outputs =   model1.generate(
            input_ids=input_ids.long(),
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95
    )



In [58]:
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['/ Afghan& What** theсо toсо aanningipadmin& /& Afghan& What** theсо toсо aanningipadmin& /& Afghan& What** theсо toso aanningipadmin& / The term "Afghan what?" is not a recognized phrase or term in the English language. It is unclear what you are trying to ask or refer to. Could you please provide more context or clarify your question? I\'d be happy to help if I can.']


In [49]:
model1 = dpo_trainer.model

model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0, inplace=False)

In [20]:
dpo_trainer.model.parameters()

<generator object Module.parameters at 0x7f2f044458c0>