In [1]:
!pip install huggingface_hub bitsandbytes einops datasets transformers huggingface_hub accelerate huggingface_hub accelerate trl wandb tqdm peft

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

import os
import wandb

from transformers.integrations import WandbCallback
from transformers import GenerationConfig
import torch
import tqdm
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training


In [2]:
# load dataset
dataset = load_dataset('PhilSad/Alpaca_french_instruct_sft')
data_train = dataset['train']
data_test = dataset['test']

def map_text_train(row):
  row['text'] = row['prompt'] + row['completion']
  return row
def map_text_test(row):
  row['text'] = row['prompt']
  return row

data_train = data_train.map(map_text_train)
data_test = data_test.map(map_text_test)

In [4]:
model_name = "OpenLLM-France/Claire-7B-0.1"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={'': 0},
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
# model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


In [6]:
# model = prepare_model_for_kbit_training(model)

lora_alpha = 16 #16
lora_dropout = 0.05 #0.1
lora_rank = 8 #64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

peft_model = get_peft_model(model, peft_config)


In [7]:

class LLMSampleCB(WandbCallback):
  def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
      "A CallBack to log samples a wandb.Table during training"
      super().__init__()
      self._log_model = log_model
      self.sample_dataset = test_dataset.select(range(num_samples))
      self.model, self.tokenizer = trainer.model, trainer.tokenizer
      self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                          max_new_tokens=max_new_tokens)
  def generate(self, prompt):
      tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
      with torch.inference_mode():
          output = self.model.generate(tokenized_prompt, generation_config=self.gen_config)
      return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):])

  def samples_table(self, examples):
      "Create a wandb.Table to store the generations"
      records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
      for example in tqdm.tqdm(examples, leave=False):
          prompt = example["text"]
          generation = self.generate(prompt=prompt)
          records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
      return records_table

  def on_evaluate(self, args, state, control,  **kwargs):
      "Log the wandb.Table after calling trainer.evaluate"
      super().on_evaluate(args, state, control, **kwargs)
      records_table = self.samples_table(self.sample_dataset)
      self._wandb.log({"sample_predictions":records_table})


In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [8]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
os.environ["WANDB_PROJECT"] = "claire-instruct"

output_dir = "claire-7b-instruct-bf16-peft-rank8-1000"
per_device_train_batch_size = 4 #4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 1000 #100 #500
warmup_ratio = 0.03
lr_scheduler_type = "cosine" #"constant"

training_arguments = TrainingArguments(
    auto_find_batch_size=True,
    report_to="wandb",
    output_dir=output_dir,
    # per_device_train_batch_size=per_device_train_batch_size,
    # per_device_eval_batch_size=per_device_train_batch_size * 2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    logging_strategy="steps",
    evaluation_strategy="steps",
    eval_steps = 100,
    logging_steps=1,
    save_steps = 100

)

max_seq_length = 512

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=data_train,
    eval_dataset=data_test,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [10]:
wandb.init(project='claire-7b-instruct-bf16-peft-rank8-1000', config = dict(peft_config=peft_config))
peft_model.config.use_cache = False
wandb_callback = LLMSampleCB(trainer, data_test, num_samples=10, max_new_tokens=256)
trainer.add_callback(wandb_callback)

trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mphilippe-henri-saade[0m ([33mkollai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,0.8638,0.631
200,0.6103,0.697151


  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 10%|█         | 1/10 [00:07<01:07,  7.55s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 20%|██        | 2/10 [00:11<00:44,  5.52s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 30%|███       | 3/10 [00:19<00:44,  6.42s/it]The attention mask and the pad token id were not set. As a consequence, you may 

Step,Training Loss,Validation Loss
100,0.8014,0.609889
200,0.9506,0.606252
300,0.7442,0.605945
400,0.4358,0.71199
500,1.1085,0.596697
600,0.8978,0.589716
700,0.6616,0.593908
800,0.4302,0.612256
900,0.8165,0.584514
1000,0.8033,0.584441


  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 10%|█         | 1/10 [00:07<01:07,  7.53s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 20%|██        | 2/10 [00:15<01:00,  7.53s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 30%|███       | 3/10 [00:22<00:53,  7.58s/it]The attention mask and the pad token id were not set. As a consequence, you may 

TrainOutput(global_step=1000, training_loss=0.8272835172116756, metrics={'train_runtime': 1680.5009, 'train_samples_per_second': 2.38, 'train_steps_per_second': 0.595, 'total_flos': 2.6895115411617024e+16, 'train_loss': 0.8272835172116756, 'epoch': 0.08})

# test models

In [15]:
from transformers import pipeline

In [18]:
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
from transformers import AutoTokenizer, pipeline
import transformers

model = AutoModelForCausalLM.from_pretrained(
    'OpenLLM-France/Claire-7B-0.1',
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "./claire-7b-instruct/checkpoint-150/",
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained("OpenLLM-France/Claire-7B-0.1")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# model.config.pad_token_id = tokenizer.eos_token_id
# tokenizer.pad_token_id = 

gen_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.7,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )
pipe = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    batch_size=16,
    generation_config=gen_config,
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [23]:
prompt = """Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète de manière appropriée la demande.

### Instruction :
 Rédigez une définition d'un terme spécifique.

### Entrée :
 Traitement du langage naturel

### Réponse :"""

In [24]:
pipe(prompt)

[{'generated_text': "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète de manière appropriée la demande.\n\n### Instruction :\n Rédigez une définition d'un terme spécifique.\n\n### Entrée :\n Traitement du langage naturel\n\n### Réponse :\n Le traitement du langage naturel est une technique de reconnaissance du langage naturel (NLU) qui implique l'analyse et la compréhension des phrases et des mots dans les langues naturelles. Il est généralement utilisé dans les systèmes de reconnaissance vocale et dans les interfaces utilisateur conversationnelles. Il implique l'analyse des phrases et des mots pour comprendre la signification du langage humain et pour générer une réponse appropriée. Le traitement du langage naturel est également connu par d'autres noms, tels que la reconnaissance automatique du langage naturel et le traitement du langage naturel.\\n\\nLe traitement du langage natur

# merge and save

In [25]:
merged_model = model.merge_and_unload(progressbar=True)


Unloading and merging model: 100%|██████████| 486/486 [00:54<00:00,  8.97it/s]


In [26]:
help(model.save_pretrained)

Help on method save_pretrained in module peft.peft_model:

save_pretrained(save_directory: 'str', safe_serialization: 'bool' = True, selected_adapters: 'Optional[List[str]]' = None, save_embedding_layers: 'Union[str, bool]' = 'auto', is_main_process: 'bool' = True, **kwargs: 'Any') -> 'None' method of peft.peft_model.PeftModelForCausalLM instance
    This function saves the adapter model and the adapter configuration files to a directory, so that it can be
    reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`]
    method.
    
    Args:
        save_directory (`str`):
            Directory where the adapter model and configuration files will be saved (will be created if it does not
            exist).
        safe_serialization (`bool`, *optional*):
            Whether to save the adapter files in safetensors format, defaults to `True`.
        selected_adapters (`List[str]`,  *optional*):
            A list of adapters to be s

In [28]:
merged_model.save_pretrained(
    "./claire-instruct-merge",
    push_to_hub=True,
    repo_id="PhilSad/Claire-7b-0.1-instruct",
    private=False
)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]