In [1]:
!pip install huggingface_hub bitsandbytes einops datasets transformers huggingface_hub accelerate huggingface_hub accelerate trl wandb tqdm peft

Collecting huggingface_hub
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting trl
  Downloading trl-0.7.11-py3-none-any.whl.metadata (10 kB)
Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl.metadata (9.9 kB)
Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m31.1 

In [2]:
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

import os
import wandb

from transformers.integrations import WandbCallback
from transformers import GenerationConfig
import torch
import tqdm
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training


In [3]:
# load dataset
dataset = load_dataset('PhilSad/Alpaca_french_instruct_sft')
data_train = dataset['train']
data_test = dataset['test']

def map_text_train(row):
  row['text'] = row['prompt'] + row['completion']
  return row
def map_text_test(row):
  row['text'] = row['prompt']
  return row

data_train = data_train.map(map_text_train)
data_test = data_test.map(map_text_test)

Downloading readme:   0%|          | 0.00/485 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/772k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2601 [00:00<?, ? examples/s]

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/2601 [00:00<?, ? examples/s]

In [4]:
model_name = "OpenLLM-France/Claire-7B-0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.89G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [6]:
model = prepare_model_for_kbit_training(model)

lora_alpha = 32 #16
lora_dropout = 0.05 #0.1
lora_rank = 32 #64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

peft_model = get_peft_model(model, peft_config)


In [7]:

class LLMSampleCB(WandbCallback):
  def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
      "A CallBack to log samples a wandb.Table during training"
      super().__init__()
      self._log_model = log_model
      self.sample_dataset = test_dataset.select(range(num_samples))
      self.model, self.tokenizer = trainer.model, trainer.tokenizer
      self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                          max_new_tokens=max_new_tokens)
  def generate(self, prompt):
      tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
      with torch.inference_mode():
          output = self.model.generate(tokenized_prompt, generation_config=self.gen_config)
      return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):])

  def samples_table(self, examples):
      "Create a wandb.Table to store the generations"
      records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
      for example in tqdm.tqdm(examples, leave=False):
          prompt = example["text"]
          generation = self.generate(prompt=prompt)
          records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
      return records_table

  def on_evaluate(self, args, state, control,  **kwargs):
      "Log the wandb.Table after calling trainer.evaluate"
      super().on_evaluate(args, state, control, **kwargs)
      records_table = self.samples_table(self.sample_dataset)
      self._wandb.log({"sample_predictions":records_table})


In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [9]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
os.environ["WANDB_PROJECT"] = "claire-instruct"

output_dir = "claire-7b-instruct"
per_device_train_batch_size = 4 #4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 300 #100 #500
warmup_ratio = 0.03
lr_scheduler_type = "cosine" #"constant"

training_arguments = TrainingArguments(
    auto_find_batch_size=True,
    report_to="wandb",
    output_dir=output_dir,
    # per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    bf16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    logging_strategy="steps",
    evaluation_strategy="steps",
    eval_steps = 30,
    logging_steps=1,
    save_steps = 30

)

max_seq_length = 512

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=data_train,
    eval_dataset=data_test,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/2601 [00:00<?, ? examples/s]

In [13]:
wandb.init(project='claire-instruct', config = dict(peft_config=peft_config, bnb_config=bnb_config))
peft_model.config.use_cache = False
wandb_callback = LLMSampleCB(trainer, data_test, num_samples=10, max_new_tokens=256)
trainer.add_callback(wandb_callback)

trainer.train()




VBox(children=(Label(value='0.006 MB of 0.020 MB uploaded\r'), FloatProgress(value=0.2976646248085758, max=1.0…

0,1
train/epoch,▁▁▁▁▁▁▁▁▁▁▁▁
train/global_step,▁▁▂▂▄▄▅▅▇▇██
train/learning_rate,▁▁▂▂▄▄▅▅▇▇██
train/loss,██▇▇▅▅▆▆▅▅▁▁

0,1
train/epoch,0.0
train/global_step,6.0
train/learning_rate,0.0002
train/loss,1.5198


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113047134131193, max=1.0…



Step,Training Loss,Validation Loss
30,0.8391,0.642203
60,0.9908,0.636665
90,0.6421,0.612883
120,0.8939,0.601684
150,0.5679,0.635131
180,0.7511,0.589586


  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 10%|█         | 1/10 [00:15<02:23, 15.95s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 20%|██        | 2/10 [00:31<02:03, 15.49s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
 30%|███       | 3/10 [00:46<01:47, 15.41s/it]The attention mask and the pad token id were not set. As a consequence, you may 

OSError: [Errno 28] No space left on device: 'claire-7b-instruct/checkpoint-180/adapter_model.safetensors' -> '/root/.local/share/wandb/artifacts/staging/tmp2h4dr93t'

adapter_model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]



# test models

In [15]:
from transformers import pipeline

In [18]:
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
from transformers import AutoTokenizer, pipeline
import transformers

model = AutoModelForCausalLM.from_pretrained(
    'OpenLLM-France/Claire-7B-0.1',
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "./claire-7b-instruct/checkpoint-150/",
    torch_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained("OpenLLM-France/Claire-7B-0.1")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# model.config.pad_token_id = tokenizer.eos_token_id
# tokenizer.pad_token_id = 

gen_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.7,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )
pipe = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    batch_size=16,
    generation_config=gen_config,
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [23]:
prompt = """Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète de manière appropriée la demande.

### Instruction :
 Rédigez une définition d'un terme spécifique.

### Entrée :
 Traitement du langage naturel

### Réponse :"""

In [24]:
pipe(prompt)

[{'generated_text': "Ci-dessous se trouve une instruction qui décrit une tâche, associée à une entrée qui fournit un contexte supplémentaire. Écrivez une réponse qui complète de manière appropriée la demande.\n\n### Instruction :\n Rédigez une définition d'un terme spécifique.\n\n### Entrée :\n Traitement du langage naturel\n\n### Réponse :\n Le traitement du langage naturel est une technique de reconnaissance du langage naturel (NLU) qui implique l'analyse et la compréhension des phrases et des mots dans les langues naturelles. Il est généralement utilisé dans les systèmes de reconnaissance vocale et dans les interfaces utilisateur conversationnelles. Il implique l'analyse des phrases et des mots pour comprendre la signification du langage humain et pour générer une réponse appropriée. Le traitement du langage naturel est également connu par d'autres noms, tels que la reconnaissance automatique du langage naturel et le traitement du langage naturel.\\n\\nLe traitement du langage natur

# merge and save

In [25]:
merged_model = model.merge_and_unload(progressbar=True)


Unloading and merging model: 100%|██████████| 486/486 [00:54<00:00,  8.97it/s]


In [26]:
help(model.save_pretrained)

Help on method save_pretrained in module peft.peft_model:

save_pretrained(save_directory: 'str', safe_serialization: 'bool' = True, selected_adapters: 'Optional[List[str]]' = None, save_embedding_layers: 'Union[str, bool]' = 'auto', is_main_process: 'bool' = True, **kwargs: 'Any') -> 'None' method of peft.peft_model.PeftModelForCausalLM instance
    This function saves the adapter model and the adapter configuration files to a directory, so that it can be
    reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`]
    method.
    
    Args:
        save_directory (`str`):
            Directory where the adapter model and configuration files will be saved (will be created if it does not
            exist).
        safe_serialization (`bool`, *optional*):
            Whether to save the adapter files in safetensors format, defaults to `True`.
        selected_adapters (`List[str]`,  *optional*):
            A list of adapters to be s

In [28]:
merged_model.save_pretrained(
    "./claire-instruct-merge",
    push_to_hub=True,
    repo_id="PhilSad/Claire-7b-0.1-instruct",
    private=False
)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]