<a href="https://colab.research.google.com/github/PhilSad/claire-instruct/blob/main/train_mixtral_claire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [None]:
!pip install datasets huggingface_hub wandb huggingface_hub datasets transformers peft bitsandbytes accelerate

In [4]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import datasets

In [2]:
import random
def reduce_test(exemple):
  split = random.randint(20, 300)
  exemple['text'] = exemple['orig'][0:split]
  return exemple

In [3]:
dataset = datasets.load_dataset('OpenLLM-France/Claire-Dialogue-French-0.1')
dataset['test'] = dataset['test'].add_column("orig", dataset['test']['text'])
dataset['test'] = dataset['test'].map(reduce_test)
dataset['test'] = dataset['test'].remove_columns(['orig'])


Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

In [4]:
# dataset.push_to_hub('PhilSad/Claire-Dialogue-French-0.1')

In [2]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
import torch
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1",
                                             load_in_4bit=True,
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                            )


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

tokenizer.pad_token = "!" #Not EOS, will explain another time.\

CUTOFF_LEN = 1024  #Our dataset has shot text
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[ "w1", "w2", "w3"],  #just targetting the MoE layers.
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)


In [7]:
# dataset = load_dataset("PhilSad/Claire-Dialogue-French-0.1")
print("dataset", dataset)
train_data = dataset["train"]
test_data  = dataset["test"]

train_data = train_data.shuffle()
test_data = test_data.shuffle()


dataset DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 36731
    })
    test: Dataset({
        features: ['text'],
        num_rows: 284
    })
})


In [8]:
from transformers.integrations import WandbCallback
from transformers import GenerationConfig
from tqdm import tqdm
import wandb
import trl
class LLMSampleCB(WandbCallback):
  def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256, log_model="checkpoint"):
      "A CallBack to log samples a wandb.Table during training"
      super().__init__()
      self._log_model = log_model
      self.sample_dataset = test_dataset.select(range(num_samples))
      self.model, self.tokenizer = trainer.model, trainer.tokenizer
      self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                          max_new_tokens=max_new_tokens)
  def generate(self, prompt):
      tokenized_prompt = self.tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
      with torch.inference_mode():
          output = self.model.generate(tokenized_prompt, generation_config=self.gen_config)
      return self.tokenizer.decode(output[0][len(tokenized_prompt[0]):])

  def samples_table(self, examples):
      "Create a wandb.Table to store the generations"
      records_table = wandb.Table(columns=["prompt", "generation"] + list(self.gen_config.to_dict().keys()))
      for example in tqdm(examples, leave=False):
          prompt = example["text"]
          generation = self.generate(prompt=prompt)
          records_table.add_data(prompt, generation, *list(self.gen_config.to_dict().values()))
      return records_table

  def on_evaluate(self, args, state, control,  **kwargs):
      "Log the wandb.Table after calling trainer.evaluate"
      super().on_evaluate(args, state, control, **kwargs)
      records_table = self.samples_table(self.sample_dataset)
      self._wandb.log({"sample_predictions_step_" +str(state.global_step):records_table})


In [9]:
# trainer = Trainer(
#     model=model,
    # train_dataset=train_data,
    # eval_dataset=test_data,
    # args=TrainingArguments(
        # report_to="wandb",
        # bf16=True,
        # per_device_train_batch_size=1,
        # gradient_accumulation_steps=4,
        # num_train_epochs=1,
        # learning_rate=1e-5,
        # optim="adamw_torch",
        # save_strategy="epoch",
        # output_dir="mixtral-lora-claire",
        # push_to_hub=True,
        # save_total_limit=5,
        # logging_strategy="steps",
        # evaluation_strategy="steps",
        # eval_steps = 30,
        # logging_steps=1,
        # save_steps = 30
# 
    # ),
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
# )

trainer = trl.SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=config,
    dataset_text_field="text",
    max_seq_length=CUTOFF_LEN,
    tokenizer=tokenizer,
    args=TrainingArguments(
        report_to="wandb",
        bf16=True,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        # gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=1e-5,
        optim="adamw_torch",
        save_strategy="steps",
        output_dir="mixtral-lora-claire",
        push_to_hub=True,
        save_total_limit=5,
        logging_strategy="steps",
        evaluation_strategy="steps",
        eval_steps = 100,
        logging_steps=1,
        save_steps = 100

    )
)
model.config.use_cache = False



Map:   0%|          | 0/36731 [00:00<?, ? examples/s]

Map:   0%|          | 0/284 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
torch.cuda.empty_cache()

In [10]:
wandb.init(project='mixtral-claire', config = dict(peft_config=config.to_dict()))
wandb_callback = LLMSampleCB(trainer, test_data, num_samples=10, max_new_tokens=256)
trainer.add_callback(wandb_callback)
trainer.train()



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mphilippe-henri-saade[0m ([33mkollai[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss




Step,Training Loss,Validation Loss
5,2.2447,5.924096
10,2.1358,5.669204
15,2.1356,5.454982


  0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 10%|█         | 1/10 [00:29<04:25, 29.48s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 20%|██        | 2/10 [00:57<03:46, 28.34s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 30%|███       | 3/10 [01:29<03:31, 30.18s/it]The attention mask and the pad token id were not set. As a consequence, you may obs

KeyboardInterrupt: 