In [1]:
# # You only need to run this once per machine
# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q -U datasets scipy ipywidgets
# !pip install wandb

In [2]:
!pip show accelerate

Name: accelerate
Version: 0.24.0.dev0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /home/ubuntu/miniconda3/envs/finetune_venv/lib/python3.11/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, torch
Required-by: peft


In [3]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# FSDP state-dict options: the model and the optimizer state dicts use the
# same settings (offload_to_cpu=True, rank0_only=False).
model_state_dict_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_dict_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_dict_config,
    optim_state_dict_config=optim_state_dict_config,
)

# The accelerator is used later in the notebook to wrap the PEFT model.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

### 1. Load Dataset

In [4]:
from datasets import load_dataset
# Load the French ('fr') configuration of RED-FM, one call per split, in the
# order train -> validation -> test.
train_dataset, eval_dataset, test_dataset = (
    load_dataset("Babelscape/REDFM", language='fr', split=split_name)
    for split_name in ("train", "validation", "test")
)

### 2. Load Base Model

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint to fine-tune; the commented lines are alternative bases.
# base_model_id = "mistralai/Mistral-7B-v0.1"
# base_model_id = "bofenghuang/vigostral-7b-chat"
base_model_id = "HuggingFaceH4/zephyr-7b-alpha"

# 4-bit NF4 quantization with double quantization; bfloat16 compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Display the module tree; the recorded output shows the projection layers as Linear4bit.
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRM

In [None]:
# Predicate labels, indexed by the integer stored in relation['predicate'].
# Built once at definition time instead of on every call.
RELATION_NAMES = (
    'pays', 'lieu de naissance', 'conjoint', 'pays de nationalité', 'instance de',
    'capital', 'enfant', 'partage la frontière avec', 'auteur', 'directeur', 'occupation',
    'fondée par', 'ligue', 'appartenant à', 'genre', "nommé d'après", 'suit',
    'localisation du siège social', 'membre du casting', 'constructeur',
    "situé dans ou à côté d'une étendue d'eau", 'localisation', 'partie de',
    "embouchure du cours d'eau", 'membre de', 'sport', 'caractères',
    'participant', 'travail remarquable', 'remplacer', 'frère et sœur', 'création',
)


def get_relation(example):
    """Render the relations of one example as a single target string.

    Args:
        example: mapping with a 'relations' entry; each relation provides
            relation['subject']['surfaceform'], relation['object']['surfaceform']
            and an integer relation['predicate'] that indexes RELATION_NAMES.

    Returns:
        The triplets formatted as "[’subject’, ’predicate’, ’object’]" and
        joined with ' | '. An empty string when there are no relations.

    Raises:
        IndexError: if a predicate index is outside RELATION_NAMES.
    """
    # NOTE(review): the triplets use typographic quotes (’), whereas the
    # instruction prompt describes the format with straight quotes ('). This is
    # kept as-is because changing it would alter the training targets.
    triplets = []
    for relation in example['relations']:
        # `obj` instead of `object`, so the builtin is not shadowed.
        obj = relation['object']['surfaceform']
        subject = relation['subject']['surfaceform']
        predicate = RELATION_NAMES[relation['predicate']]
        triplets.append(f"[’{subject}’, ’{predicate}’, ’{obj}’]")
    return ' | '.join(triplets)

### 3. Tokenization

Set up the tokenizer. Add padding on the left as it [makes training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa).


In [None]:
# Tokenizer for the base checkpoint: 512-token limit, left padding, and an EOS
# token appended to every encoded sequence.
tokenizer_options = {
    "model_max_length": 512,
    "padding_side": "left",
    "add_eos_token": True,
}
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **tokenizer_options)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def tokenize(prompt):
    """Encode ``prompt`` with the notebook-level ``tokenizer`` and add labels.

    The encoding is truncated and padded to ``max_length=512``. A ``labels``
    entry is added as a copy of ``input_ids``.

    Returns:
        The tokenizer output with the extra ``labels`` key.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    encoded = tokenizer(prompt, **encoding_options)
    # Copy so that labels and input_ids are independent lists.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
def generate_and_tokenize_prompt(example):
    """Build the training prompt for one example and tokenize it.

    The prompt is the French instruction, the example text under
    ``### TEXTE:`` and the gold triplets (from ``get_relation``) under
    ``### Relations:``. The result of ``tokenize`` is returned unchanged.
    """
    source_text = example["text"]
    gold_relations = get_relation(example)
    prompt_text = f"""Vous êtes un expert en data science et en traitement du langage naturel(NLP). Votre tâche consiste à extraire les triplets du TEXTE fourni ci-dessous. Un triplet de connaissances est constitué de 2 entités (sujet et objet) liées par un prédicat : ['sujet', 'prédicat', 'objet']. Les triples multiples doivent être séparés par ' | '.\n

### TEXTE:
{source_text}

### Relations:
{gold_relations}
"""
    return tokenize(prompt_text)

In [None]:
# Apply the prompt builder/tokenizer to the train split, then the validation split.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(generate_and_tokenize_prompt)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

### 4. Set Up LoRA
Before fine-tuning, the quantized model needs some preprocessing to prepare it for training. For that, use the `prepare_model_for_kbit_training` function from PEFT.

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the quantized model, then let PEFT prepare
# it for k-bit training. `model` is rebound here, so this cell is not idempotent:
# reload the base model before re-running it.
# NOTE(review): gradient checkpointing presumably trades extra compute for lower
# activation memory — confirm against the transformers documentation.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, summing ``numel()`` for all
    parameters and separately for those with ``requires_grad`` set, then
    prints both counts and the trainable percentage.

    A model without any parameters reports ``trainable%: 0.0`` instead of
    raising ``ZeroDivisionError``.

    NOTE(review): counts come from ``numel()``; for the 4-bit layers in this
    notebook the recorded total is about 3.75B for a 7B checkpoint, so the
    count appears to reflect packed storage — confirm before quoting it.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Baseline before LoRA adapters are added (the recorded output shows 0 trainable parameters).
print_trainable_parameters(model)

trainable params: 0 || all params: 3752071168 || trainable%: 0.0


In [None]:
# Print the module tree after prepare_model_for_kbit_training.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRM

In [None]:
from peft import LoraConfig, get_peft_model

# Modules that receive LoRA adapters: the attention projections, the MLP
# projections and the output head.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
    "lm_head",
]

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,  # Conventional
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the base model with the adapters and report how many parameters train.
# `model` is rebound, so re-running this cell would wrap an already-wrapped model.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 85041152 || all params: 3837112320 || trainable%: 2.2162799758751914


In [None]:
# Print the module tree again; the recorded output shows lora_A / lora_B sub-modules on the targeted layers.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=2)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropou

In [18]:
import os, wandb
# Point the W&B client at the public API endpoint and start a run.
# NOTE(review): the entity is hard-coded to a personal account, and the project
# name here ("digital_safety") differs from the `project` variable used to name
# the training run ("KG-finetune") — confirm both are intended.
# os.environ["WANDB_PROJECT"] = "digital_safety"
os.environ["WANDB_BASE_URL"]="https://api.wandb.ai"
wandb.init(project="digital_safety", entity="xianli")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mxianli[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [19]:
# When more than one CUDA device is visible, mark the model as parallelizable /
# model-parallel.
# NOTE(review): presumably this stops the Trainer from wrapping the model in
# DataParallel — confirm against the transformers Trainer source.
visible_gpu_count = torch.cuda.device_count()
if visible_gpu_count > 1:
    model.is_parallelizable = True
    model.model_parallel = True

In [20]:
import transformers
from datetime import datetime

# Run naming: checkpoints are written under ./models/<base_model_name>-<project>.
project = "KG-finetune"
base_model_name = "zephyr"
run_name = base_model_name + "-" + project
output_dir = "./models/" + run_name

# Repeats the pad-token assignment made right after the tokenizer was created.
tokenizer.pad_token = tokenizer.eos_token

# Effective batch per device per optimizer step: 20 samples x 4 accumulation steps = 80.
# NOTE(review): DataCollatorForLanguageModeling(mlm=False) builds the labels itself;
# since pad_token == eos_token here, EOS positions may be masked out of the loss
# together with the padding — TODO confirm against the transformers documentation.
# NOTE(review): EarlyStoppingCallback(3) passes 3 positionally, presumably as the
# patience (number of evaluations without improvement) — confirm.
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=20,
        gradient_accumulation_steps=4,
        max_steps=500,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save checkpoints on a step schedule (see save_steps)
        save_steps=50,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate on a step schedule (see eval_steps)
        eval_steps=50,               # Evaluate every 50 steps
        do_eval=True,                # Enable evaluation on eval_dataset
        report_to="wandb",           # Comment this out if you don't want to use Weights & Biases
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",          # Name of the W&B run (optional)
        load_best_model_at_end=True
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[transformers.EarlyStoppingCallback(3)]
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,0.9404,0.65184
100,0.6155,0.608363
150,0.558,0.595109
200,0.5128,0.608019




KeyboardInterrupt: 

### 6. Drum Roll... Try the Trained Model!

By default, the PEFT library only saves the QLoRA adapters, so we first need to load the base Mistral model from the Hugging Face Hub:

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base checkpoint the adapter is attached to. It must be the checkpoint the
# adapter was trained on.
# NOTE(review): the training cell earlier in this notebook uses
# "HuggingFaceH4/zephyr-7b-alpha" — confirm which base the loaded adapter expects.
# base_model_id = "bofenghuang/vigostral-7b-chat"
base_model_id = "mistralai/Mistral-7B-v0.1"

# Same 4-bit NF4 quantization settings as the training section.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
# NOTE(review): trust_remote_code=True allows code shipped in the Hub repository
# to run on load; confirm it is actually required for this checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    # use_auth_token=True
)

# Left padding matches the tokenizer used for training and is what generation
# with a decoder-only model expects. No EOS token is appended here, so the
# prompt stays open for the model to continue.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from peft import PeftModel

# Attach the adapter weights saved in a training checkpoint to the quantized base model.
# NOTE(review): the relative path depends on the notebook's working directory, and the
# directory name ("mistral-KG-finetune") differs from the output_dir built in the training
# cell ("./models/zephyr-KG-finetune") — confirm this is the intended checkpoint.
ft_model = PeftModel.from_pretrained(base_model, "../fine-tuning/models/mistral-KG-finetune/checkpoint-150")

In [14]:
# Save the PEFT model. No merge step is performed before saving, so despite the
# "merged_model" directory name this presumably writes only the adapter weights
# (the notebook text notes that PEFT saves only the adapters by default) — confirm.
# NOTE(review): absolute, machine-specific path.
ft_model.save_pretrained("/home/ubuntu/fine-tuning/models/merged_model")

In [12]:
# Relies on `test_dataset` and `get_relation` from the earlier sections of the
# notebook; run those cells first in a fresh kernel.
example = test_dataset[1]
print("Texte: " + example['text'])
print("Relations: " + get_relation(example) + "\n")

# The section headers must match the template used for fine-tuning exactly
# ("### TEXTE:" / "### Relations:"). A different spelling or spacing gives the
# model a prompt it was never trained on.
eval_prompt = f"""Vous êtes un expert en data science et en traitement du langage naturel(NLP). Votre tâche consiste à extraire les triplets du TEXTE fourni ci-dessous. Un triplet de connaissances est constitué de 2 entités (sujet et objet) liées par un prédicat : ['sujet', 'prédicat', 'objet']. Les triples multiples doivent être séparés par ' | '.\n

### TEXTE:
{example['text']}

### Relations:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

ft_model.eval()
with torch.no_grad():
    # Use the tokenizer's pad token id instead of a hard-coded 2.
    generated_ids = ft_model.generate(
        **model_input,
        max_new_tokens=300,
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Texte: Chang'e 4 (du , de "Chang'e", déesse de la Lune dans la mythologie chinoise) est une sonde spatiale lunaire chinoise dont le lancement a eu lieu le . L'engin est une réplique de la sonde lunaire Chang'e 3, lancée en 2013. C'est le engin spatial chinois lancé vers la Lune et le deuxième à s'y poser. Chang'e 4 comprend un atterrisseur et un rover. Les deux engins spatiaux emportent plusieurs instruments dont des caméras, un spectromètre infrarouge pour mesurer la composition du sol à proximité du rover et un radar détectant la structure superficielle du sous-sol ainsi qu'un spectromètre radio pour analyser les éruptions solaires. La mission primaire doit durer 90 jours. 
Relations: [’Chang'e 4’, ’nommé d'après’, ’Chang'e’] | [’Chang'e 4’, ’instance de’, ’sonde spatiale’] | [’Chang'e 4’, ’suit’, ’Chang'e 3’]

Vous êtes un expert en data science et en traitement du langage naturel(NLP). Votre tâche consiste à extraire les triplets du TEXTE fourni ci-dessous. Un triplet de connaissan

In [11]:
# Re-print the source text and the gold relations of the current `example`
# for comparison with the generated output. `example` must already be defined;
# uncomment the next line to select it here.
# example = test_dataset[1]
print("Texte: " + example['text'])
print("Relations: " + get_relation(example) + "\n")

Texte: Chang'e 4 (du , de "Chang'e", déesse de la Lune dans la mythologie chinoise) est une sonde spatiale lunaire chinoise dont le lancement a eu lieu le . L'engin est une réplique de la sonde lunaire Chang'e 3, lancée en 2013. C'est le engin spatial chinois lancé vers la Lune et le deuxième à s'y poser. Chang'e 4 comprend un atterrisseur et un rover. Les deux engins spatiaux emportent plusieurs instruments dont des caméras, un spectromètre infrarouge pour mesurer la composition du sol à proximité du rover et un radar détectant la structure superficielle du sous-sol ainsi qu'un spectromètre radio pour analyser les éruptions solaires. La mission primaire doit durer 90 jours. 
Relations: [’Chang'e 4’, ’nommé d'après’, ’Chang'e’] | [’Chang'e 4’, ’instance de’, ’sonde spatiale’] | [’Chang'e 4’, ’suit’, ’Chang'e 3’]

