<a href="https://colab.research.google.com/github/Slebbon/TextGeneration_Projet_PSL_EnC/blob/main/quantized_Mistral_7B_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install autoawq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from datasets import load_dataset, DatasetDict

In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj","k_proj",'v_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3756789760 || trainable%: 0.12560170521759514


  ## Parameters & Setup

In [7]:
#setup parameters regarding GPU availibility on the machine and recycle used memory
import torch;
import gc;

is_gpu_available = torch.cuda.is_available()
device = 'cuda' if is_gpu_available else 'cpu'
if is_gpu_available:
    print("GPU available for notebook")
    torch.cuda.empty_cache()
    print("GPU Memory cleaned")
else:
    print("No GPU available for notebook")

gc.collect()


GPU available for notebook
GPU Memory cleaned


0

## Dataset

In [8]:
dataset = load_dataset("/content/", data_files={
    'train': f'train.csv',
    'validation': f'validation.csv',
    'test': f'test.csv'
})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Text', 'Author'],
        num_rows: 21054
    })
    validation: Dataset({
        features: ['Text', 'Author'],
        num_rows: 4513
    })
    test: Dataset({
        features: ['Text', 'Author'],
        num_rows: 4512
    })
})


  ## Tokenization

In [9]:
#from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, AutoModelForCausalLM

#model = AutoModelForCausalLM.from_pretrained(pretrained_model)
# Get the maximum context size
max_length = model.config.max_position_embeddings
print(f"Maximum context size: {max_length}")

Maximum context size: 32768


## Get % of data

In [10]:
train_10 = dataset['train'].train_test_split(test_size=0.10)['test']
dataset['train'] = train_10

In [11]:
valid_10 = dataset['validation'].train_test_split(test_size=0.10)['test']
dataset['validation'] = valid_10

In [12]:
test_10 = dataset['test'].train_test_split(test_size=0.10)['test']
dataset['test'] = test_10

## Tokenize

In [13]:
#from transformers import GPT2Tokenizer
#tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    #model_max_length=512,
    padding_side="left",
    add_eos_token=True)

def tokenize_function(examples):
    return tokenizer(examples["Text"],max_length=max_length)


# Apply the tokenization function to the entire dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/2106 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/452 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

In [14]:
sample_token = tokenizer.encode("Live long and prosper.")
print(sample_token)

[1, 11635, 1043, 304, 20814, 28723, 2]


## Data Collator

In [15]:
#We need to create data collator to manage the batches, we can use DataCollatorForLanguageModeling
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)
# Iterate over the generator
out = data_collator([tokenized_dataset["train"][i] for i in range(1)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([1, 13])
attention_mask shape: torch.Size([1, 13])
labels shape: torch.Size([1, 13])


## Setup the Trainer

In [16]:
#Now we train the model using the Trainer API
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    'outputs',
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    bf16=True,
    optim="paged_adamw_8bit",
    save_steps=0.1,
    eval_steps=0.1,
    save_total_limit=5,
    resume_from_checkpoint=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

## Evaluate the Performance of the Base Model

In [17]:
import math

#Calculate and report on perplexity
initial_results = trainer.evaluate()
print(initial_results)
#log the results to file
#logger.info(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")

KeyboardInterrupt: 

In [None]:
print(f"Baseline results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")


In [None]:
#setup our test prompts
test_prompt = "What is the meaning of life?"
test_prompt2 = "Where did that planet go??"
test_prompt3 = "What is the best way to cook a turkey?"

In [19]:
#Use the model in a pipeline to generate text.
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


result = text_generator(test_prompt, max_length=50, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalL

Baseline generated result: What is the meaning of life?...What is the meaning of life?

The meaning of life is to live.

## What is the meaning of life in a sentence?

The meaning of life is to live.

## What is the meaning of life in


In [20]:
result = text_generator(test_prompt2, max_length=50, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt2}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}")

Baseline generated result: Where did that planet go??...Where did that planet go???

I’m not sure if I’m going to be able to get back to this one. I’m not sure if I’m going to be able to get back to any of my projects


In [21]:
result = text_generator(test_prompt3, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt3}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}")

Baseline generated result: What is the best way to cook a turkey?...What is the best way to cook a turkey?

The best way to cook a turkey is to roast it in the oven. This method is simple and easy to follow, and it produces a delicious and juicy bird.

## What is the best way to cook a turkey breast?

There are many ways to cook a turkey breast, but the best way is to roast it in the oven. This method is simple and easy to follow, and


## Fine-Tune the Model

In [18]:
trainer.train(resume_from_checkpoint=True)

	eval_steps: 0.1 (from args) != 211 (from trainer_state.json)
	save_steps: 0.1 (from args) != 211 (from trainer_state.json)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
1899,3.8193,3.731759




TrainOutput(global_step=2106, training_loss=0.7262907834247759, metrics={'train_runtime': 746.3397, 'train_samples_per_second': 2.822, 'train_steps_per_second': 2.822, 'total_flos': 1735355477458944.0, 'train_loss': 0.7262907834247759, 'epoch': 1.0})

## Load model from checkpoint and run tests

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
!cp -r /content/outputs/checkpoint-2106 /content/drive/MyDrive

In [None]:
del model

In [26]:
del trainer

In [22]:
torch.cuda.empty_cache()

In [25]:
base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [29]:
from peft import PeftModel, PeftConfig

In [30]:
model = PeftModel.from_pretrained(model, '/content/outputs/checkpoint-1688', is_trainable=True)

In [31]:
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3756789760 || trainable%: 0.12560170521759514


In [32]:
tokenizer = AutoTokenizer.from_pretrained('/content/outputs/checkpoint-1688')

In [23]:
#setup our test prompts
test_prompt = "What is the meaning of life?"
test_prompt2 = "Where did that planet go??"
test_prompt3 = "What is the best way to cook a turkey?"

In [24]:
#Use the model in a pipeline to generate text.
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


result = text_generator(test_prompt, max_length=50, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalL

Baseline generated result: What is the meaning of life?...What is the meaning of life?

The meaning of life is to give life meaning.

The meaning of life is to give life meaning.

The meaning of life is to give life meaning.

The meaning of life is


In [25]:
result = text_generator(test_prompt2, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt2}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt2}...{result[0]['generated_text']}")

Baseline generated result: Where did that planet go??...Where did that planet go??? #Trump2016 https://t.co/7777777777 https://t.co/7777777777 https://t.co/7777777777 https://t.co/7777777777 https://t.co/7777777777 https://t.co/


In [27]:
result = text_generator(test_prompt3, max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {test_prompt3}...{result[0]['generated_text']}")
#logger.info(f"Baseline {pretrained_model} generated result: {test_prompt3}...{result[0]['generated_text']}")

Baseline generated result: What is the best way to cook a turkey?...What is the best way to cook a turkey?

The best way to cook a turkey is to roast it in the oven.

### What is the best way to cook a turkey?

The best way to cook a turkey is to roast it in the oven.

### What is the best way to cook a turkey?

The best way to cook a turkey is to roast it in the oven.

### What


In [26]:
result = text_generator('Oh Sir, how are you?', max_length=100, num_return_sequences=1,temperature=1)
print(f"Baseline generated result: {'Oh Sir, how are you?'}...{result[0]['generated_text']}")

Baseline generated result: Oh Sir, how are you?...Oh Sir, how are you? I am glad to see you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have been looking for you. I have


## Evaluate the Performance of the Fine-Tuned Model

In [37]:
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

In [38]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    'outputs',
    eval_strategy="steps",
    save_strategy="steps",
    learning_rate=2e-4,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    bf16=True,
    optim="paged_adamw_8bit",
    save_steps=0.1,
    eval_steps=0.1,
    save_total_limit=5,
    resume_from_checkpoint=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

In [28]:
#Calculate and report on perplexity
eval_results = trainer.evaluate()

In [29]:
eval_results

{'eval_loss': 3.720257043838501,
 'eval_runtime': 217.0194,
 'eval_samples_per_second': 2.083,
 'eval_steps_per_second': 2.083,
 'epoch': 1.0}

In [30]:
import math
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = perplexity

#logger.info(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}")
print(f"Fine-tuned results: Perplexity: {perplexity:.2f}")

Fine-tuned results: Perplexity: 41.28
