pip install -q git+https://github.com/huggingface/trl

In [1]:
import os

# Set TOKENIZERS_PARALLELISM to "false" before any tokenizer is created.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Hub identifiers: the base checkpoint to fine-tune and the instruction dataset.
model_name, dataset_name = (
    "mistralai/Mistral-7B-Instruct-v0.1",
    "pechaut/cairo-instruct",
)

# Name of the fine-tuned model; also used as the training output directory.
new_model = "Mistral-7b-instruct-cairo"

# Hub repository the merged model is pushed to.
hub_name = "pechaut/Mistral-7b-instruct-cairo-instruct"

# Passed to TrainingArguments as max_steps; tweak to get the best out of the model.
max_steps = 1000

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import os
 
# Tokenizer of the base checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": "float16",
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized base model, mapping the root module ("") to device 0.
# NOTE(review): use_auth_token is deprecated in recent transformers releases in
# favour of `token` - confirm against the installed version.
# NOTE(review): trust_remote_code=True allows repository code to run locally;
# confirm it is actually needed for this checkpoint.
base_model_options = {
    "quantization_config": bnb_config,
    "device_map": {"": 0},
    "trust_remote_code": True,
    "use_auth_token": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **base_model_options)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

# Config overrides applied before training: the KV cache is switched off,
# pretraining_tp is set to 1 and a `window` attribute is set to 256.
# NOTE(review): the Mistral config exposes `sliding_window`; `window` may be an
# unused attribute - confirm which one was intended.
model.config.update({"use_cache": False, "pretraining_tp": 1, "window": 256})

# Enable gradient checkpointing, then let PEFT prepare the quantized model for
# k-bit training (order kept as in the original cell).
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


Training

Loading Dataset

In [4]:
from datasets import load_dataset

# Load the dataset once and pick the two splits from the returned mapping.
# The previous version called load_dataset twice with force_redownload, which
# downloaded and regenerated every file twice (once per split).
# verification_mode="no_checks" replaces the deprecated ignore_verifications=True.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]  # the evaluation split is named "eval" in this dataset



Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/230 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/527 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/230 [00:00<?, ? examples/s]

In [5]:



# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

# LoRA hyper-parameters for causal-LM fine-tuning (rank 16, alpha 16, 5% dropout,
# no bias training); inference_mode=False keeps the adapter trainable.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
    inference_mode=False,
)

# Wrap the prepared base model with the LoRA adapter.
model = get_peft_model(model, peft_config)
# Trainer settings, grouped by concern. No eval_steps is given; in the recorded
# run the validation loss was reported every 5 steps, matching logging_steps.
training_arguments = TrainingArguments(
    # Where results go
    output_dir=new_model,
    logging_dir="./logs",
    push_to_hub=True,
    # Schedule
    max_steps=max_steps,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Batching and memory
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    fp16=False,
    # Logging, evaluation and checkpointing cadence
    logging_steps=5,
    evaluation_strategy="steps",
    save_strategy="epoch",
)

# Supervised fine-tuning trainer over the "text" column of both splits,
# without packing, sequences capped at 512 tokens, NEFTune noise alpha 5.
# NOTE(review): `model` was already wrapped by get_peft_model before this call,
# so peft_config here is presumably redundant - confirm against the TRL version.
sft_options = {
    "model": model,
    "tokenizer": tokenizer,
    "args": training_arguments,
    "peft_config": peft_config,
    "train_dataset": dataset_train,
    "eval_dataset": dataset_test,
    "dataset_text_field": "text",
    "max_seq_length": 512,
    "packing": False,
    "neftune_noise_alpha": 5,
}
trainer = SFTTrainer(**sft_options)


Map:   0%|          | 0/230 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



In [6]:

# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()
# Push the training output to the Hub. As the cell's last expression, its
# return value (the repository URL in the recorded run) is what gets displayed.
trainer.push_to_hub()


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
5,4.1833,4.647629
10,4.9228,4.63075
15,3.5035,4.592873
20,3.8733,4.510087
25,3.3104,4.358395
30,3.0202,4.164574
35,3.511,3.956359
40,3.8319,3.716745
45,3.5774,3.487477
50,2.6644,3.300859


adapter_model.bin:   0%|          | 0.00/170M [00:00<?, ?B/s]

'https://huggingface.co/pechaut/Mistral-7b-instruct-cairo/tree/main/'

In [2]:
from peft import AutoPeftModelForCausalLM
import torch
from transformers import  AutoTokenizer

# Reload the trained adapter (and its base model) from the `new_model`
# directory in 4-bit with float16 tensors, letting device_map="auto" place it.
# NOTE(review): `use_low_memory` is not a documented from_pretrained argument -
# confirm whether it has any effect.
adapter_load_options = {
    "device_map": "auto",
    "low_cpu_mem_usage": True,
    "load_in_4bit": True,
    "torch_dtype": torch.float16,
    "use_low_memory": True,
}
model = AutoPeftModelForCausalLM.from_pretrained(new_model, **adapter_load_options)

# The tokenizer is loaded from the same location as the adapter.
tokenizer = AutoTokenizer.from_pretrained(new_model)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Push Model To Hub

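- Make sure to restart the kernel first, then re-run the configuration cell (it defines `model_name`, `new_model` and `hub_name`)
- We reload the base model in float16
- Load the PEFT adapter on top of it
- Merge the adapter into the base weights and push the merged model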

In [2]:

from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
from transformers import  AutoTokenizer, BitsAndBytesConfig

# Tokenizer of the base checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization is explicitly disabled (load_in_4bit=False): the base model is
# loaded in float16 so the adapter can be merged into it below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

# Reload the base model in float16, mapping the root module ("") to device 0.
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_name,
                                             torch_dtype=torch.float16,
                                             trust_remote_code=True,
                                             quantization_config=bnb_config,
                                             use_auth_token=True,
                                             device_map={"": 0},
                                             load_in_8bit=False,
                                             )

# Attach the trained adapter stored under `new_model`, then fold it into the
# base weights.
model_to_merge = PeftModel.from_pretrained(model, new_model)
merged_model = model_to_merge.merge_and_unload()

# Optional local copy of the merged weights:
# merged_model.save_pretrained("cairo-mistral")

# Push the object returned by merge_and_unload(). The previous version pushed
# `model`, which only carries the merged weights if PEFT modified it in place.
merged_model.push_to_hub(hub_name)

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/pechaut/Mistral-7b-instruct-cairo-instruct/commit/0985687ba7e07d01dbcfb3e99766e49c7abbe55d', commit_message='Upload MistralForCausalLM', commit_description='', oid='0985687ba7e07d01dbcfb3e99766e49c7abbe55d', pr_url=None, pr_revision=None, pr_num=None)

Load model from hub for inference

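- If you only need inference, run just this section (after the configuration cell, which defines `model_name` and `hub_name`)
- We load the model from the Hugging Face Hub in 4-bit precision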


In [2]:
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig, BitsAndBytesConfig
from attention_sinks import AutoModelForCausalLM

# Tokenizer of the base checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": "float16",
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the pushed model through the attention_sinks wrapper: 4 sink tokens plus
# a 252-token window, kept low for the sake of faster generation.
inference_options = {
    "quantization_config": bnb_config,
    "device_map": {"": 0},
    "trust_remote_code": True,
    "attention_sink_size": 4,
    "attention_sink_window_size": 252,
}
model = AutoModelForCausalLM.from_pretrained(hub_name, **inference_options)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

Downloading (…)/adapter_config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading adapter_model.bin:   0%|          | 0.00/170M [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(
            in_features=4096, out_features=1024, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
       

In [4]:
from transformers import TextStreamer, GenerationConfig


# Alternative prompts that can be swapped in:
#   "Create an array and append some animal names"
#   "give an exemple of constructor"
#   "create an array 'messages' that contains a u128, a u32, a u256"
#   "create a structure for mailAccount"
#   "create an array of felt and append 1 to the array"
#   "create a felt and affect it a value of 1"
#   "what are spans used for"
#   "How do I know if an array is empty"
#   "what makes Cairo special"
#   "Create an array and append some domestic animal names"
prompt = "create a function for fibonacci"

# Wrap the question in the [INST] ... [/INST] markers with a fixed preamble.
text = f"[INST]I'm working in Cairo. You are a cairo expert answer the question exactly and be concise, answer in less than 200 words: {prompt} [/INST]"

# Tokenize the prompt and move it to the device the model lives on.
input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)

# Tokens are printed through the streamer as they are generated.
streamer = TextStreamer(tokenizer)

# use_cache=True is required; the remaining settings can be changed freely.
# NOTE(review): penalty_alpha is a contrastive-search parameter and may have no
# effect together with do_sample=True - confirm.
sampling_config = GenerationConfig(
    use_cache=True,
    min_new_tokens=100,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.01,
    top_k=50,
    top_p=0.95,
    penalty_alpha=0.6,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.no_grad():
    generated_tokens = model.generate(
        input_ids,
        generation_config=sampling_config,
        streamer=streamer,
    )

# Decode the final generated text (prompt included) without special tokens.
output_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

<s> [INST]I'm working in Cairo. You are a cairo expert answer the question exactly and be concise, answer in less than 200 words: create a function for fibonacci [/INST]
fn fibonacci(n: u64) -> u64 {
   if n == 0 {
       return 0;
   }
   if n == 1 {
       return 1;
   }
   return fibonacci(n - 1) + fibonacci(n - 2);
}

Explanation:
- This function computes the nth Fibonacci number.
- If n is 0, return 0. If n is 1, return 1.
- Otherwise, return the sum of the previous two Fibonacci numbers.

Remarks:
- This implementation is not optimized for large values of n.
- For larger values of n, you may want to use a loop or recursion with tail recursion to optimize the code.
- The function is defined with a parameter n of type u64, which is an 8-bit unsigned int.
- The function returns a u64, which is also an 8-bit unsigned int.
- The function is marked as 'fn' to indicate that it is a function definition.
- The function has a parameter n and returns a u64.
- The function uses pattern match