In [1]:
import os

# Turn off the tokenizers library's own parallelism (set before any tokenizer is created).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hugging Face Hub id of the base model to fine-tune.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Hub id of the fine-tuned PEFT adapter.
new_model = "StarkWizard/Mistral-7b-instruct-cairo-PEFT"

# Hub id the merged model is pushed to and later loaded from for inference.
hub_name = "StarkWizard/Mistral-7b-instruct-cairo-instruct"

# Number of training steps; tweak to get the best out of the model.
max_steps = 1000

Push Model To Hub

- Restart the kernel first, then re-run the configuration cell above
- Reload the base model
- Load the PEFT adapter
- Merge the adapter into the base model and push the merged model

NOTE:

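We load the base model on the CPU in float16, without quantization, because merging into a quantized model fails with the error: "Cannot merge LORA layers when the model is loaded in 8-bit mode".
Merging is fast, so running it on the CPU is not an issue.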

In [2]:

from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
from transformers import  AutoTokenizer, BitsAndBytesConfig

# Tokenizer of the base model; it is pushed alongside the merged weights below.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in float16 on the CPU. No quantization config is passed on
# purpose: the LoRA layers cannot be merged into a quantized model (the unused
# `bnb_config` that used to be built here was removed for that reason).
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# Attach the fine-tuned PEFT adapter to the base model.
model_to_merge = PeftModel.from_pretrained(
    model,
    new_model,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Merge the adapter into the base model and get back the plain merged model.
merged_model = model_to_merge.merge_and_unload()

# Optional local copy of the merged weights:
# merged_model.save_pretrained("cairo-mistral")

# Upload the merged weights in shards of at most 1GB, then the tokenizer.
merged_model.push_to_hub(hub_name, max_shard_size="1GB")
tokenizer.push_to_hub(hub_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading adapter_model.bin:   0%|          | 0.00/170M [00:00<?, ?B/s]

pytorch_model-00015-of-00015.bin:   0%|          | 0.00/816M [00:00<?, ?B/s]

pytorch_model-00012-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00013-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00011-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00014-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Upload 15 LFS files:   0%|          | 0/15 [00:00<?, ?it/s]

pytorch_model-00010-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00009-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00008-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00007-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00006-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00005-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00004-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00003-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00002-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00001-of-00015.bin:   0%|          | 0.00/900M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StarkWizard/Mistral-7b-instruct-cairo-instruct/commit/46794d4cba53428aa08c1c263eb110e34e5d413d', commit_message='Upload tokenizer', commit_description='', oid='46794d4cba53428aa08c1c263eb110e34e5d413d', pr_url=None, pr_revision=None, pr_num=None)

Load model from hub for inference

- If you only need inference, run the configuration cell and then this cell
- We load the merged model from the Hugging Face Hub, quantized to 4 bits


In [3]:
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig, BitsAndBytesConfig
from attention_sinks import AutoModelForCausalLM

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer is taken from the base model; EOS doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the model published under `hub_name` onto device 0, quantized with
# `bnb_config` and with the attention-sink options below.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=hub_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    attention_sink_size=4,
    attention_sink_window_size=252,  # <- Low for the sake of faster generation
)

# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00015.bin:   0%|          | 0.00/900M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00015.bin:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

[Attention Sinks] Injected Position Shifting into 32 attention classes.
[Attention Sinks] Injected Attention Sink KV Cache into 1 model class.


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [17]:
from transformers import TextStreamer, GenerationConfig


# Example questions: uncomment one of them to use it instead of the active prompt below.
#prompt = "Create an array and append some animal names"
#prompt = "give an example of constructor"
#prompt="create an array 'messages' that contains a u128, a u32, a u256"
#prompt = "create a structure for mailAccount"
#prompt = "create an array of felt and append 1 to the array"
#prompt = "create a felt and affect it a value of 1"
#prompt="In the example 'let y = { let x = 3; x + 1 };', what is the value of y? and explain why, give a similar sample"
#prompt="How can you print a variable's value in Cairo? Give a full sample using an array"
#prompt = "What type of operations can be considered expressions in Cairo? Give an example."
#prompt = "What is a characteristic feature of Felt252Dict<T> when interacting with it? Give a sample"
#prompt = "what are spans used for"
#prompt = "How do I know if an array is empty"
#prompt = "what makes Cairo special"
#prompt = "Create an array and append some domestic animal names"
#prompt="write a contract that computes the fibonacci of caller's address and explain the weakness of the program"
prompt="write a hello world function. What are the risks?"

# Wrap the question and the fixed system-style instruction in [INST] ... [/INST] markers.
text =f"[INST]I'm working in Cairo. You are a cairo expert answer the question exactly and be concise, answer in less than 200 words: {prompt} [/INST]"

# Tokenize the prompt and move the ids to the device the model lives on.
input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)

with torch.no_grad():
    # Streams the text to the cell output while it is being generated.
    streamer = TextStreamer(tokenizer)
    generated_tokens = model.generate(
        input_ids,
        generation_config=GenerationConfig(
            # use_cache=True is required, the rest can be changed up.
            use_cache=True,
            min_new_tokens=100,
            max_new_tokens=1000,
            # Decoding strategy: sampling. `penalty_alpha` was removed from this
            # config because it only applies to contrastive search, which is
            # selected when do_sample is False; with do_sample=True it had no effect.
            do_sample=True,
            top_k=50,
            top_p=0.95,
            # Very low temperature keeps the sampling close to deterministic.
            temperature=0.01,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        ),
        streamer=streamer,
    )
    # Decode the final generated text
    output_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

<s> [INST]I'm working in Cairo. You are a cairo expert answer the question exactly and be concise, answer in less than 200 words: write a hello world function. What are the risks? 

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 11.90 GiB of which 108.25 MiB is free. Process 2424 has 6.80 GiB memory in use. Including non-PyTorch memory, this process has 4.90 GiB memory in use. Of the allocated memory 4.58 GiB is allocated by PyTorch, and 162.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF