In [1]:
from transformers.trainer_utils import set_seed
import torch

# Seed value handed to transformers' set_seed helper at the very start of the notebook.
SEED = 6
set_seed(SEED)

In [2]:
from transformers import AutoConfig, MistralConfig


# Start from the published tiny-mistral config and override it so the model is small
# enough to inspect by hand: one layer, 8-dim hidden/intermediate size, 4 query heads
# sharing 2 key/value heads, and a 3-token sliding window.
mistral_config = MistralConfig.from_pretrained(
    "openaccess-ai-collective/tiny-mistral",
    num_hidden_layers=1,
    use_cache=False,
    hidden_size=8,
    num_attention_heads=4,
    output_hidden_states=True,
    num_key_value_heads=2,
    # NOTE(review): `past_key_values` does not show up in the printed config —
    # confirm whether this keyword has any effect.
    past_key_values=True,
    intermediate_size=8,
    sliding_window=3,
    # NOTE(review): `dropout_p` is printed as its own key next to `attention_dropout`;
    # presumably it is not the field the stock model reads — confirm.
    dropout_p=0,
)

# Display the resulting configuration.
mistral_config

MistralConfig {
  "_name_or_path": "./tiny-mistral",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dropout_p": 0,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 8,
  "initializer_range": 0.02,
  "intermediate_size": 8,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 4,
  "num_hidden_layers": 1,
  "num_key_value_heads": 2,
  "output_hidden_states": true,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 3,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.0.dev0",
  "use_cache": false,
  "vocab_size": 32000
}

In [3]:
from transformers import AutoModel

# Instantiate the model from mistral_config alone (only the config is passed here,
# no checkpoint path).
tinymistral = AutoModel.from_config(mistral_config)

In [4]:
from transformers import AutoTokenizer

# Example sentence used as the model input in the cells below.
src_sent = "hi how are you doing"

# The tokenizer is loaded from the Mistral-7B-v0.1 repo, whereas the model config
# is loaded from tiny-mistral.
# NOTE: the variable is spelled `mistal_tokenizer` (sic); later cells rely on this spelling.
mistal_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

In [5]:
# Tokenize the sentence into PyTorch tensors ("input_ids" and "attention_mask").
# The tokenizer is called directly: `encode_plus` is documented as deprecated in
# favour of `__call__`, which produces the same encoding for a single sentence.
tokenized_src_dict = mistal_tokenizer(src_sent, return_tensors="pt")
tokenized_src_dict

{'input_ids': tensor([[    1, 12014,   910,   460,   368,  2548]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [6]:
# Re-displays the tokenizer output; same value as shown by the previous cell.
tokenized_src_dict

{'input_ids': tensor([[    1, 12014,   910,   460,   368,  2548]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [7]:
# Keep just the token-id tensor; for this sentence it displays as a 1 x 6 tensor.
src_tokenized = tokenized_src_dict["input_ids"]
src_tokenized

tensor([[    1, 12014,   910,   460,   368,  2548]])

In [8]:
# Round-trip check: decode the single sequence in the batch back to text.
# Row 0 is indexed explicitly instead of star-unpacking the batch tensor; unpacking
# is only correct for a batch of exactly one row, because any additional rows would
# be passed as decode()'s other positional arguments.
mistal_tokenizer.decode(src_tokenized[0])

'<s> hi how are you doing'

In [9]:
# Shape of the token-id tensor; dimension 1 is used as the sequence length further down.
src_tokenized.shape

torch.Size([1, 6])

In [10]:
from pprint import pprint 

# Pretty-print the tokenizer output; the keys come out in sorted order
# (attention_mask before input_ids).
pprint(tokenized_src_dict)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[    1, 12014,   910,   460,   368,  2548]])}


In [11]:
# Scratch lookup: a bare reference to torch.ones (not a call), so the cell only
# displays the function object.
torch.ones

<function torch._VariableFunctionsClass.ones>

In [12]:
def build_sliding_window_mask(seq_length, window_len):
    """Return a (seq_length, seq_length) float mask for causal sliding-window attention.

    Entry ``[i, j]`` is 1.0 when ``i - window_len + 1 <= j <= i`` (position ``i``
    itself plus the ``window_len - 1`` positions before it) and 0.0 everywhere else.

    Args:
        seq_length: number of positions; the mask is square with this side length.
        window_len: number of positions, including the current one, left unmasked
            in each row.

    Returns:
        A float tensor of ones and zeros with shape ``(seq_length, seq_length)``.
    """
    # Lower triangle (diagonal included): no position sees anything after itself.
    causal = torch.tril(torch.ones(seq_length, seq_length))
    # Dropping everything below the -(window_len - 1) diagonal removes positions
    # that are more than window_len - 1 steps behind the current one.
    return torch.triu(causal, diagonal=-(window_len - 1))


seq_length = src_tokenized.shape[1]
# Same value as the `sliding_window=3` override given to the model config.
sliding_window_len = 3

sliding_window_mask = build_sliding_window_mask(seq_length, sliding_window_len)

sliding_window_mask

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [0., 1., 1., 1., 0., 0.],
        [0., 0., 1., 1., 1., 0.],
        [0., 0., 0., 1., 1., 1.]])

In [13]:
# Give the mask two leading singleton axes so it becomes (1, 1, seq, seq)
# (presumably the batch and head axes the model expects — confirm).
# Reshaping from the last two dims, rather than calling unsqueeze(0) twice, keeps
# this cell idempotent: re-running it on an already 4-D mask leaves the shape
# unchanged instead of growing it to 6-D.
sliding_window_mask = sliding_window_mask.reshape(1, 1, *sliding_window_mask.shape[-2:])
sliding_window_mask.shape, sliding_window_mask

(torch.Size([1, 1, 6, 6]),
 tensor([[[[1., 0., 0., 0., 0., 0.],
           [1., 1., 0., 0., 0., 0.],
           [1., 1., 1., 0., 0., 0.],
           [0., 1., 1., 1., 0., 0.],
           [0., 0., 1., 1., 1., 0.],
           [0., 0., 0., 1., 1., 1.]]]]))

In [14]:
# Replace the tokenizer's attention mask with the 4-D sliding-window mask.
# NOTE: this overwrites the entry in place, so the earlier displays of
# tokenized_src_dict no longer reflect its current contents.
tokenized_src_dict["attention_mask"] = sliding_window_mask

pprint(tokenized_src_dict)


{'attention_mask': tensor([[[[1., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0.],
          [0., 1., 1., 1., 0., 0.],
          [0., 0., 1., 1., 1., 0.],
          [0., 0., 0., 1., 1., 1.]]]]),
 'input_ids': tensor([[    1, 12014,   910,   460,   368,  2548]])}


In [15]:
# Forward pass: the token ids and the custom mask are unpacked as keyword arguments.
# NOTE(review): the text captured under this cell ("sdpa", "LLAMA DECODER FWD START", ...)
# is not printed by any code in this notebook — presumably debug prints in the
# installed transformers build; confirm.
output = tinymistral(**tokenized_src_dict)

sdpa

###########################################################################
LLAMA DECODER FWD START

Attention mask =  tensor([[[[ 0.0000e+00, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38,
           -3.4028e+38],
          [ 0.0000e+00,  0.0000e+00, -3.4028e+38, -3.4028e+38, -3.4028e+38,
           -3.4028e+38],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38, -3.4028e+38,
           -3.4028e+38],
          [-3.4028e+38,  0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38,
           -3.4028e+38],
          [-3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           -3.4028e+38],
          [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00,
            0.0000e+00]]]])

Input (hidden states) =  tensor([[[-0.0074, -0.0254,  0.0067,  0.0112,  0.0315,  0.0374,  0.0165,
          -0.0181],
         [-0.0131,  0.0218,  0.0225,  0.0525,  0.0102,  0.0363,  0.0032,
          -0.0137],
         [-0.0309,  0.0209, -0.0214,  0.0265, -0.0147,

In [17]:
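# NOTE: disabled reference snippet (text generation with MistralForCausalLM); nothing in this cell is executed.
# from transformers import AutoTokenizer, MistralForCausalLM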

# model = MistralForCausalLM.from_pretrained("openaccess-ai-collective/tiny-mistral")


# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# prompt = "Hey, are you conscious? Can you talk to me?"
# inputs = tokenizer(prompt, return_tensors="pt")

# # Generate
# generate_ids = model.generate(inputs.input_ids, max_length=30)
# tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]