In [1]:
import torch
import transformers
import sys
sys.path.append('..')
import structformer
import data_ptb_subword

In [20]:
pretrained_model = torch.load('../trained_models/babylm_1111_sf_2.pt')[0]

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained('omarmomen/babylm_bpe_tokenizer_16k')

In [4]:
config = structformer.StructFormerConfig(
    hidden_size=pretrained_model.hidden_size,
    nlayers=pretrained_model.nlayers,
    ntokens=pretrained_model.ntokens,
    nhead=pretrained_model.nhead,
    pos_emb=True if pretrained_model.pos_emb else False,
    pad=pretrained_model.pad,
    n_parser_layers=pretrained_model.n_parse_layers,
    relations=pretrained_model.relations,
    weight_act=pretrained_model.weight_act
    )

In [5]:
model = structformer.StructFormerModel(config).to('cuda')

In [6]:
dataset = data_ptb_subword.SubWord_Corpus_Custom("omarmomen/babylm_bpe_tokenizer_16k", "../data/babylm_10M/")

Tokenizing data...


1015494it [00:45, 22445.54it/s]


Tokenizing data...


96105it [00:04, 20769.40it/s]


Tokenizing data...


95681it [00:04, 19893.30it/s]


In [7]:
output1 = model(torch.LongTensor([dataset.test[0]]).to('cuda'))

In [8]:
output1['logits'].shape

torch.Size([1, 8, 16000])

In [10]:
tokenizer.decode(dataset.test[0])

'All round  Street and that way.'

In [11]:
tokenizer.decode(output1['logits'].argmax(-1)[0])

'CheckFireCheck localitiesCheckianWhoever device'

In [12]:
output1.loss

In [21]:
loaded_state_dict = pretrained_model.state_dict()

In [22]:
loaded_state_dict = {f"model.{k}": v for k, v in loaded_state_dict.items()}

In [23]:
model.load_state_dict(loaded_state_dict)

<All keys matched successfully>

In [16]:
output2 = model(torch.LongTensor([dataset.test[0]]).to('cuda'), labels=torch.LongTensor([dataset.test[0]]).to('cuda'))

In [17]:
tokenizer.decode(dataset.test[0])

'All round  Street and that way.'

In [18]:
tokenizer.decode(output2['logits'].argmax(-1)[0])

'All round  Street, that way.'

In [19]:
output2.loss

tensor(0.6461, device='cuda:0', grad_fn=<NllLossBackward0>)

In [27]:
structformer.StructFormerConfig.register_for_auto_class()

In [28]:
structformer.StructFormerModel.register_for_auto_class("AutoModelForMaskedLM")

In [31]:
structformer.StructFormerModelForSequenceClassification.register_for_auto_class("AutoModelForMaskedLM")

In [24]:
config.save_pretrained('../saved_hf_models/sf_babylm_1')

In [25]:
tokenizer.save_pretrained('../saved_hf_models/sf_babylm_1')

('../saved_hf_models/sf_babylm_1/tokenizer_config.json',
 '../saved_hf_models/sf_babylm_1/special_tokens_map.json',
 '../saved_hf_models/sf_babylm_1/vocab.json',
 '../saved_hf_models/sf_babylm_1/merges.txt',
 '../saved_hf_models/sf_babylm_1/added_tokens.json',
 '../saved_hf_models/sf_babylm_1/tokenizer.json')

In [26]:
model.save_pretrained('../saved_hf_models/sf_babylm_1')

In [27]:
structformer.StructFormerConfig.register_for_auto_class()

In [28]:
structformer.StructFormerModel.register_for_auto_class("AutoModelForMaskedLM")

In [29]:
structformer.StructFormerModelForSequenceClassification.register_for_auto_class("AutoModelForMaskedLM")

In [30]:
tokenizer.push_to_hub("sf_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/sf_babylm_1 into local empty directory.
To https://huggingface.co/omarmomen/sf_babylm_1
   6425019..257e44a  main -> main



'https://huggingface.co/omarmomen/sf_babylm_1/commit/257e44ae8dccb0f3184824752e1166e343586394'

In [31]:
model.push_to_hub("sf_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/sf_babylm_1 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/159M [00:00<?, ?B/s]

To https://huggingface.co/omarmomen/sf_babylm_1
   257e44a..4ef095f  main -> main



'https://huggingface.co/omarmomen/sf_babylm_1/commit/4ef095f84ae19d50ad8da3ea0aa207fc6bbccfe5'

In [43]:
tokenizer = transformers.AutoTokenizer.from_pretrained('omarmomen/sf_babylm_1')
model = transformers.AutoModelForMaskedLM.from_pretrained('omarmomen/sf_babylm_1', trust_remote_code=True)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/159M [00:00<?, ?B/s]

In [46]:
model.to('cuda')

StructFormerModel(
  (model): StructFormer(
    (drop): Dropout(p=0.1, inplace=False)
    (emb): Embedding(16000, 512)
    (pos_emb): Embedding(500, 512)
    (layers): ModuleList(
      (0): TransformerLayer(
        (self_attn): MultiheadAttention(
          (drop): Dropout(p=0.1, inplace=False)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (feedforward): Sequential(
          (0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (1): Linear(in_features=512, out_features=2048, bias=True)
          (2): LeakyReLU(negative_slope=0.01)
          (3): Dropout(p=0.1, inplace=False)
          (4): Linear(in_features=2048, out_features=512, bias=True)
        )
        (norm): LayerNorm((512,), eps=1e-05, elementwis

In [55]:
output = model(torch.LongTensor([dataset.test[0]]).to('cuda'), labels=torch.LongTensor([dataset.test[0]]).to('cuda'))

In [56]:
tokenizer.decode(dataset.test[0])

'All round  Street and that way.'

In [57]:
tokenizer.decode(output['logits'].argmax(-1)[0])

'All round  Street, that way.'

In [51]:
output.loss

tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward0>)

In [62]:
tokenizer = transformers.AutoTokenizer.from_pretrained('omarmomen/sf_babylm_1')
model     = transformers.AutoModelForSequenceClassification.from_pretrained('omarmomen/sf_babylm_1', trust_remote_code=True)

Downloading:   0%|          | 0.00/774 [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Some weights of StructFormerModelForSequenceClassification were not initialized from the model checkpoint at omarmomen/sf_babylm_1 and are newly initialized: ['model.classifier.out_proj.bias', 'model.classifier.dense.weight', 'model.classifier.out_proj.weight', 'model.classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
model.to('cuda')

StructFormerModelForSequenceClassification(
  (model): StructFormer(
    (drop): Dropout(p=0.1, inplace=False)
    (emb): Embedding(16000, 512)
    (pos_emb): Embedding(500, 512)
    (layers): ModuleList(
      (0): TransformerLayer(
        (self_attn): MultiheadAttention(
          (drop): Dropout(p=0.1, inplace=False)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (feedforward): Sequential(
          (0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (1): Linear(in_features=512, out_features=2048, bias=True)
          (2): LeakyReLU(negative_slope=0.01)
          (3): Dropout(p=0.1, inplace=False)
          (4): Linear(in_features=2048, out_features=512, bias=True)
        )
        (norm): LayerNorm((512

In [65]:
output = model(torch.LongTensor([dataset.test[0]]).to('cuda'))

In [67]:
output.logits

tensor([[ 0.0475, -0.0517]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
tokenizer.decode(dataset.test[0])

'All round  Street and that way.'

In [None]:
tokenizer.decode(output['logits'].argmax(-1)[0])

'All round  Street, that way.'

In [None]:
output.loss

tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward0>)

In [1]:
import torch
import transformers
import sys
sys.path.append('..')
import structformer_in_parser
import data_ptb_subword

In [2]:
pretrained_model = torch.load('../trained_models/babylm_1111_in_parser_sf.pt')[0]

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained('omarmomen/babylm_bpe_tokenizer_16k')

In [9]:
config = structformer_in_parser.StructFormer_In_ParserConfig(
    hidden_size=pretrained_model.transformer_front.hidden_size,
    front_layers=pretrained_model.transformer_front.nlayers,
    rear_layers=pretrained_model.transformer_rear.nlayers,
    ntokens=pretrained_model.transformer_front.ntokens,
    nhead=pretrained_model.transformer_front.nhead,
    pos_emb=True if pretrained_model.transformer_front.pos_emb else False,
    n_parser_layers=pretrained_model.n_parse_layers,
    pad=pretrained_model.transformer_front.pad
    )

In [10]:
model = structformer_in_parser.StructFormer_In_ParserModel(config).to('cuda')

In [11]:
loaded_state_dict = pretrained_model.state_dict()

In [12]:
loaded_state_dict = {f"model.{k}": v for k, v in loaded_state_dict.items()}

In [13]:
model.load_state_dict(loaded_state_dict)

<All keys matched successfully>

In [14]:
structformer_in_parser.StructFormer_In_ParserConfig.register_for_auto_class()

In [15]:
structformer_in_parser.StructFormer_In_ParserModel.register_for_auto_class("AutoModelForMaskedLM")

In [16]:
structformer_in_parser.StructFormer_In_ParserModelForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")

In [17]:
config.save_pretrained('../saved_hf_models/sf_ip_babylm_1')

In [18]:
tokenizer.save_pretrained('../saved_hf_models/sf_ip_babylm_1')

('../saved_hf_models/sf_ip_babylm_1/tokenizer_config.json',
 '../saved_hf_models/sf_ip_babylm_1/special_tokens_map.json',
 '../saved_hf_models/sf_ip_babylm_1/vocab.json',
 '../saved_hf_models/sf_ip_babylm_1/merges.txt',
 '../saved_hf_models/sf_ip_babylm_1/added_tokens.json',
 '../saved_hf_models/sf_ip_babylm_1/tokenizer.json')

In [19]:
model.save_pretrained('../saved_hf_models/sf_ip_babylm_1')

In [20]:
structformer_in_parser.StructFormer_In_ParserConfig.register_for_auto_class()

In [21]:
structformer_in_parser.StructFormer_In_ParserModel.register_for_auto_class("AutoModelForMaskedLM")

In [22]:
structformer_in_parser.StructFormer_In_ParserModelForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")

In [23]:
tokenizer.push_to_hub("sf_ip_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/sf_ip_babylm_1 into local empty directory.
To https://huggingface.co/omarmomen/sf_ip_babylm_1
   b0d898e..8c4e1ea  main -> main



'https://huggingface.co/omarmomen/sf_ip_babylm_1/commit/8c4e1eaffe20356e67b6814ef30602843c78330d'

In [24]:
model.push_to_hub("sf_ip_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/sf_ip_babylm_1 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/159M [00:00<?, ?B/s]

To https://huggingface.co/omarmomen/sf_ip_babylm_1
   8c4e1ea..a1cab64  main -> main



'https://huggingface.co/omarmomen/sf_ip_babylm_1/commit/a1cab646eec7d5201750aec75806ebdea575cf2b'

In [None]:
import torch
import transformers
import sys
sys.path.append('..')
import structformer
import data_ptb_subword

In [None]:
pretrained_model = torch.load('../trained_models/babylm_1111_tf.pt')[0]

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained('omarmomen/babylm_bpe_tokenizer_16k')

In [None]:
config = structformer.TransformerConfig(
    hidden_size=pretrained_model.hidden_size,
    nlayers=pretrained_model.nlayers,
    ntokens=pretrained_model.ntokens,
    nhead=pretrained_model.nhead,
    pos_emb=True if pretrained_model.pos_emb else False,
    pad=pretrained_model.pad
    )

In [None]:
model = structformer.TransformerModel(config).to('cuda')

In [None]:
loaded_state_dict = pretrained_model.state_dict()

In [None]:
loaded_state_dict = {f"model.{k}": v for k, v in loaded_state_dict.items()}

In [None]:
model.load_state_dict(loaded_state_dict)

<All keys matched successfully>

In [None]:
structformer.TransformerConfig.register_for_auto_class()

In [None]:
structformer.TransformerModel.register_for_auto_class("AutoModelForMaskedLM")

In [None]:
structformer.TransformerModelForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")

In [None]:
config.save_pretrained('../saved_hf_models/tf_babylm_1')

In [None]:
tokenizer.save_pretrained('../saved_hf_models/tf_babylm_1')

('../saved_hf_models/tf_babylm_1/tokenizer_config.json',
 '../saved_hf_models/tf_babylm_1/special_tokens_map.json',
 '../saved_hf_models/tf_babylm_1/vocab.json',
 '../saved_hf_models/tf_babylm_1/merges.txt',
 '../saved_hf_models/tf_babylm_1/added_tokens.json',
 '../saved_hf_models/tf_babylm_1/tokenizer.json')

In [None]:
model.save_pretrained('../saved_hf_models/tf_babylm_1')

In [None]:
structformer.TransformerConfig.register_for_auto_class()

In [None]:
structformer.TransformerModel.register_for_auto_class("AutoModelForMaskedLM")

In [None]:
structformer.TransformerModelForSequenceClassification.register_for_auto_class("AutoModelForSequenceClassification")

In [None]:
tokenizer.push_to_hub("tf_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/tf_babylm_1 into local empty directory.
To https://huggingface.co/omarmomen/tf_babylm_1
   e7f5a03..bfdf249  main -> main



'https://huggingface.co/omarmomen/tf_babylm_1/commit/bfdf2495895a1fe407268f60566fa515bb364526'

In [None]:
model.push_to_hub("tf_babylm_1", use_temp_dir=True)

Cloning https://huggingface.co/omarmomen/tf_babylm_1 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 1.00/129M [00:00<?, ?B/s]

To https://huggingface.co/omarmomen/tf_babylm_1
   bfdf249..c799e10  main -> main



'https://huggingface.co/omarmomen/tf_babylm_1/commit/c799e1058ce933bd3661d89d5f274640b3bd5bf6'