In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
import torch
import random
import json
from tqdm.auto import tqdm
import pandas as pd
from pathlib import Path

import transformers

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [3]:
from codet5_finetune.options import options 
from codet5_finetune.data import DataCollatorNTP

In [4]:
# Resolve the notebook's working directory in pure Python instead of shelling
# out to `pwd`, so the cell also works on non-POSIX systems and outside IPython.
cwd = Path.cwd()
# The shared config lives in the package directory next to this notebook's folder.
opt = options({'common_config': cwd.parent / 'codet5_finetune/common_config.yaml'})

In [5]:
# Display the configured on-disk location of the filtered Java subset.
opt.path_java_filtered_subset

'/repo_data/the_stack11_dedup_alt_comments_no_1K_set_subset/data'

In [6]:
# Load the dataset previously saved to disk at the configured path;
# later cells access its 'train' split.
ds = datasets.load_from_disk(opt.path_java_filtered_subset)

In [7]:
# Number of examples in the train split.
len(ds['train'])

300000

In [8]:
# Tokenizer for the base checkpoint named in the config; the same name is
# used later to load the model, so tokenizer and model stay in sync.
tokenizer = AutoTokenizer.from_pretrained(opt.base_model_name)

In [9]:
# Batch collator for the training task, configured entirely from `opt`.
# NOTE(review): exact meaning of the min/max length options is defined in
# codet5_finetune.data.DataCollatorNTP — confirm there.
data_collator = DataCollatorNTP(
    tokenizer,
    # encoder-side length settings
    encoder_seq_length=opt.encoder_seq_length,
    min_encoder_seq_length=opt.min_encoder_seq_length,
    # decoder-side length settings
    decoder_seq_length=opt.decoder_seq_length,
    min_decoder_seq_length=opt.min_decoder_seq_length,
)

In [10]:
# Smoke-test the collator on the first three training examples.
res = data_collator([ds['train'][idx] for idx in range(3)])

Token indices sequence length is longer than the specified maximum sequence length for this model (3162 > 512). Running this sequence through the model will result in indexing errors
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [11]:
# Debugging aid: list the collator's attributes to check which settings it stores.
dir(data_collator)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'decoder_seq_length',
 'encoder_seq_length',
 'min_decoder_seq_length',
 'min_encoder_seq_length',
 'tokenizer']

In [12]:
# Sanity check: shape of the `labels` tensor produced for the 3-example batch.
res['labels'].shape

torch.Size([3, 512])

In [13]:
# Checkpoints are written to <model_dir_base>/<trained_model_name>/<experiment_name>.
# The base directory is normalised to a Path so the `/` operator can be used.
opt.model_dir_base = Path(opt.model_dir_base)
model_dir = opt.model_dir_base / opt.trained_model_name / opt.experiment_name

args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    # --- optimisation ---
    num_train_epochs=opt.num_train_epochs,
    # Original author's note: 4e-6 with several epochs would have been ideal.
    learning_rate=opt.learning_rate,
    weight_decay=opt.weight_decay,
    fp16=opt.fp16,
    per_device_train_batch_size=opt.per_device_train_batch_size,
    per_device_eval_batch_size=opt.per_device_eval_batch_size,
    # --- logging ---
    logging_strategy=opt.logging_strategy,
    logging_steps=opt.logging_steps,
    report_to=opt.report_to,
    # --- checkpointing ---
    save_strategy=opt.save_strategy,
    save_steps=opt.save_steps,
    save_total_limit=opt.save_total_limit,
    load_best_model_at_end=False,
    # Keep all dataset columns so the custom collator receives the raw examples
    # (presumably required by DataCollatorNTP — verify against its implementation).
    remove_unused_columns=False,
)

In [14]:
# Training uses the 'train' split only; the trainer is built without an eval dataset.
train_dataset = ds['train']

In [15]:
def model_init():
    """Return a freshly loaded copy of the pretrained base seq2seq model.

    Passed to the trainer as a factory (instead of a model instance), so the
    trainer decides when the model is instantiated.
    """
    base_model = AutoModelForSeq2SeqLM.from_pretrained(opt.base_model_name)
    return base_model


# Training-only setup: no evaluation dataset and no metric function are supplied.
trainer = Seq2SeqTrainer(
    args=args,
    model_init=model_init,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=None,
    compute_metrics=None,
)

loading configuration file config.json from cache at /home/toolkit/.cache/huggingface/hub/models--Salesforce--codet5-base/snapshots/4078456db09ba972a3532827a0b5df4da172323c/config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_para

In [16]:
# Start fine-tuning. Logging and checkpoint cadence follow the strategy/step
# settings passed to Seq2SeqTrainingArguments; checkpoints land under `model_dir`.
trainer.train()

loading configuration file config.json from cache at /home/toolkit/.cache/huggingface/hub/models--Salesforce--codet5-base/snapshots/4078456db09ba972a3532827a0b5df4da172323c/config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_para

Step,Training Loss
100,2.8991
200,2.0711
300,1.8892
400,1.8001
500,1.7598
600,1.6997
700,1.6449
800,1.6604
900,1.6297
1000,1.6376


Saving model checkpoint to /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200
Configuration saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200/config.json
Configuration saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200/generation_config.json
Model weights saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200/pytorch_model.bin
tokenizer config file saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200/tokenizer_config.json
Special tokens file saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-200/special_tokens_map.json
Saving model checkpoint to /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-400
Configuration saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-java/tests/checkpoint-400/config.json
Configuration saved in /repo_data/finetuning_checkpoints/codet5-base-ntp-ja

KeyboardInterrupt: 

In [None]:
# Inspect a single training example by row index, using the dataset's public
# indexing API rather than the private `_getitem` helper.
# NOTE(review): index 239039 looks like an ad-hoc debugging probe — confirm whether
# this cell is still needed.
train_dataset[239039]