In [1]:
# %pip install -r requirements.txt

In [2]:
import os
import torch
import transformers
from huggingface_hub import login
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
base_model = "Salesforce/codet5-base"

new_model = "CODEX-codet5-base"

tokenizer_path = "tokenizer"

dataset_name = "CodexAI/dataset"

In [4]:
login('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',add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\walim\.cache\huggingface\token
Login successful


# Dataset
Load the dataset using `load_dataset()` but the dataset must be in `.parquet` format.
or else clone the dataset repo from HF, it's fast as fuck!

In [5]:
dataset = load_dataset(dataset_name)

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['focal_method', 'test_case'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['focal_method', 'test_case'],
        num_rows: 4687
    })
})


In [7]:
train=dataset['train']
test=dataset['test']

In [8]:
train = {
    "instruction": [f"Generate a unit test case for the following Java method: {item['focal_method']}" for item in train],
    "output": [f"The unit test case for the given Java method is: {item['test_case']}" for item in train]
}

In [9]:
test = {
    "instruction": [f"Generate a unit test case for the following Java method: {item['focal_method']}" for item in test],
    "output": [f"The unit test case for the given Java method is: {item['test_case']}" for item in test]
}

## Inspecting dataset instance
Here dataset instance is printed just to see how the dataset looks like, skip these steps if your are **gay**

In [10]:
print(train['instruction'][10])

Generate a unit test case for the following Java method: static void sanityCheckExcludedRackIds(Builder builder) {
    if (builder._excludedRackIds != null && builder._numBrokers == DEFAULT_OPTIONAL_INT) {
      throw new IllegalArgumentException("Excluded rack ids can be specified only with the number of brokers.");
    }
  }


In [11]:
print(train['output'][10])

The unit test case for the given Java method is: @Test
  public void testSanityCheckExcludedRackIds() {
    // Set excluded rack ids without numBrokers
    assertThrows(IllegalArgumentException.class, () -> ProvisionRecommendation.sanityCheckExcludedRackIds(
        new ProvisionRecommendation.Builder(ProvisionStatus.UNDER_PROVISIONED).numRacks(1).excludedRackIds(Collections.singleton("1"))));
  }


In [12]:
train = Dataset.from_dict(train)
test = Dataset.from_dict(test)

In [13]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

Loading tokenizer...




In [14]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 760, 918, 16267, 1564, 18488, 1706, 12, 1263, 2089, 13, 288, 203, 565, 509, 818, 694, 18488, 1706, 273, 2089, 6315, 12846, 1706, 11194, 548, 480, 3331, 67, 14165, 67, 3217, 692, 404, 294, 374, 31, 203, 565, 309, 261, 9574, 6315, 12846, 1706, 11194, 7437, 480, 3331, 67, 14165, 67, 17088, 13, 288, 203, 1377, 818, 694, 18488, 1706, 9904, 31, 203, 565, 289, 203, 203, 565, 309, 261, 2107, 694, 18488, 1706, 422, 404, 13, 288, 203, 1377, 604, 394, 2754, 12, 203, 1850, 514, 18, 2139, 2932, 18488, 1706, 8625, 612, 1297, 506, 1269, 598, 2097, 7519, 261, 548, 5319, 87, 27294, 5319, 87, 14944, 16, 203, 13491, 2089, 6315, 12846, 1706, 11194, 548, 422, 3331, 67, 14165, 67, 3217, 692, 7514, 294, 514, 18, 1132, 951, 12, 9574, 6315, 12846, 1706, 11194, 548, 3631, 203, 13491, 2089, 6315, 12846, 1706, 11194, 7437, 422, 3331, 67, 14165, 67, 17088, 692, 7514, 294, 514, 18, 1132, 951, 12, 9574, 6315, 12846, 1706, 11194, 7437, 3719,

In [15]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<s>', 'Generate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'Ġstatic', 'Ġvoid', 'Ġsanity', 'Check', 'Typ', 'ical', '(', 'Builder', 'Ġbuilder', ')', 'Ġ{', 'Ċ', 'ĠĠĠ', 'Ġint', 'Ġnum', 'Set', 'Typ', 'ical', 'Ġ=', 'Ġbuilder', '._', 'typ', 'ical', 'Broker', 'Id', 'Ġ!=', 'ĠDEFAULT', '_', 'OPTIONAL', '_', 'INT', 'Ġ?', 'Ġ1', 'Ġ:', 'Ġ0', ';', 'Ċ', 'ĠĠĠ', 'Ġif', 'Ġ(', 'builder', '._', 'typ', 'ical', 'Broker', 'Capacity', 'Ġ!=', 'ĠDEFAULT', '_', 'OPTIONAL', '_', 'DOUBLE', ')', 'Ġ{', 'Ċ', 'ĠĠĠĠĠ', 'Ġnum', 'Set', 'Typ', 'ical', '++', ';', 'Ċ', 'ĠĠĠ', 'Ġ}', 'Ċ', 'Ċ', 'ĠĠĠ', 'Ġif', 'Ġ(', 'num', 'Set', 'Typ', 'ical', 'Ġ==', 'Ġ1', ')', 'Ġ{', 'Ċ', 'ĠĠĠĠĠ', 'Ġthrow', 'Ġnew', 'ĠIllegalArgumentException', '(', 'Ċ', 'ĠĠĠĠĠĠĠĠĠ', 'ĠString', '.', 'format', '("', 'Typ', 'ical', 'Ġbroker', 'Ġid', 'Ġmust', 'Ġbe', 'Ġspecified', 'Ġwith', 'Ġits', 'Ġcapacity', 'Ġ(', 'Id', ':%', 's', 'ĠCapacity', ':%', 's', ')."', ',', 'Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġbuilder', '._', '

In [16]:
tokenizer.convert_tokens_to_string(tokens)

'<s>Generate a unit test case for the following Java method: static void sanityCheckTypical(Builder builder) {\n    int numSetTypical = builder._typicalBrokerId != DEFAULT_OPTIONAL_INT ? 1 : 0;\n    if (builder._typicalBrokerCapacity != DEFAULT_OPTIONAL_DOUBLE) {\n      numSetTypical++;\n    }\n\n    if (numSetTypical == 1) {\n      throw new IllegalArgumentException(\n          String.format("Typical broker id must be specified with its capacity (Id:%s Capacity:%s).",\n                        builder._typicalBrokerId == DEFAULT_OPTIONAL_INT ? "-" : String.valueOf(builder._typicalBrokerId),\n                        builder._typicalBrokerCapacity == DEFAULT_OPTIONAL_DOUBLE ? "-" : String.valueOf(builder._typicalBrokerCapacity)));\n    } else if (numSetTypical == 2) {\n      if (builder._numBrokers == DEFAULT_OPTIONAL_INT) {\n        throw new IllegalArgumentException("Typical broker id and capacity cannot be specified without number of brokers.");\n      } else if (builder._resource == 

In [17]:
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

Vocab size : 32100
max length : 512
model input : ['input_ids', 'attention_mask']


In [18]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
print(batch)

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,   760,   918, 16267,  1564, 18488,  1706,    12,  1263,
          2089,    13,   288,   203,   565,   509,   818,   694, 18488,  1706,
           273,  2089,  6315, 12846,  1706, 11194,   548,   480,  3331,    67,
         14165,    67,  3217,   692,   404,   294,   374,    31,   203,   565,
           309,   261,  9574,  6315, 12846,  1706, 11194,  7437,   480,  3331,
            67, 14165,    67, 17088,    13,   288,   203,  1377,   818,   694,
         18488,  1706,  9904,    31,   203,   565,   289,   203,   203,   565,
           309,   261,  2107,   694, 18488,  1706,   422,   404,    13,   288,
           203,  1377,   604,   394,  2754,    12,   203,  1850,   514,    18,
          2139,  2932, 18488,  1706,  8625,   612,  1297,   506,  1269,   598,
          2097,  7519,   261,   548,  5319,    87, 27294,  5319,    87, 14944,
            16,   203, 13491,  2089,  

# Tokenizing Dataset

In [19]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [20]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [21]:
train = train.shuffle(True).select(range(5000))  # seleting 5k dataset
print(train)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 5000
})


In [22]:
test = test.shuffle(True).select(range(500))  # seleting 500 dataset
print(test)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 500
})


In [23]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map: 100%|██████████| 5000/5000 [00:15<00:00, 317.08 examples/s]


In [24]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map: 100%|██████████| 500/500 [00:01<00:00, 297.65 examples/s]


In [25]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [26]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [27]:
test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

# Load Model

In [28]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [47]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device="cuda"
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: NVIDIA GeForce RTX 3060 Ti


In [30]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device,torch_dtype=torch_type,attn_implementation="eager")

In [31]:
model.dtype

torch.bfloat16

In [32]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 222882048
             all model parameters: 222882048 
             percentage of trainable model parameters: 100.0 %


## Training args

In [33]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

BF16 support is True


In [34]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./runs",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=4,
    bf16=False,
    fp16=False,
    tf32=True,
    optim="adamw_torch_fused",
    save_strategy="no",
    log_level="info",
    logging_strategy="steps",
    logging_steps=1000,
    logging_first_step=True,
    report_to='none'
)

In [35]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
)

In [36]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

## Trainer args

In [37]:
for n, p in trainer.model.named_parameters():
    print(n, "-->", p.dtype, p.requires_grad)

shared.weight --> torch.bfloat16 True
encoder.block.0.layer.0.SelfAttention.q.weight --> torch.bfloat16 True
encoder.block.0.layer.0.SelfAttention.k.weight --> torch.bfloat16 True
encoder.block.0.layer.0.SelfAttention.v.weight --> torch.bfloat16 True
encoder.block.0.layer.0.SelfAttention.o.weight --> torch.bfloat16 True
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight --> torch.bfloat16 True
encoder.block.0.layer.0.layer_norm.weight --> torch.bfloat16 True
encoder.block.0.layer.1.DenseReluDense.wi.weight --> torch.bfloat16 True
encoder.block.0.layer.1.DenseReluDense.wo.weight --> torch.bfloat16 True
encoder.block.0.layer.1.layer_norm.weight --> torch.bfloat16 True
encoder.block.1.layer.0.SelfAttention.q.weight --> torch.bfloat16 True
encoder.block.1.layer.0.SelfAttention.k.weight --> torch.bfloat16 True
encoder.block.1.layer.0.SelfAttention.v.weight --> torch.bfloat16 True
encoder.block.1.layer.0.SelfAttention.o.weight --> torch.bfloat16 True
encoder.block.1.layer.0

In [38]:
trainer.train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [39]:
trainer.eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})

In [40]:
print("Starting trainer...")

Starting trainer...


In [41]:
trainer.train()

***** Running training *****
  Num examples = 5,000
  Num Epochs = 4
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5,000
  Number of trainable parameters = 222,882,048
  0%|          | 1/5000 [00:00<1:01:09,  1.36it/s]

{'loss': 4.0938, 'grad_norm': 56.5, 'learning_rate': 9.998e-05, 'epoch': 0.0}


 20%|██        | 1000/5000 [05:34<22:04,  3.02it/s]

{'loss': 0.7739, 'grad_norm': 2.234375, 'learning_rate': 8e-05, 'epoch': 0.8}


 25%|██▌       | 1250/5000 [06:59<24:45,  2.52it/s]
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
                                                   
 25%|██▌       | 1250/5000 [07:14<24:45,  2.52it/s]

{'eval_loss': 0.5052187442779541, 'eval_runtime': 14.2882, 'eval_samples_per_second': 34.994, 'eval_steps_per_second': 8.748, 'epoch': 1.0}


 40%|████      | 2000/5000 [11:26<16:42,  2.99it/s]  

{'loss': 0.6451, 'grad_norm': 1.0859375, 'learning_rate': 6e-05, 'epoch': 1.6}


 50%|█████     | 2500/5000 [14:13<13:58,  2.98it/s]
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
                                                   
 50%|█████     | 2500/5000 [14:27<13:58,  2.98it/s]

{'eval_loss': 0.4875820279121399, 'eval_runtime': 14.138, 'eval_samples_per_second': 35.366, 'eval_steps_per_second': 8.841, 'epoch': 2.0}


 60%|██████    | 3000/5000 [17:14<11:06,  3.00it/s]  

{'loss': 0.5978, 'grad_norm': 1.4140625, 'learning_rate': 4e-05, 'epoch': 2.4}


 75%|███████▌  | 3750/5000 [21:24<06:54,  3.02it/s]
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4
                                                   
 75%|███████▌  | 3750/5000 [21:38<06:54,  3.02it/s]

{'eval_loss': 0.4833085834980011, 'eval_runtime': 14.0641, 'eval_samples_per_second': 35.552, 'eval_steps_per_second': 8.888, 'epoch': 3.0}


 80%|████████  | 4000/5000 [23:02<05:34,  2.99it/s]  

{'loss': 0.6108, 'grad_norm': 1.1171875, 'learning_rate': 2e-05, 'epoch': 3.2}


100%|██████████| 5000/5000 [28:36<00:00,  3.02it/s]
***** Running Evaluation *****
  Num examples = 500
  Batch size = 4


{'loss': 0.5865, 'grad_norm': 0.96875, 'learning_rate': 0.0, 'epoch': 4.0}


                                                   
100%|██████████| 5000/5000 [28:50<00:00,  3.02it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 5000/5000 [28:50<00:00,  2.89it/s]

{'eval_loss': 0.48262110352516174, 'eval_runtime': 14.0434, 'eval_samples_per_second': 35.604, 'eval_steps_per_second': 8.901, 'epoch': 4.0}
{'train_runtime': 1730.4614, 'train_samples_per_second': 11.558, 'train_steps_per_second': 2.889, 'train_loss': 0.64347197265625, 'epoch': 4.0}





TrainOutput(global_step=5000, training_loss=0.64347197265625, metrics={'train_runtime': 1730.4614, 'train_samples_per_second': 11.558, 'train_steps_per_second': 2.889, 'total_flos': 1.21791578112e+16, 'train_loss': 0.64347197265625, 'epoch': 4.0})

In [42]:
print("finished. Saving model...")
model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)

Configuration saved in CODEX-codet5-base\config.json
Configuration saved in CODEX-codet5-base\generation_config.json


finished. Saving model...


Model weights saved in CODEX-codet5-base\model.safetensors
tokenizer config file saved in tokenizer\tokenizer_config.json
Special tokens file saved in tokenizer\special_tokens_map.json


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.json',
 'tokenizer\\merges.txt',
 'tokenizer\\added_tokens.json')

# Inference from fine-tuned model

In [43]:
model = T5ForConditionalGeneration.from_pretrained(new_model,device_map=device)
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

loading configuration file CODEX-codet5-base\config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max

In [44]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  inputs = {key: value.to(device) for key, value in inputs.items()}

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,
      temperature=0.7,
      top_k=100,
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [45]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction

In [48]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

The unit test case for the given Java method is: @Test
  public void testDivideByZero() {
    // Given:
    SimpleCalculator calculator = new SimpleCalculator();
    calculator.divide(1, 2);
    }


In [49]:
print('END')

END
