In [1]:
%pip install -U transformers datasets accelerate huggingface_hub
# --OR--
# !pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
import transformers
from huggingface_hub import login
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq


In [3]:
base_model = "Salesforce/codet5-large"

new_model = "CODEX-codet5-large"

tokenizer_path = "tokenizer"

dataset_name = "CodexAI/dataset"

In [4]:
login('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',add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Dataset
Load the dataset using `load_dataset()` but the dataset must be in `.parquet` format.
or else clone the dataset repo from HF, it's fast as fuck!

In [5]:
dataset = load_dataset(dataset_name)

In [6]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['focal_method', 'test_case'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['focal_method', 'test_case'],
        num_rows: 4687
    })
})


In [7]:
train=dataset['train']
test=dataset['test']

In [8]:
train = {
    "instruction": [f"Generate a unit test case for the following Java method: {item['focal_method']}" for item in train],
    "output": [f"The unit test case for the given Java method is: {item['test_case']}" for item in train]
}

In [9]:
test = {
    "instruction": [f"Generate a unit test case for the following Java method: {item['focal_method']}" for item in test],
    "output": [f"The unit test case for the given Java method is: {item['test_case']}" for item in test]
}

## Inspecting dataset instance
Here dataset instance is printed just to see how the dataset looks like, skip these steps if your are **gay**

In [10]:
print(train['instruction'][0])

Generate a unit test case for the following Java method: static void sanityCheckTypical(Builder builder) {
    int numSetTypical = builder._typicalBrokerId != DEFAULT_OPTIONAL_INT ? 1 : 0;
    if (builder._typicalBrokerCapacity != DEFAULT_OPTIONAL_DOUBLE) {
      numSetTypical++;
    }

    if (numSetTypical == 1) {
      throw new IllegalArgumentException(
          String.format("Typical broker id must be specified with its capacity (Id:%s Capacity:%s).",
                        builder._typicalBrokerId == DEFAULT_OPTIONAL_INT ? "-" : String.valueOf(builder._typicalBrokerId),
                        builder._typicalBrokerCapacity == DEFAULT_OPTIONAL_DOUBLE ? "-" : String.valueOf(builder._typicalBrokerCapacity)));
    } else if (numSetTypical == 2) {
      if (builder._numBrokers == DEFAULT_OPTIONAL_INT) {
        throw new IllegalArgumentException("Typical broker id and capacity cannot be specified without number of brokers.");
      } else if (builder._resource == null) {
        th

In [11]:
print(train['output'][0])

The unit test case for the given Java method is: @Test
  public void testSanityCheckTypical() {
    // Set a typical broker id without its capacity.
    assertThrows(IllegalArgumentException.class, () -> ProvisionRecommendation.sanityCheckTypical(
        new ProvisionRecommendation.Builder(ProvisionStatus.UNDER_PROVISIONED).numBrokers(1).typicalBrokerId(1).resource(Resource.CPU)));

    // Skip setting numBrokers.
    assertThrows(IllegalArgumentException.class, () -> ProvisionRecommendation.sanityCheckTypical(
        new ProvisionRecommendation.Builder(ProvisionStatus.UNDER_PROVISIONED).typicalBrokerId(1).typicalBrokerCapacity(1.0)
                                                                              .resource(Resource.CPU)));

    // Skip setting resource.
    assertThrows(IllegalArgumentException.class, () -> ProvisionRecommendation.sanityCheckTypical(
        new ProvisionRecommendation.Builder(ProvisionStatus.UNDER_PROVISIONED).typicalBrokerId(1).typicalBrokerCapacity(1.

In [12]:
train = Dataset.from_dict(train)
test = Dataset.from_dict(test)

In [13]:
train

Dataset({
    features: ['instruction', 'output'],
    num_rows: 100000
})

In [14]:
test

Dataset({
    features: ['instruction', 'output'],
    num_rows: 4687
})

In [15]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

Loading tokenizer...




In [16]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 760, 918, 16267, 1564, 18488, 1706, 12, 1263, 2089, 13, 288, 203, 565, 509, 818, 694, 18488, 1706, 273, 2089, 6315, 12846, 1706, 11194, 548, 480, 3331, 67, 14165, 67, 3217, 692, 404, 294, 374, 31, 203, 565, 309, 261, 9574, 6315, 12846, 1706, 11194, 7437, 480, 3331, 67, 14165, 67, 17088, 13, 288, 203, 1377, 818, 694, 18488, 1706, 9904, 31, 203, 565, 289, 203, 203, 565, 309, 261, 2107, 694, 18488, 1706, 422, 404, 13, 288, 203, 1377, 604, 394, 2754, 12, 203, 1850, 514, 18, 2139, 2932, 18488, 1706, 8625, 612, 1297, 506, 1269, 598, 2097, 7519, 261, 548, 5319, 87, 27294, 5319, 87, 14944, 16, 203, 13491, 2089, 6315, 12846, 1706, 11194, 548, 422, 3331, 67, 14165, 67, 3217, 692, 7514, 294, 514, 18, 1132, 951, 12, 9574, 6315, 12846, 1706, 11194, 548, 3631, 203, 13491, 2089, 6315, 12846, 1706, 11194, 7437, 422, 3331, 67, 14165, 67, 17088, 692, 7514, 294, 514, 18, 1132, 951, 12, 9574, 6315, 12846, 1706, 11194, 7437, 3719,

In [17]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<s>', 'Generate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'Ġstatic', 'Ġvoid', 'Ġsanity', 'Check', 'Typ', 'ical', '(', 'Builder', 'Ġbuilder', ')', 'Ġ{', 'Ċ', 'ĠĠĠ', 'Ġint', 'Ġnum', 'Set', 'Typ', 'ical', 'Ġ=', 'Ġbuilder', '._', 'typ', 'ical', 'Broker', 'Id', 'Ġ!=', 'ĠDEFAULT', '_', 'OPTIONAL', '_', 'INT', 'Ġ?', 'Ġ1', 'Ġ:', 'Ġ0', ';', 'Ċ', 'ĠĠĠ', 'Ġif', 'Ġ(', 'builder', '._', 'typ', 'ical', 'Broker', 'Capacity', 'Ġ!=', 'ĠDEFAULT', '_', 'OPTIONAL', '_', 'DOUBLE', ')', 'Ġ{', 'Ċ', 'ĠĠĠĠĠ', 'Ġnum', 'Set', 'Typ', 'ical', '++', ';', 'Ċ', 'ĠĠĠ', 'Ġ}', 'Ċ', 'Ċ', 'ĠĠĠ', 'Ġif', 'Ġ(', 'num', 'Set', 'Typ', 'ical', 'Ġ==', 'Ġ1', ')', 'Ġ{', 'Ċ', 'ĠĠĠĠĠ', 'Ġthrow', 'Ġnew', 'ĠIllegalArgumentException', '(', 'Ċ', 'ĠĠĠĠĠĠĠĠĠ', 'ĠString', '.', 'format', '("', 'Typ', 'ical', 'Ġbroker', 'Ġid', 'Ġmust', 'Ġbe', 'Ġspecified', 'Ġwith', 'Ġits', 'Ġcapacity', 'Ġ(', 'Id', ':%', 's', 'ĠCapacity', ':%', 's', ')."', ',', 'Ċ', 'ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ', 'Ġbuilder', '._', '

In [18]:
tokenizer.convert_tokens_to_string(tokens)

'<s>Generate a unit test case for the following Java method: static void sanityCheckTypical(Builder builder) {\n    int numSetTypical = builder._typicalBrokerId != DEFAULT_OPTIONAL_INT ? 1 : 0;\n    if (builder._typicalBrokerCapacity != DEFAULT_OPTIONAL_DOUBLE) {\n      numSetTypical++;\n    }\n\n    if (numSetTypical == 1) {\n      throw new IllegalArgumentException(\n          String.format("Typical broker id must be specified with its capacity (Id:%s Capacity:%s).",\n                        builder._typicalBrokerId == DEFAULT_OPTIONAL_INT ? "-" : String.valueOf(builder._typicalBrokerId),\n                        builder._typicalBrokerCapacity == DEFAULT_OPTIONAL_DOUBLE ? "-" : String.valueOf(builder._typicalBrokerCapacity)));\n    } else if (numSetTypical == 2) {\n      if (builder._numBrokers == DEFAULT_OPTIONAL_INT) {\n        throw new IllegalArgumentException("Typical broker id and capacity cannot be specified without number of brokers.");\n      } else if (builder._resource == 

In [19]:
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

Vocab size : 32100
max length : 512
model input : ['input_ids', 'attention_mask']


In [20]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
print(batch)

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,   760,   918, 16267,  1564, 18488,  1706,    12,  1263,
          2089,    13,   288,   203,   565,   509,   818,   694, 18488,  1706,
           273,  2089,  6315, 12846,  1706, 11194,   548,   480,  3331,    67,
         14165,    67,  3217,   692,   404,   294,   374,    31,   203,   565,
           309,   261,  9574,  6315, 12846,  1706, 11194,  7437,   480,  3331,
            67, 14165,    67, 17088,    13,   288,   203,  1377,   818,   694,
         18488,  1706,  9904,    31,   203,   565,   289,   203,   203,   565,
           309,   261,  2107,   694, 18488,  1706,   422,   404,    13,   288,
           203,  1377,   604,   394,  2754,    12,   203,  1850,   514,    18,
          2139,  2932, 18488,  1706,  8625,   612,  1297,   506,  1269,   598,
          2097,  7519,   261,   548,  5319,    87, 27294,  5319,    87, 14944,
            16,   203, 13491,  2089,  

# Tokenizing Dataset

In [21]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [22]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [23]:
train = train.shuffle(True).select(range(1000))  # seleting 1k dataset
print(train)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})


In [24]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [26]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [27]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [28]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [29]:
test

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

# Load Model

In [30]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [31]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: Tesla P100-PCIE-16GB


In [32]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device,torch_dtype=torch_type,attn_implementation="eager")

In [33]:
model.dtype

torch.bfloat16

In [34]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 737639424
             all model parameters: 737639424 
             percentage of trainable model parameters: 100.0 %


## Training args

In [35]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")

BF16 support is True


In [36]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=5e-4,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    fp16=False,
    optim="adamw_torch_fused",
    save_strategy="no",
    log_level="info",
    logging_strategy="steps",
    logging_steps=50,
    logging_first_step=True,
    disable_tqdm=False,
    report_to='none'
)

In [37]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
)

In [38]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

Using auto half precision backend


## Trainer args

In [39]:
for n, p in trainer.model.named_parameters():
    print(n, "-->", p.dtype, p.requires_grad)

In [40]:
trainer.train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [41]:
trainer.eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [42]:
print("Starting trainer...")

Starting trainer...


In [43]:
trainer.train()

***** Running training *****
  Num examples = 1,000
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 125
  Number of trainable parameters = 737,639,424


Epoch,Training Loss,Validation Loss
1,0.6946,0.526877



***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=125, training_loss=1.2310110321044923, metrics={'train_runtime': 640.7618, 'train_samples_per_second': 1.561, 'train_steps_per_second': 0.195, 'total_flos': 2165047296000000.0, 'train_loss': 1.2310110321044923, 'epoch': 1.0})

In [44]:
print("finished. Saving model...")
model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)

Configuration saved in CODEX-codet5-large/config.json
Configuration saved in CODEX-codet5-large/generation_config.json


finished. Saving model...


Model weights saved in CODEX-codet5-large/model.safetensors
tokenizer config file saved in tokenizer/tokenizer_config.json
Special tokens file saved in tokenizer/special_tokens_map.json


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

# Load fine-tuned Model

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [46]:
model = T5ForConditionalGeneration.from_pretrained(new_model,device_map=device)

loading configuration file CODEX-codet5-large/config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": 

In [47]:
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json


# Inference

In [48]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  inputs = {key: value.to(device) for key, value in inputs.items()}

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,
      temperature=0.7,
      top_k=100,
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [49]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
# print(prompt)

In [50]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

The unit test case for the given Java method is: @Test
    public void testDivide() {
        int a = 1;
        int b = 0;
        assertEquals(a, b);
    }


# Push to HF


In [51]:
from huggingface_hub import HfApi, create_repo

In [52]:
repo_url = f"CodexAI/{new_model}"

In [53]:
create_repo(repo_url, repo_type="model", private=True,exist_ok=True)

RepoUrl('https://huggingface.co/CodexAI/CODEX-codet5-large', endpoint='https://huggingface.co', repo_type='model', repo_id='CodexAI/CODEX-codet5-large')

In [54]:
api = HfApi()
api.upload_folder(folder_path=new_model,repo_id=repo_url)

model.safetensors:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CodexAI/CODEX-codet5-large/commit/fbfaf76bd7ad5333484444c864813890c137c1ba', commit_message='Upload folder using huggingface_hub', commit_description='', oid='fbfaf76bd7ad5333484444c864813890c137c1ba', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CodexAI/CODEX-codet5-large', endpoint='https://huggingface.co', repo_type='model', repo_id='CodexAI/CODEX-codet5-large'), pr_revision=None, pr_num=None)

In [55]:
print(f"Model and Tokenizer saved at {repo_url}")

Model and Tokenizer saved at CodexAI/CODEX-codet5-large


In [56]:
print('END')

END
