In [1]:
# %pip install -U transformers datasets huggingface_hub tqdm
# --OR--
# %pip install -r requirements.txt

In [3]:
import os
import json
import torch
import transformers
from datasets import load_dataset
from huggingface_hub import login
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [6]:
base_model = "Salesforce/codet5-base"
new_model = "CODEX-codet5-base"
tokenizer_path = "tokenizer"
dataset_name = "CodexAI/Deepseek-Coder"

In [5]:
login('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',add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\walim\.cache\huggingface\token
Login successful


# Dataset
Load the dataset using `load_dataset()` but the dataset must be in `.parquet` format.
or else clone the dataset repo from HF, it's fast as fuck!

In [7]:
dataset = load_dataset(dataset_name)

Downloading readme:   0%|          | 0.00/421 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 75000
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 3534
    })
})


In [9]:
train=dataset['train']
print(train)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 75000
})


In [10]:
test=dataset['test']
print(test)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 3534
})


## Inspecting dataset instance
Here dataset instance is printed just to see how the dataset looks like, skip these steps if your are **gay**

In [11]:
print(train['instruction'][0])

Generate a unit test case for the following Java method: EhCoversFF extends GenericFilterFunction { @Override protected boolean relate(GeometryWrapper sourceGeometry, GeometryWrapper targetGeometry) throws FactoryException, MismatchedDimensionException, TransformException { return sourceGeometry.relate(targetGeometry, EgenhoferIntersectionPattern.COVERS); }  @Override boolean isDisjoint(); @Override boolean isDisconnected();  }


In [12]:
print(train['output'][0])

The unit test case for the given Java method is: @Test public void testRelate_polygon_polygon_false() throws FactoryException, MismatchedDimensionException, TransformException { GeometryWrapper subjectGeometryWrapper = GeometryWrapper.extract(ResourceFactory.createTypedLiteral("<http: GeometryWrapper objectGeometryWrapper = GeometryWrapper.extract(ResourceFactory.createTypedLiteral("<http: EhCoversFF instance = new EhCoversFF(); Boolean expResult = false; Boolean result = instance.relate(subjectGeometryWrapper, objectGeometryWrapper); assertEquals(expResult, result); }


In [13]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

Loading tokenizer...




In [14]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 512, 76, 39, 23042, 2246, 3231, 7928, 1586, 2083, 288, 632, 6618, 4750, 1250, 1279, 340, 12, 9823, 3611, 1084, 9823, 16, 8344, 3611, 1018, 9823, 13, 1216, 7822, 503, 16, 26454, 8611, 503, 16, 11514, 503, 288, 327, 1084, 9823, 18, 2878, 340, 12, 3299, 9823, 16, 512, 4507, 15008, 586, 23634, 3234, 18, 3865, 21510, 1769, 289, 225, 632, 6618, 1250, 353, 1669, 16452, 5621, 632, 6618, 1250, 353, 26303, 5621, 225, 289, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<s>', 'Generate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'ĠE', 'h', 'C', 'overs', 'FF', 'Ġextends', 'ĠGeneric', 'Filter', 'Function', 'Ġ{', 'Ġ@', 'Override', 'Ġprotected', 'Ġboolean', 'Ġrel', 'ate', '(', 'Geometry', 'Wrapper', 'Ġsource', 'Geometry', ',', 'ĠGeometry', 'Wrapper', 'Ġtarget', 'Geometry', ')', 'Ġthrows', 'ĠFactory', 'Exception', ',', 'ĠMismatched', 'Dimension', 'Exception', ',', 'ĠTransform', 'Exception', 'Ġ{', 'Ġreturn', 'Ġsource', 'Geometry', '.', 'rel', 'ate', '(', 'target', 'Geometry', ',', 'ĠE', 'gen', 'ho', 'fer', 'Intersection', 'Pattern', '.', 'CO', 'VERS', ');', 'Ġ}', 'Ġ', 'Ġ@', 'Override', 'Ġboolean', 'Ġis', 'Dis', 'joint', '();', 'Ġ@', 'Override', 'Ġboolean', 'Ġis', 'Disconnected', '();', 'Ġ', 'Ġ}', '</s>']


In [16]:
tokenizer.convert_tokens_to_string(tokens)

'<s>Generate a unit test case for the following Java method: EhCoversFF extends GenericFilterFunction { @Override protected boolean relate(GeometryWrapper sourceGeometry, GeometryWrapper targetGeometry) throws FactoryException, MismatchedDimensionException, TransformException { return sourceGeometry.relate(targetGeometry, EgenhoferIntersectionPattern.COVERS); }  @Override boolean isDisjoint(); @Override boolean isDisconnected();  }</s>'

In [17]:
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

Vocab size : 32100
max length : 512
model input : ['input_ids', 'attention_mask']


In [18]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
print(batch)

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,   512,    76,    39, 23042,  2246,  3231,  7928,  1586,
          2083,   288,   632,  6618,  4750,  1250,  1279,   340,    12,  9823,
          3611,  1084,  9823,    16,  8344,  3611,  1018,  9823,    13,  1216,
          7822,   503,    16, 26454,  8611,   503,    16, 11514,   503,   288,
           327,  1084,  9823,    18,  2878,   340,    12,  3299,  9823,    16,
           512,  4507, 15008,   586, 23634,  3234,    18,  3865, 21510,  1769,
           289,   225,   632,  6618,  1250,   353,  1669, 16452,  5621,   632,
          6618,  1250,   353, 26303,  5621,   225,   289,     2,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

# Tokenizing Dataset

In [19]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [20]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [None]:
train = train.select(range(10000))  # seleting 10k dataset, you dont have to
print(train)

In [21]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map:   0%|          | 0/75000 [00:00<?, ? examples/s]

In [22]:
print(train)

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 75000
})


In [23]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [24]:
print(test)

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3534
})


In [25]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [26]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 75000
})

# Fine-tuning

In [29]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [30]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: NVIDIA GeForce RTX 3060 Ti


In [31]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

In [32]:
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [33]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 222882048
             all model parameters: 222882048 
             percentage of trainable model parameters: 100.0 %


## Training args

In [34]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

BF16 support is True


In [35]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="steps",
    learning_rate=5e-4, # default, change to (1e-3) later
    gradient_accumulation_steps=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    # auto_find_batch_size = True, # for CUDA out of memory 
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    optim="adamw_torch",
    save_strategy="no",
    log_level="info",
    logging_first_step=True,
    report_to='none' ## can be wandb, but we dont need right now!
)

In [36]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
)

In [37]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

Using auto half precision backend


In [38]:
print("Starting trainer...")

Starting trainer...


In [39]:
!nvidia-smi

Sun Sep  8 00:21:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   43C    P8             20W /  240W |    1903MiB /   8192MiB |     11%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [40]:
trainer.train()

***** Running training *****
  Num examples = 10,000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 1,250
  Number of trainable parameters = 222,882,048


  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 4.256, 'grad_norm': 60.69383239746094, 'learning_rate': 0.0004996, 'epoch': 0.0}



***** Running Evaluation *****
  Num examples = 3534
  Batch size = 4


{'loss': 0.5664, 'grad_norm': 0.34722140431404114, 'learning_rate': 0.0003, 'epoch': 0.4}


  0%|          | 0/884 [00:00<?, ?it/s]

{'eval_loss': 0.4343658685684204, 'eval_runtime': 360.5775, 'eval_samples_per_second': 9.801, 'eval_steps_per_second': 2.452, 'epoch': 0.4}



***** Running Evaluation *****
  Num examples = 3534
  Batch size = 4


{'loss': 0.444, 'grad_norm': 1.478962779045105, 'learning_rate': 0.0001, 'epoch': 0.8}


  0%|          | 0/884 [00:00<?, ?it/s]

{'eval_loss': 0.3736537992954254, 'eval_runtime': 360.4655, 'eval_samples_per_second': 9.804, 'eval_steps_per_second': 2.452, 'epoch': 0.8}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 16639.2862, 'train_samples_per_second': 0.601, 'train_steps_per_second': 0.075, 'train_loss': 0.48824106483459473, 'epoch': 1.0}


TrainOutput(global_step=1250, training_loss=0.48824106483459473, metrics={'train_runtime': 16639.2862, 'train_samples_per_second': 0.601, 'train_steps_per_second': 0.075, 'total_flos': 6089578905600000.0, 'train_loss': 0.48824106483459473, 'epoch': 1.0})

In [41]:
print("finished. Saving model...")
model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)

Configuration saved in CODEX-codet5-base\config.json
Configuration saved in CODEX-codet5-base\generation_config.json


finished. Saving model...


Model weights saved in CODEX-codet5-base\model.safetensors
tokenizer config file saved in tokenizer\tokenizer_config.json
Special tokens file saved in tokenizer\special_tokens_map.json


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.json',
 'tokenizer\\merges.txt',
 'tokenizer\\added_tokens.json')

# Loading fine-tuned Model and Tokenizer

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [47]:
model = T5ForConditionalGeneration.from_pretrained(new_model,device_map=device)

loading configuration file CODEX-codet5-base\config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max

In [48]:
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json


# Inference

In [49]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  inputs = {key: value.to(device) for key, value in inputs.items()}

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,  # Enable sampling for diverse output
      temperature=0.2,  # Control randomness
      top_k=100,  # Limit the sampling pool to top K tokens
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  # Decode the generated output
  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [52]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
# print(prompt)

In [51]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

The unit test case for the given Java method is: @Test(expected = ArithmeticException.class) public void testDivideByZero() { SimpleCalculator calculator = new SimpleCalculator(); calculator.divide(1, 2); }


# Push to HF


In [53]:
from huggingface_hub import HfApi, create_repo

In [54]:
repo_name = new_model
organization_name = "CodexAI"
repo_url = f"{organization_name}/{repo_name}"

In [55]:
create_repo(repo_url, repo_type="model", private=True,exist_ok=True)

RepoUrl('https://huggingface.co/CodexAI/CODEX-codet5-base', endpoint='https://huggingface.co', repo_type='model', repo_id='CodexAI/CODEX-codet5-base')

use the `push_to_hub()` , but its shit

In [1]:
# model.push_to_hub(repo_url, private=True)
# tokenizer.push_to_hub(repo_url, private=True)

If the above method throws `Bad request for commit endpoint`,`Runtime disscounted` or something other error then try this one:

In [57]:
api = HfApi()
api.upload_folder(folder_path=new_model,repo_id=repo_url)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CodexAI/CODEX-codet5-base/commit/11e94fa93cf964c7bd563336288b8bc0c5478cd1', commit_message='Upload folder using huggingface_hub', commit_description='', oid='11e94fa93cf964c7bd563336288b8bc0c5478cd1', pr_url=None, pr_revision=None, pr_num=None)

In [58]:
print(f"Model and Tokenizer saved at {repo_url}")

Model and Tokenizer saved at CodexAI/CODEX-codet5-base
