In [1]:
# !pip install -U transformers datasets sentencepiece peft accelerate evaluate
# --OR--
# !pip install -r requirements.txt

In [1]:
import os
import json
import torch
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from evaluate import load
from datasets import Dataset
from huggingface_hub import login, Repository
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel, 
    PeftConfig
)

# Define Variables

In [3]:
base_model = "Salesforce/codet5-base" # actual model

new_model = "CODEX-codet5-base" # name of the new fine-tuned model

tokenizer_path = "tokenizer"

dataset_path = "dataset"  # dataset dir path

dataset = "CodexAI/Deepseek-Coder"  # dataset name at huggingface

repo_url = f'https://huggingface.co/datasets/{dataset}'

In [5]:
login('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',add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\walim\.cache\huggingface\token
Login successful


# Get Dataset
Clone the dataset from HF, it's fast as fuck!

In [6]:
repo = Repository(local_dir=dataset_path,clone_from=repo_url)
repo.git_pull()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
d:\FYP\CODEX-FINETUNNING\codeT5\dataset is already a clone of https://huggingface.co/datasets/CodexAI/Deepseek-Coder. Make sure you pull the latest changes with `repo.git_pull()`.


# Playing with Dataset

In [7]:
def load_json_data(dir_name):

  data=[]
  for root_folder in os.listdir(dir_name):
    if root_folder!=".git" and root_folder!=".gitattributes":
      for files in os.listdir(os.path.join(dir_name,root_folder)):
        if files.endswith(".json"):
          with open(os.path.join(dir_name,root_folder,files),"r")as f:
            json_file=json.load(f)
            data.append(json_file)
  return data

In [8]:
print(f"Loading dataset from ./{dataset_path}/")
json_data=load_json_data(dataset_path)

Loading dataset from ./dataset/


In [9]:
print(f"Length of loaded dataset is: {len(json_data)}")

Length of loaded dataset is: 40318


In [10]:
tmp=json_data  # in case if this is required again

## Dataset Limit = 1000
Dataset limit is set to 1000 and this bcz of testing this script. For actual training change this value
`json_data[:1000]` to something greater or simply comment the cell below to use the complete dataset

In [12]:
json_data=json_data[:1000]
print(f"Length of dataset is: {len(json_data)}")

Length of dataset is: 1000


In [13]:
print("Loading dataset...")
df=Dataset.from_list(json_data)

Loading dataset...


## Inspecting dataset instance
Here dataset instance are printed just to see the dataset, skip these steps bcz you like to skip steps

In [14]:
print(df)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})


In [15]:
df.features

{'instruction': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None)}

In [16]:
print(df['instruction'][0])

Generate a unit test case for the following Java method: IntervalMap implements TimeMap<Interval, T> { @Override public boolean put(Interval interval, T value) { if (value == null) { throw new NullPointerException(); } Object values = getValuesArray(); int valuesLength = Array.getLength(values); final int index = putInner(interval.getLow(), interval.getHigh()); if (index < 0) { int insertIndex = -index - 1; if (size - 1 < valuesLength) { if (insertIndex < size - 1) { System.arraycopy(values, insertIndex, values, insertIndex + 1, size - insertIndex - 1); } Array.set(values, insertIndex, value); } else { Object newArray = Array.newInstance(values.getClass().getComponentType(), valuesLength + 1); System.arraycopy(values, 0, newArray, 0, insertIndex); System.arraycopy(values, insertIndex, newArray, insertIndex + 1, valuesLength - insertIndex); Array.set(newArray, insertIndex, value); setValuesArray(newArray); } return true; } else { Array.set(values, index, value); } return false; } Interv

In [17]:
print(df['output'][0])

The unit test case for the given Java method is: @Test public void testPutOne() { for (IntervalMap set : getAllInstances()) { Object[] defaultValues = getTestValues(set); Assert.assertTrue(set.put(new Interval(1.0, 2.0), defaultValues[0])); testValues(set, new Interval[] { new Interval(1.0, 2.0) }, new Object[] { defaultValues[0] }); } }


## train test split
If you want to evaluate the model on other dataset then load that dataset and skip these steps

In [18]:
print("Spliting dataset...")
df=df.train_test_split(test_size=0.2)

Spliting dataset...


In [19]:
print(df)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 800
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 200
    })
})


In [20]:
train=df['train']
test=df['test']

In [21]:
print(train)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 800
})


In [22]:
print(test)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 200
})


In [23]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

Loading tokenizer...




In [24]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 3424, 774, 5852, 4597, 3231, 4115, 559, 4597, 32, 559, 4597, 12880, 16, 692, 9778, 288, 1071, 411, 56, 34, 399, 852, 12, 6385, 467, 3185, 907, 628, 13, 288, 327, 333, 18, 1458, 12, 2080, 16, 446, 16, 333, 18, 588, 1868, 3233, 559, 12, 2080, 18, 588, 797, 1435, 10019, 289, 4750, 225, 3424, 774, 5852, 4597, 5621, 632, 29282, 4312, 2932, 5847, 7923, 1412, 4597, 32, 42, 16, 399, 34, 29800, 12, 6385, 1659, 32, 42, 34, 628, 797, 16, 727, 1659, 32, 56, 34, 16065, 1769, 399, 852, 12, 6385, 467, 3185, 907, 628, 1769, 399, 852, 12, 6385, 467, 3185, 907, 628, 16, 727, 399, 358, 1769, 399, 852, 12, 6385, 467, 3185, 907, 628, 16, 727, 399, 358, 16, 727, 1659, 32, 56, 34, 16065, 1769, 399, 852, 12, 6385, 467, 3185, 907, 628, 16, 399, 358, 16, 727, 1412, 16065, 1769, 760, 727, 3424, 774, 5852, 4597, 6937, 31, 289, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [25]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)



In [26]:
tokenizer.convert_tokens_to_string(tokens)



In [27]:
print(f"Vocab size : {tokenizer.vocab_size}")

Vocab size : 32100


In [28]:
print(f"max length : {tokenizer.model_max_length}")

max length : 512


In [29]:
print(f"model input : {tokenizer.model_input_names}")

model input : ['input_ids', 'attention_mask']


In [30]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

In [31]:
batch

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,  3424,   774,  5852,  4597,  3231,  4115,   559,  4597,
            32,   559,  4597, 12880,    16,   692,  9778,   288,  1071,   411,
            56,    34,   399,   852,    12,  6385,   467,  3185,   907,   628,
            13,   288,   327,   333,    18,  1458,    12,  2080,    16,   446,
            16,   333,    18,   588,  1868,  3233,   559,    12,  2080,    18,
           588,   797,  1435, 10019,   289,  4750,   225,  3424,   774,  5852,
          4597,  5621,   632, 29282,  4312,  2932,  5847,  7923,  1412,  4597,
            32,    42,    16,   399,    34, 29800,    12,  6385,  1659,    32,
            42,    34,   628,   797,    16,   727,  1659,    32,    56,    34,
         16065,  1769,   399,   852,    12,  6385,   467,  3185,   907,   628,
          1769,   399,   852,    12,  6385,   467,  3185,   907,   628,    16,
           727,   399,   358,  1769,  

# Tokenizing Dataset

In [32]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [33]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [34]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map: 100%|██████████| 800/800 [00:02<00:00, 382.34 examples/s]


In [35]:
print(train)

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})


In [36]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map: 100%|██████████| 200/200 [00:00<00:00, 307.67 examples/s]


In [37]:
print(test)

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})


In [38]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [39]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

# Fine-tuning

In [40]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: NVIDIA GeForce RTX 3060 Ti


In [41]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [42]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

In [43]:
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [44]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 222882048
             all model parameters: 222882048 
             percentage of trainable model parameters: 100.0 %


## LoRA Configuration

LoRA only fine-tunes specific layers, while full fine-tuning adapts all layers but saves CUDA memory

In [90]:
lora_config = LoraConfig(
    r=32,  # rank 16,32,64
    lora_alpha=16, # LoRA Scaling factor keep 16 or 32
    target_modules=['q', 'v'], # The modules(for example, attention blocks) to apply the LoRA update matrices.
    lora_dropout = 0.1, # 0.05
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM ## flan-t5
)

In [91]:
peft_model = get_peft_model(model, lora_config)
print(peft_model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32100, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32100, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=32, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

In [92]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
             all model parameters: 226420992 
             percentage of trainable model parameters: 1.5629928871612753 %


## Training args

In [45]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

BF16 support is True


In [46]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="steps",
    learning_rate=5e-5, # default, change to (1e-3) later
    gradient_accumulation_steps=1,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
    auto_find_batch_size = True, # for CUDA out of memory 
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    optim="adamw_torch",
    save_strategy="no",
    log_level="info",
    logging_first_step=True,
    report_to='none' ## can be wandb, but we dont need right now!
)

In [47]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
    # model=peft_model 
)

In [48]:
trainer=Seq2SeqTrainer(
    model=model, # using the base model for now
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

Using auto half precision backend


In [49]:
print("Starting trainer...")

Starting trainer...


In [50]:
!nvidia-smi

Sat Sep  7 12:09:01 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   43C    P8             19W /  240W |    1864MiB /   8192MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [51]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 222,882,048
  1%|          | 1/100 [00:35<59:13, 35.90s/it]

{'loss': 4.7252, 'grad_norm': 51.950950622558594, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.01}


***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 222,882,048
  1%|          | 1/100 [00:55<1:32:09, 55.85s/it]

[A                                            

{'loss': 1.7582, 'grad_norm': 12.24333667755127, 'learning_rate': 4.975e-05, 'epoch': 0.01}




Training completed. Do not forget to share your model on huggingface.co/models =)



100%|██████████| 200/200 [22:05<00:00,  6.63s/it]

{'train_runtime': 1325.3732, 'train_samples_per_second': 0.604, 'train_steps_per_second': 0.151, 'train_loss': 0.47638228893280027, 'epoch': 1.0}





TrainOutput(global_step=200, training_loss=0.47638228893280027, metrics={'train_runtime': 1325.3732, 'train_samples_per_second': 0.604, 'train_steps_per_second': 0.151, 'total_flos': 487166312448000.0, 'train_loss': 0.47638228893280027, 'epoch': 1.0})

In [52]:
!nvidia-smi

Sat Sep  7 12:38:50 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   42C    P8             19W /  240W |    7866MiB /   8192MiB |      3%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [53]:
print("finished. Saving model...")
model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)

Configuration saved in CODEX-codet5-base\config.json
Configuration saved in CODEX-codet5-base\generation_config.json


finished. Saving model...


Model weights saved in CODEX-codet5-base\model.safetensors
tokenizer config file saved in tokenizer\tokenizer_config.json
Special tokens file saved in tokenizer\special_tokens_map.json


('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.json',
 'tokenizer\\merges.txt',
 'tokenizer\\added_tokens.json')

In [54]:
# torch.cuda.empty_cache()  # release CUDA memory

# Load Model

## Load the actual model

If using the original model (without quant)

In [55]:
# Load model
model = T5ForConditionalGeneration.from_pretrained(new_model)

loading configuration file CODEX-codet5-base\config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max

loading weights file CODEX-codet5-base\model.safetensors
Generate config GenerationConfig {
  "bos_token_id": 1,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at CODEX-codet5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
loading configuration file CODEX-codet5-base\generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "decoder_start_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 0
}



## Load the PEFT model

if using PEFT config then run these cells

In [107]:
# config = PeftConfig.from_pretrained("model")
# model = T5ForConditionalGeneration.from_pretrained(base_model)
# model = PeftModel.from_pretrained(model,"model",is_trainable=True)

# # check if it's working
# model.print_trainable_parameters()

In [136]:
model.eval()

# in case of PEFT, this must have (base_model)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [56]:
# ensuring the model is on either "cuda" or on "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

## Load the tokenizer

In [57]:
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json


# Inference

In [58]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  inputs = {key: value.to(device) for key, value in inputs.items()}

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,  # Enable sampling for diverse output
      temperature=0.2,  # Control randomness
      top_k=100,  # Limit the sampling pool to top K tokens
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  # Decode the generated output
  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [60]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
print(prompt)

Generate a unit test case for the following Java method: 
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}



In [61]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

The unit test case for the given Java method is: @Test public void testAdd() { int a = 2; int b = 3; Assert.assertEquals(a, b); }


In [62]:
torch.cuda.empty_cache()    # release CUDA

# Push to HF

push to hf when you think its capable of pushing onto hf!

In [63]:
from huggingface_hub import HfApi, HfFolder, Repository

In [64]:
repo_name = new_model
organization_name = "CodexAI"
repo_url = f"{organization_name}/{repo_name}"

In [67]:
model.push_to_hub(repo_url, private=True)
tokenizer.push_to_hub(repo_url, private=True)

Configuration saved in CODEX-codet5-base\config.json
Configuration saved in CODEX-codet5-base\generation_config.json
Model weights saved in CODEX-codet5-base\model.safetensors
Uploading the following files to CodexAI/CODEX-codet5-base: config.json,generation_config.json,model.safetensors,README.md
model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/68/2e/682eb423fabd5c7ac4ea38d9b332dcba8a9d2b552b7ce791a413cd9bb6a28473/8cd46ae424f1e303e5aeb7348615272def1a44e45b81decdfdbd3ee41cb7bc2d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240907%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240907T082129Z&X-Amz-Expires=86400&X-Amz-Signature=0fdaf62568bdbe5fbb1d409253337924667fe82a29bffd69bc991f706f8398e4&X-Amz-SignedHeaders=host&partNumber=1&uploadId=4J6glKUfrcQqpAXkAyIn1AQZK8pVwkOA4

In [None]:
print(f"Model and Tokenizer saved at {repo_url}")