In [1]:
%%capture
%pip install -U transformers datasets torch sentencepiece peft accelerate evaluate

In [2]:
import os
import json
import torch
import shutil
import logging
import transformers
import pandas as pd

In [3]:
from evaluate import load
from datasets import Dataset, load_dataset
from huggingface_hub import login, Repository

from transformers import (
    AdamW,
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel, 
    PeftConfig
)

# Define Variables

In [5]:
base_model = "Salesforce/codet5-base"

new_model = "CODEX-codet5-base" # this is name used to push on hf

model_path = "model"

tokenizer_path = "tokenizer"

dataset_path = "dataset"  # dataset dir path

dataset = "CodexAI/Eval4Deepseek-Coder"  # dataset name at huggingface

repo_url = f'https://huggingface.co/datasets/{dataset}'

In [10]:
# shutil.rmtree("wandb")

In [None]:
# if not os.path.exists(dataset_path):
#     os.makedirs(dataset_path)

# Get Dataset
Clone the dataset from HF, it's fast as fuck!

In [None]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

In [None]:
# login(user_secrets.get_secret("HF_TOKEN"))

In [None]:
# print("Cloning Dataset...")

In [None]:
# repo = Repository(local_dir=dataset_dir,clone_from=repo_url)

# Playing with Dataset

In [11]:
def load_json_data(dir_name):
  """
  This function is used to load the json data from the given directory.
  After reading the data store them in a list
  After storing all the data in a list we can return the list.
  """

  data=[]
  for root_folder in os.listdir(dir_name):
    if root_folder!=".git" and root_folder!=".gitattributes":
      for files in os.listdir(os.path.join(dir_name,root_folder)):
        if files.endswith(".json"):
          with open(os.path.join(dir_name,root_folder,files),"r")as f:
            json_file=json.load(f)
            data.append(json_file)
  return data

In [14]:
print(f"Loading dataset from /{dataset_path}/...")
json_data=load_json_data(dataset_path)

Loading dataset from /dataset/...


In [15]:
print(f"Length of loaded dataset is: {len(json_data)}")

Length of loaded dataset is: 78534


In [16]:
tmp=json_data  # in case if this is required again

## Dataset Limit = 1000
Dataset limit is set to 1000 and this bcz of testing this script. For actual training change this value
`json_data[:1000]` to something greater or simply comment the cell below to use the complete dataset

In [19]:
json_data=json_data[:1000]
print(f"Length of dataset is: {len(json_data)}")

Length of dataset is: 1000


In [21]:
print("Loading dataset...")
df=Dataset.from_list(json_data)

Loading dataset...


## Inspecting dataset instance
Here dataset instance are printed just to see the dataset, skip these steps bcz you like to skip steps

In [22]:
df

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})

In [23]:
df.features

{'instruction': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None)}

In [24]:
df['instruction'][0]

'Generate a unit test case for the following Java method: BusinessListPresenterImpl implements BusinessListPresenter { @Override public void onShowBusinessesAroundCoordinates() { CoordinatesModel coordinatesModel = modelFactory.getCoordinates(view); showBusinessesAroundMessage(coordinatesModel.toString()); showBusinessesAroundCoordinates(coordinatesModel); } @Inject  BusinessListPresenterImpl(GetBusinessesAroundLocation getBusinessesAroundLocation,\n                              GetBusinessesAroundCoordinates getBusinessesAroundCoordinates,\n                              ModelMapperHolder modelMapperHolder,\n                              BusinessObserverFactory businessObserverFactory,\n                              UseCaseHandler useCaseHandler,\n                              BusinessListModelFactory modelFactory); @Override void onViewInitialized(@NonNull BusinessListView view); @Override void onShowBusinessesAroundLocation(); @Override void onShowBusinessesAroundCoordinates();  }'

In [25]:
df['output'][0]

'The unit test case for the given Java method is: @Test public void showBusinessesAroundCoordinates_executesGetBusinessesAroundCoordinatesUseCase() throws Exception { Coordinates coordinates = mock(Coordinates.class); BusinessObserver observer = mock(BusinessObserver.class); given_onShowBusinessesAroundCoordinates(coordinates, observer, ""); testSubject.onShowBusinessesAroundCoordinates(); verify(useCaseHandler).execute(getBusinessesAroundCoordinates, coordinates, observer); }'

## train test split
If you want to evaluate the model on other dataset then load that dataset and skip these steps or else just run these!

In [26]:
print("Spliting dataset...")
df=df.train_test_split(test_size=0.2)

Spliting dataset...


In [27]:
print(df)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 800
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 200
    })
})


In [28]:
train=df['train']
test=df['test']

In [29]:
train

Dataset({
    features: ['instruction', 'output'],
    num_rows: 800
})

In [30]:
test

Dataset({
    features: ['instruction', 'output'],
    num_rows: 200
})

In [32]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]



In [33]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 776, 1155, 74, 1179, 4792, 467, 1179, 288, 632, 6618, 1071, 987, 32, 45, 24916, 34, 1623, 12, 780, 2076, 13, 288, 987, 32, 1180, 34, 1516, 273, 446, 31, 514, 1983, 1489, 31, 1983, 1489, 273, 3970, 74, 1785, 18, 5290, 1435, 397, 1623, 21246, 74, 1785, 743, 18, 5290, 1435, 397, 315, 315, 397, 2076, 18, 5290, 1435, 397, 1623, 21246, 74, 1785, 743, 21, 18, 5290, 1435, 397, 467, 1179, 18, 7570, 9199, 18, 5290, 1435, 397, 1623, 21246, 74, 1785, 743, 22, 18, 5290, 5621, 776, 1155, 74, 7817, 2071, 273, 261, 21246, 74, 7817, 13, 3127, 2283, 18, 588, 1290, 921, 12, 2854, 1489, 16, 776, 1155, 74, 7817, 18, 1106, 1769, 1516, 273, 2071, 18, 588, 2909, 7675, 588, 3126, 5621, 987, 32, 45, 24916, 34, 1623, 3447, 273, 394, 2407, 32, 45, 24916, 34, 5621, 364, 12, 1180, 277, 294, 1516, 13, 288, 467, 24916, 31935, 273, 394, 29740, 5621, 31935, 18, 542, 3291, 12, 77, 18, 588, 9581, 1626, 10663, 31935, 18, 542, 548, 12, 77, 18, 588

In [34]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<s>', 'Generate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'ĠV', 'ia', 'f', 'Service', 'Ġimplements', 'ĠI', 'Service', 'Ġ{', 'Ġ@', 'Override', 'Ġpublic', 'ĠList', '<', 'I', 'SearchResult', '>', 'Ġsearch', '(', 'String', 'Ġword', ')', 'Ġ{', 'ĠList', '<', 'Item', '>', 'Ġitems', 'Ġ=', 'Ġnull', ';', 'ĠString', 'Ġfull', 'Url', ';', 'Ġfull', 'Url', 'Ġ=', 'Ġvia', 'f', 'URL', '.', 'trim', '()', 'Ġ+', 'Ġsearch', 'Via', 'f', 'URL', 'Path', '.', 'trim', '()', 'Ġ+', 'Ġ"', 'Ġ"', 'Ġ+', 'Ġword', '.', 'trim', '()', 'Ġ+', 'Ġsearch', 'Via', 'f', 'URL', 'Path', '1', '.', 'trim', '()', 'Ġ+', 'ĠI', 'Service', '.', 'START', 'INDEX', '.', 'trim', '()', 'Ġ+', 'Ġsearch', 'Via', 'f', 'URL', 'Path', '2', '.', 'trim', '();', 'ĠV', 'ia', 'f', 'Reply', 'Ġrep', 'Ġ=', 'Ġ(', 'Via', 'f', 'Reply', ')', 'Ġrest', 'Template', '.', 'get', 'For', 'Object', '(', 'full', 'Url', ',', 'ĠV', 'ia', 'f', 'Reply', '.', 'class', ');', 'Ġitems', 'Ġ=', 'Ġrep', '.', 'get', 'Channel', '().

In [35]:
tokenizer.convert_tokens_to_string(tokens)

'<s>Generate a unit test case for the following Java method: ViafService implements IService { @Override public List<ISearchResult> search(String word) { List<Item> items = null; String fullUrl; fullUrl = viafURL.trim() + searchViafURLPath.trim() + " " + word.trim() + searchViafURLPath1.trim() + IService.STARTINDEX.trim() + searchViafURLPath2.trim(); ViafReply rep = (ViafReply) restTemplate.getForObject(fullUrl, ViafReply.class); items = rep.getChannel().getItems(); List<ISearchResult> searchResults = new ArrayList<ISearchResult>(); for(Item i : items) { ISearchResult searchResult = new SearchResult(); searchResult.setDescription(i.getPubDate()); searchResult.setId(i.getLink()); searchResult.setName(i.getTitle()); searchResults.add(searchResult); } return searchResults; }  @Override String getServiceId(); @Override String getName(); @Override List<ISearchResult> search(String word);  }</s>'

In [36]:
print(f"Vocab size : {tokenizer.vocab_size}")

Vocab size : 32100


In [37]:
print(f"max length : {tokenizer.model_max_length}")

max length : 512


In [38]:
print(f"model input : {tokenizer.model_input_names}")

model input : ['input_ids', 'attention_mask']


In [39]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

In [40]:
batch

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,   776,  1155,    74,  1179,  4792,   467,  1179,   288,
           632,  6618,  1071,   987,    32,    45, 24916,    34,  1623,    12,
           780,  2076,    13,   288,   987,    32,  1180,    34,  1516,   273,
           446,    31,   514,  1983,  1489,    31,  1983,  1489,   273,  3970,
            74,  1785,    18,  5290,  1435,   397,  1623, 21246,    74,  1785,
           743,    18,  5290,  1435,   397,   315,   315,   397,  2076,    18,
          5290,  1435,   397,  1623, 21246,    74,  1785,   743,    21,    18,
          5290,  1435,   397,   467,  1179,    18,  7570,  9199,    18,  5290,
          1435,   397,  1623, 21246,    74,  1785,   743,    22,    18,  5290,
          5621,   776,  1155,    74,  7817,  2071,   273,   261, 21246,    74,
          7817,    13,  3127,  2283,    18,   588,  1290,   921,    12,  2854,
          1489,    16,   776,  1155,  

In [41]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [46]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [47]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [48]:
train

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [49]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [50]:
test

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 200
})

In [51]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [52]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

# Fine-tuning

In [64]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16

CUDA device: Tesla P100-PCIE-16GB


In [65]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Salesforce--codet5-base/snapshots/4078456db09ba972a3532827a0b5df4da172323c/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "

In [66]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [67]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 222882048
             all model parameters: 222882048 
             percentage of trainable model parameters: 100.0 %


In [68]:
lora_config = LoraConfig(
    r=32, #rank 32,
    lora_alpha=32, ## LoRA Scaling factor 
    target_modules=['q', 'v'], ## The modules(for example, attention blocks) to apply the LoRA update matrices.
    lora_dropout = 0.05,
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM ## flan-t5
)

In [69]:
peft_model = get_peft_model(model, lora_config)

In [70]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
             all model parameters: 226420992 
             percentage of trainable model parameters: 1.5629928871612753 %


In [71]:
print(f"BF16 support is {transformers.file_utils.is_torch_bf16_available()}")

BF16 support is True




In [72]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=5e-5, # default, change to (1e-3) later
    gradient_accumulation_steps=1,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
    auto_find_batch_size = True, # for CUDA out of memory 
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    optim="adamw_hf",
    save_strategy="no",
    log_level="info",
    logging_first_step=True,
    report_to='none' ## can be wandb, but we dont need right now!
#     torch_empty_cache_steps=1
)

PyTorch: setting up devices


In [73]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)

In [74]:
trainer=Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

Using auto half precision backend


In [76]:
print("Starting trainer...")

Starting trainer...


In [78]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 3,538,944


Epoch,Training Loss,Validation Loss
1,4.2129,1.502766



***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=2.6558797931671143, metrics={'train_runtime': 145.9267, 'train_samples_per_second': 5.482, 'train_steps_per_second': 0.685, 'total_flos': 495863621222400.0, 'train_loss': 2.6558797931671143, 'epoch': 1.0})

In [79]:
print("finished. Saving model...")

finished. Saving model...


In [80]:
peft_model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Salesforce--codet5-base/snapshots/4078456db09ba972a3532827a0b5df4da172323c/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

In [81]:
# torch.cuda.empty_cache()

# Evaluation

In [82]:
config = PeftConfig.from_pretrained("model")
model = T5ForConditionalGeneration.from_pretrained(base_model)
model = PeftModel.from_pretrained(model,"model",is_trainable=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Salesforce--codet5-base/snapshots/4078456db09ba972a3532827a0b5df4da172323c/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "classifier_dropout": 0.0,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "

In [83]:
# check if it's working
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 226,420,992 || trainable%: 1.5630


In [90]:
model.eval()

# this must have (base_model)

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32100, 768)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32100, 768)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=32, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=32, out_features=768, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
            

In [84]:
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

loading file vocab.json
loading file merges.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json


In [85]:
df

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 800
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 200
    })
})

In [86]:
eval=df['test']

In [87]:
eval

Dataset({
    features: ['instruction', 'output'],
    num_rows: 200
})

In [88]:
tokenized_input = tokenizer([example['instruction'] for example in eval], return_tensors="pt", padding=True, truncation=True, max_length=512)

In [91]:
predicted_outputs = model.base_model.generate(input_ids=tokenized_input["input_ids"], attention_mask=tokenized_input["attention_mask"], max_length=512)

In [92]:
predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predicted_outputs]

In [93]:
# Set up the environment variable for code evaluation
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

## BLEU

In [94]:
bleu = load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [95]:
# Use the output from the dataset as references
references = [[example['output']] for example in eval]

In [96]:
bleu_score = bleu.compute(references=references, predictions=predictions)
print("BLEU score:", bleu_score['bleu'])

BLEU score: 0.009989032637423302


## code_eval

In [97]:
code_eval = load("code_eval")

Downloading builder script:   0%|          | 0.00/9.18k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

In [98]:
# Convert predictions to the required format for code_eval
candidates = [[pred] for pred in predictions]

In [99]:
# Original instructions (methods) are the test cases
test_cases = [example['output'] for example in eval]

In [100]:
pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2])
print("Pass@k:", pass_at_k)

  self.pid = os.fork()


Pass@k: {'pass@1': 0.0}


# Inference

In [103]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,  # Enable sampling for diverse output
      temperature=0.2,  # Control randomness
      top_k=100,  # Limit the sampling pool to top K tokens
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  # Decode the generated output
  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [104]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
print(prompt)

Generate a unit test case for the following Java method: 
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}



In [105]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

{
            = 0.0;return a / b;double multiply(int a, int b){
            = (double) aa % b;}Method to subtract two numbersint subtract(int a,int b) {return a - b;int add(int a,int b) {return a- b;
    }// Method tosubtract(int a,


In [106]:
torch.cuda.empty_cache()

# Push to HF

In [107]:
from huggingface_hub import HfApi, HfFolder, Repository

In [108]:
repo_name = new_model
organization_name = "CodexAI"
repo_url = f"{organization_name}/{repo_name}"

In [109]:
peft_model.push_to_hub(repo_url)
tokenizer.push_to_hub(repo_url)