In [1]:
%%capture
%pip install -U transformers datasets torch sentencepiece peft accelerate evaluate

In [2]:
import os
import json
import torch
import shutil
import logging
import transformers
import pandas as pd

In [3]:
from evaluate import load
from datasets import Dataset, load_dataset
from huggingface_hub import login, Repository

from transformers import (
    AdamW,
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel, 
    PeftConfig
)

# Define Variables

In [4]:
base_model = "Salesforce/codet5-base"

new_model = "codet5-base_CODEX"

model_path = "model"

tokenizer_path = "tokenizer"

dataset_path = "dataset"  # dataset dir path

dataset = "CodexAI/Eval4Deepseek-Coder"  # dataset name at huggingface

repo_url = f'https://huggingface.co/datasets/{dataset}'

In [5]:
if not os.path.exists(dataset_path):
    os.makedirs(dataset_path)

# Get Dataset
Clone the dataset from HF, it's fast as fuck!

In [11]:
# login('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')

In [12]:
print("Cloning Dataset...")

Cloning Dataset...


In [13]:
# repo = Repository(local_dir=dataset_dir,clone_from=repo_url)

# Playing with Dataset

In [14]:
def load_json_data(dir_name):
  """
  This function is used to load the json data from the given directory.
  After reading the data store them in a list
  After storing all the data in a list we can return the list.
  """

  data=[]
  for root_folder in os.listdir(dir_name):
    if root_folder!=".git" and root_folder!=".gitattributes":
      for files in os.listdir(os.path.join(dir_name,root_folder)):
        if files.endswith(".json"):
          with open(os.path.join(dir_name,root_folder,files),"r")as f:
            json_file=json.load(f)
            data.append(json_file)
  return data

In [15]:
print(f"Loading dataset from /{dataset_path}/...")

Loading dataset from /dataset/...


In [16]:
json_data=load_json_data(dataset_path)

In [17]:
print(f"Length of loaded dataset is: {len(json_data)}")

Length of loaded dataset is: 78534


In [18]:
tmp=json_data  # i case you want this again

In [22]:
json_data=json_data[:100]

In [23]:
print(f"Length of dataset is: {len(json_data)}")

Length of dataset is: 100


In [24]:
print("Loading dataset...")

Loading dataset...


In [25]:
df=Dataset.from_list(json_data)

In [26]:
df

Dataset({
    features: ['instruction', 'output'],
    num_rows: 100
})

In [27]:
df.features

{'instruction': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None)}

In [28]:
df['instruction'][0]

'Generate a unit test case for the following Java method: CompositeAppender extends UnsynchronizedAppenderBase<E> implements AppenderAttachable<E> { public Appender<E> getAppender(String name) { return aai.getAppender(name); }  void addAppender(Appender<E> newAppender); Iterator<Appender<E>> iteratorForAppenders(); Appender<E> getAppender(String name); boolean isAttached(Appender<E> eAppender); void detachAndStopAllAppenders(); boolean detachAppender(Appender<E> eAppender); boolean detachAppender(String name);  }'

In [29]:
df['output'][0]

'The unit test case for the given Java method is: @Test public void testSimpleAppender() throws JoranException { LoggerContext context = new LoggerContext(); URL resource = getClass().getResource("/logback-with-composite-appender.xml"); JoranConfigurator configurator = new JoranConfigurator(); configurator.setContext(context); configurator.doConfigure(resource); ch.qos.logback.classic.Logger logger = context.getLogger(Logger.ROOT_LOGGER_NAME); CompositeAppender<ILoggingEvent> composite = (CompositeAppender<ILoggingEvent>) logger.getAppender("CONSOLE_AND_FILE"); ListAppender<ILoggingEvent> file = (ListAppender<ILoggingEvent>) composite.getAppender("FILE"); ListAppender<ILoggingEvent> console = (ListAppender<ILoggingEvent>) composite.getAppender("CONSOLE"); logger.info("hello world"); assertThat(file.list.get(0).getMessage()).isEqualTo("hello world"); assertThat(console.list.get(0).getMessage()).isEqualTo("hello world"); }'

In [30]:
print("Spliting dataset...")

Spliting dataset...


In [31]:
df=df.train_test_split(test_size=0.2)

In [32]:
print(df)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 80
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 20
    })
})


In [33]:
train=df['train']
test=df['test']

In [34]:
train

Dataset({
    features: ['instruction', 'output'],
    num_rows: 80
})

In [35]:
test

Dataset({
    features: ['instruction', 'output'],
    num_rows: 20
})

In [36]:
print("Checking dataset...")

Checking dataset...


In [37]:
tokenizer = RobertaTokenizer.from_pretrained(base_model)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]



In [38]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [1, 4625, 279, 2836, 1842, 648, 364, 326, 3751, 5110, 707, 30, 4049, 2655, 288, 1071, 514, 8635, 1444, 1435, 288, 327, 787, 1444, 31, 289, 4049, 2655, 12, 780, 1048, 1769, 1250, 4908, 5621, 514, 9968, 1444, 5621, 1250, 353, 1638, 1444, 19323, 5621, 514, 8635, 1444, 5621, 1250, 353, 1685, 1444, 19323, 5621, 1250, 1914, 12, 780, 1177, 1769, 225, 289, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [39]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<s>', 'Generate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'ĠVersion', 'Range', 'Ġ{', 'Ġpublic', 'ĠString', 'ĠgetStart', 'Version', '()', 'Ġ{', 'Ġreturn', 'Ġstart', 'Version', ';', 'Ġ}', 'ĠVersion', 'Range', '(', 'String', 'Ġrange', ');', 'Ġboolean', 'ĠisValid', '();', 'ĠString', 'ĠgetEnd', 'Version', '();', 'Ġboolean', 'Ġis', 'End', 'Version', 'Included', '();', 'ĠString', 'ĠgetStart', 'Version', '();', 'Ġboolean', 'Ġis', 'Start', 'Version', 'Included', '();', 'Ġboolean', 'Ġcontains', '(', 'String', 'Ġversion', ');', 'Ġ', 'Ġ}', '</s>']


In [40]:
tokenizer.convert_tokens_to_string(tokens)

'<s>Generate a unit test case for the following Java method: VersionRange { public String getStartVersion() { return startVersion; } VersionRange(String range); boolean isValid(); String getEndVersion(); boolean isEndVersionIncluded(); String getStartVersion(); boolean isStartVersionIncluded(); boolean contains(String version);  }</s>'

In [41]:
print(f"Vocab size : {tokenizer.vocab_size}")

Vocab size : 32100


In [42]:
print(f"max length : {tokenizer.model_max_length}")

max length : 512


In [43]:
print(f"model input : {tokenizer.model_input_names}")

model input : ['input_ids', 'attention_mask']


In [44]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

In [45]:
batch

{'input_ids': tensor([[    1,  4625,   279,  2836,  1842,   648,   364,   326,  3751,  5110,
           707,    30,  4049,  2655,   288,  1071,   514,  8635,  1444,  1435,
           288,   327,   787,  1444,    31,   289,  4049,  2655,    12,   780,
          1048,  1769,  1250,  4908,  5621,   514,  9968,  1444,  5621,  1250,
           353,  1638,  1444, 19323,  5621,   514,  8635,  1444,  5621,  1250,
           353,  1685,  1444, 19323,  5621,  1250,  1914,    12,   780,  1177,
          1769,   225,   289,     2,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [46]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [47]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [48]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)

Mapping train data...


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [49]:
train

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

In [50]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)

Mappig test data...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [51]:
test

Dataset({
    features: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})

In [52]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [53]:
train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

# Fine-tuning

In [54]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device="cuda"
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16

CUDA device: Tesla P100-PCIE-16GB


In [55]:
peft_config=LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q","v"]
)

In [57]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [59]:
peft_model=get_peft_model(model,peft_config)

In [60]:
print(f"BF16 support is {transformers.file_utils.is_torch_bf16_available()}")

BF16 support is True




In [61]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=1,
    bf16=True,
    save_strategy="no"
)

In [62]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [63]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator
)

In [64]:
print("Start trainer...")

Start trainer...


In [65]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,3.292203


TrainOutput(global_step=5, training_loss=4.463361740112305, metrics={'train_runtime': 57.5015, 'train_samples_per_second': 1.391, 'train_steps_per_second': 0.087, 'total_flos': 49151496683520.0, 'train_loss': 4.463361740112305, 'epoch': 1.0})

In [66]:
print("finished. Saving model...")

finished. Saving model...


In [67]:
peft_model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.json',
 'tokenizer/merges.txt',
 'tokenizer/added_tokens.json')

# Evaluation

In [68]:
config = PeftConfig.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(base_model)
model = PeftModel.from_pretrained(model,model_path,is_trainable=True)

In [69]:
# check if it's working
model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 224,651,520 || trainable%: 0.7877


In [70]:
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)

In [72]:
df

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 80
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 20
    })
})

In [73]:
eval=df['test']

In [74]:
eval

Dataset({
    features: ['instruction', 'output'],
    num_rows: 20
})

## BLEU

In [76]:
bleu = load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [75]:
references = [[example['instruction']] for example in eval]
predictions = [example['output'] for example in eval]

In [77]:
bleu_score = bleu.compute(references=references, predictions=predictions)

In [81]:
print("BLEU score:", bleu_score['bleu'])

BLEU score: 0.08717793873845739


## code_eval

In [71]:
code_eval = load("code_eval")

Downloading builder script:   0%|          | 0.00/9.18k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

In [84]:
test_cases = [example['instruction'] for example in eval]
candidates = [[example['output']] for example in eval]

In [85]:
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2])

In [86]:
# Print the pass@1 and pass@2 scores
print("Pass@k:", pass_at_k)

Pass@k: {'pass@1': 0.0}


In [87]:
# Iterate over the results and print each one
for task_id, result_list in results.items():
    for result in result_list:
        idx, details = result
        print(f"Task ID: {details['task_id']}, Passed: {details['passed']}, Result: {details['result']}")

Task ID: 1, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 0, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 2, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 3, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 4, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 5, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 6, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 7, Passed: False, Result: failed: closing parenthesis '}' does not match opening parenthesis '(' (<string>, line 1)
Task ID: 8, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 10, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 9, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 11, Passed: False, Result: failed: invalid syntax (<string>, line 1)
Task ID: 12, Passed: False

# Inference

In [88]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,  # Enable sampling for diverse output
      temperature=0.2,  # Control randomness
      top_k=100,  # Limit the sampling pool to top K tokens
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )

  # Decode the generated output
  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [89]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
print(prompt)

Generate a unit test case for the following Java method: 
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}



In [91]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

public static void test( int a, intint b) {a * b;return a / b;double divide(int a, int b)Math.floor(a/b); }{} // Method todivide(int a,int b) {
           = (double) a % b;return a * b;// Method to multiplytwo numbers
    publicint


# Push to HF

In [92]:
from huggingface_hub import HfApi, HfFolder, Repository

In [93]:
repo_name = "CodeT5"
organization_name = "CodexAI"
repo_url = f"{organization_name}/{repo_name}"

In [94]:
model.push_to_hub(repo_url)
tokenizer.push_to_hub(repo_url)

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66cba040-5a66407133681e8e0dfb34a4;6a01727d-f96f-4236-9317-d3279aa88340)

Invalid username or password.