In [1]:
# !pip install -U transformers datasets accelerate evaluate huggingface_hub
# --OR--
# !pip install -r requirements.txt

In [1]:
import os
import torch
import logging
import transformers
from evaluate import load
from torch.optim import AdamW
from huggingface_hub import login
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)


# Define Variables

In [2]:
base_model = "Salesforce/codet5-large"

new_model = "CODEX-codet5-large"

tokenizer_path = "tokenizer"

dataset_name = "CodexAI/Deepseek-Coder"

In [3]:
login('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',add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\walim\.cache\huggingface\token
Login successful


# Dataset
Load the dataset using `load_dataset()` but the dataset must be in `.parquet` format.
or else clone the dataset repo from HF, it's fast as fuck!

In [5]:
dataset = load_dataset(dataset_name)
print(dataset)

# Playing with Dataset

In [None]:
train=dataset['train']
print(train)

In [None]:
test=dataset['test']
print(test)

## Inspecting dataset instance
Here dataset instance is printed just to see how the dataset looks like, skip these steps if your are **gay**

In [None]:
print(train['instruction'][0])

In [None]:
print(train['output'][0])

In [None]:
print("Loading tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

In [None]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

In [None]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

In [None]:
tokenizer.convert_tokens_to_string(tokens)

In [None]:
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

In [None]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
print(batch)

# Tokenizing Dataset

In [7]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [8]:
print("Tokenizing dataset...")

Tokenizing dataset...


In [None]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)
print(train)

In [None]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)
print(test)

In [23]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [None]:
train = train.select(range(1000))  # seleting 1k dataset, you dont have to
print(train)

# Fine-tuning

In [9]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [10]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: NVIDIA GeForce RTX 3060 Ti


`tf16=True` For ampere GPUs only!

In [28]:
# torch._dynamo.config.suppress_errors = True
# torch.backends.cuda.matmul.allow_tf32 = True

else check for `bf16`, if False then use `fp16=True`

In [36]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

BF16 support is True


In [39]:
os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
os.environ["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = "2"


2


## Loading base model

In [11]:
model = T5ForConditionalGeneration.from_pretrained(base_model,device_map=device)

In [12]:
print(model)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [13]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 737639424
             all model parameters: 737639424 
             percentage of trainable model parameters: 100.0 %


## LoRA Config for PEFT

no need for now

In [32]:
# lora_config = LoraConfig(
#     r=32,  # rank 16,32,64
#     lora_alpha=16, # LoRA Scaling factor keep 16 or 32
#     target_modules=['q', 'v'], # The modules(for example, attention blocks) to apply the LoRA update matrices.
#     lora_dropout = 0.1, # 0.05
#     bias='none',
#     task_type=TaskType.SEQ_2_SEQ_LM ## flan-t5
# )

In [33]:
# peft_model = get_peft_model(model, lora_config)
# print(peft_model)

In [34]:
# print(print_number_of_trainable_model_parameters(peft_model))

## Training args

In [36]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./results",
#     run_name ="./loggings",
#     overwrite_output_dir=True,
#     eval_strategy="steps",
#     learning_rate=1e-4,
#     gradient_accumulation_steps=2,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     # auto_find_batch_size = True, # for CUDA out of memory 
#     weight_decay=0.01,
#     num_train_epochs=1,
#     tf32=True,  # Switching to FP16, BF16 = True on kaggle
#     optim="adamw_torch",
#     save_strategy="no",
#     log_level="info",
#     logging_first_step=True,
#     report_to='none'
# )

In [37]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
    # model=peft_model 
)

In [38]:
# trainer=Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train,
#     eval_dataset=test,
#     data_collator=data_collator
# )

In [None]:
print("Starting trainer...")

In [15]:
!nvidia-smi

Sun Sep 15 19:32:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 561.09                 Driver Version: 561.09         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   47C    P8             18W /  240W |    3499MiB /   8192MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### `trainer.train()`

In [41]:
# trainer.train()

### or you can go with `Accelerator`

In [None]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
epochs = 4  # set the epochs
accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=5e-4, weight_decay=0.01)
train_dataloader = DataLoader(train, batch_size=1, shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(test, batch_size=1, collate_fn=data_collator)

In [None]:
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
logger.info("***** Running training *****")
logger.info(f"Num examples = {len(train_dataloader.dataset)}")
logger.info(f"Num Epochs = 1")
logger.info(f"Total optimization steps = {len(train_dataloader)}")

In [26]:
for epoch in range(epochs):
    logger.info(f"Starting epoch {epoch + 1}")
    model.train()

    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)  # Backpropagation, accelerator will take care of it
        optimizer.step()
        optimizer.zero_grad()

        if step % 10 == 0:
            logger.info(f"Epoch: {epoch + 1}, Step: {step}, Loss: {loss.item()}")

    # Evaluation loop (optional)
    total_eval_loss = 0
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            eval_loss = outputs.loss
            total_eval_loss += eval_loss.item()

        if step % 10 == 0:
            logger.info(f"Evaluation Step: {step}, Eval Loss: {eval_loss.item()}")

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    logger.info(f"Epoch {epoch + 1} Evaluation Complete. Average Eval Loss: {avg_eval_loss}")


INFO:__main__:***** Running training *****
INFO:__main__:  Num examples = 100
INFO:__main__:  Num Epochs = 1
INFO:__main__:  Total optimization steps = 100
INFO:__main__:Starting epoch 1
INFO:__main__:Epoch: 1, Step: 0, Loss: 20.871036529541016
INFO:__main__:Epoch: 1, Step: 10, Loss: 1.985264778137207
INFO:__main__:Epoch: 1, Step: 20, Loss: 6.07259464263916
INFO:__main__:Epoch: 1, Step: 30, Loss: 3.438760995864868
INFO:__main__:Epoch: 1, Step: 40, Loss: 2.1230361461639404
INFO:__main__:Epoch: 1, Step: 50, Loss: 2.151106595993042
INFO:__main__:Epoch: 1, Step: 60, Loss: 3.6292030811309814
INFO:__main__:Epoch: 1, Step: 70, Loss: 0.4435405135154724
INFO:__main__:Epoch: 1, Step: 80, Loss: 1.683504581451416
INFO:__main__:Epoch: 1, Step: 90, Loss: 0.4029475450515747
INFO:__main__:Evaluation Step: 0, Eval Loss: 2.8816373348236084
INFO:__main__:Evaluation Step: 10, Eval Loss: 0.9954522848129272
INFO:__main__:Evaluation Step: 20, Eval Loss: 4.729071617126465
INFO:__main__:Evaluation Step: 30, Ev

In [27]:
print("finished. Saving model...")
model.save_pretrained(new_model)
tokenizer.save_pretrained(tokenizer_path)
print(f"Model saved at : {new_model}")

finished. Saving model...
Model saved at : CODEX-codet5-large


In [28]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [30]:
# Load model amd tokenier
model = T5ForConditionalGeneration.from_pretrained(new_model).to(device)
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path,device=device)

In [31]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32100, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [32]:
def generate_unit_tests(instruction):
    
  inputs = tokenizer(instruction, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
  inputs = {key: value.to(device) for key, value in inputs.items()}

  outputs = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=512,
      num_beams=5,
      do_sample=True,
      temperature=0.7,
      top_k=100,
      top_p=0.9,
      no_repeat_ngram_size=5,
      repetition_penalty=1.5,
      length_penalty=1.0,
      early_stopping=True
  )
  generated_test = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return generated_test

In [33]:
instruction = """
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}
"""
prompt="Generate a unit test case for the following Java method: "+instruction
# print(prompt)

Generate a unit test case for the following Java method: 
public class SimpleCalculator {
    // Method to add two numbers
    public int add(int a, int b) {
        return a + b;
    }

    // Method to subtract two numbers
    public int subtract(int a, int b) {
        return a - b;
    }

    // Method to multiply two numbers
    public int multiply(int a, int b) {
        return a * b;
    }

    // Method to divide two numbers
    // Throws ArithmeticException if divisor is zero
    public double divide(int a, int b) {
        if (b == 0) {
            throw new ArithmeticException("Cannot divide by zero");
        }
        return (double) a / b;
    }
}



In [34]:
generated_test = generate_unit_tests(prompt)
print(generated_test)

 test caseTest unit. Java.(assert., newgetWork2Equals.);


# Push to HF

In [35]:
repo_name = new_model
repo_url = f"CodexAI/{repo_name}"

### use the `push_to_hub()` , but its shit

In [None]:
model.push_to_hub(repo_url, private=True)
tokenizer.push_to_hub(repo_url, private=True)

### use the `HfApi` which is recomended

In [None]:
from huggingface_hub import HfApi, create_repo

In [None]:
create_repo(repo_url, repo_type="model", private=True,exist_ok=True)

In [None]:
api = HfApi()
api.upload_folder(folder_path=new_model,repo_id=repo_url)

In [None]:
print(f"Model and Tokenizer saved at {repo_url}")

In [None]:
torch.cuda.empty_cache()    # release CUDA