In [1]:
import torch
from torch import nn

ModuleNotFoundError: No module named 'torch'

In [2]:
torch.cuda.get_device_properties(3)

_CudaDeviceProperties(name='NVIDIA A100 80GB PCIe', major=8, minor=0, total_memory=81037MB, multi_processor_count=108)

In [3]:
torch.cuda.set_device(3)

In [4]:
device = torch.device('cuda:3' if torch.cuda.is_available() else "cpu")
print(device)

cuda:3


In [5]:
torch.cuda.current_device()

3

### Pip installs

In [6]:
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes accelerate==0.27.2 #0.20.3  # we need latest transformers for this
!pip install git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
!pip install datasets==2.18.0 #2.10.1
import locale # colab workaround
locale.getpreferredencoding = lambda: "UTF-8" # colab workaround
!pip install wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers.git@main
  Cloning https://github.com/huggingface/transformers.git (to revision main) to /tmp/pip-req-build-qjok0_45
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-qjok0_45
  Resolved https://github.com/huggingface/transformers.git to commit e34da3ee3c9d2d628fdbeb60cee45c4f8f32945a
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/peft.git@e536616888d51b453ed354a6f1e243fecb02ea08
  Cloning https://github.com/huggingface/peft.git (to revision e536616888d51b453ed354a6f1e243fecb02ea08) to /tmp/pip-req-build-a42__ghe
  Running command git clone --

In [7]:
pip install huggingface_hub

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Loading Libraries

In [8]:
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq


2024-04-23 17:37:08.771976: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-23 17:37:08.815614: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Load Dataset

In [9]:
from datasets import load_dataset

dataset = load_dataset("YabTad/CodeRev_BiData", split="train")

data_train = dataset.train_test_split(train_size=0.9)["train"]
data_eval = dataset.train_test_split(train_size=0.1)["train"]

In [1]:
#data_test = load_dataset("YabTad/CodeRev_BiData", split="test")

In [10]:
data_train

Dataset({
    features: ['OriginalCode', 'ReviewedCode'],
    num_rows: 13927
})

In [11]:
data_eval

Dataset({
    features: ['OriginalCode', 'ReviewedCode'],
    num_rows: 1547
})

### Load Model

In [12]:
base_model = "codellama/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Tokenization
Setup some tokenization settings like left padding because it makes training use less memory:

In [14]:
tokenizer.pad_token = tokenizer.eos_token

tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

Setup the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

In [15]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

And run convert each data_point into a prompt that I found online that works quite well:

In [16]:
data_train

Dataset({
    features: ['OriginalCode', 'ReviewedCode'],
    num_rows: 13927
})

In [17]:
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""You are a code reviewing model. Your job is to improve Java code. You are given a Java method code as input. 

You must output the improved version of the code provided as input.

### Input:
{data_point["OriginalCode"]}

### Response:
{data_point["ReviewedCode"]}
"""
    return tokenize(full_prompt)

Reformat to prompt and tokenize each sample:

In [18]:
tokenized_train_dataset = data_train.map(generate_and_tokenize_prompt)
tokenized_val_dataset = data_eval.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/13927 [00:00<?, ? examples/s]

Map:   0%|          | 0/1547 [00:00<?, ? examples/s]

In [19]:
tokenized_train_dataset

Dataset({
    features: ['OriginalCode', 'ReviewedCode', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 13927
})

In [20]:
tokenized_val_dataset

Dataset({
    features: ['OriginalCode', 'ReviewedCode', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1547
})

### Setup LoRA

In [21]:
model.train() # put model back into training mode
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

To resume from a checkpoint, set resume_from_checkpoint to the path of the adapter_model.bin you want to resume from. This code'll replace the lora adapter attached to the model:

In [22]:
resume_from_checkpoint = "" # set this to the adapter_model.bin file you want to resume from

if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"Restarting from {resume_from_checkpoint}")
        adapters_weights = torch.load(resume_from_checkpoint)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")

Optional stuff to setup Weights and Biases to view training graphs:

In [23]:
wandb_project = "Bi_CodeReview_codeLlama"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project
    os.environ['WANDB_NOTEBOOK_NAME'] = 'CodeReviewBi_CodeLlama'

In [24]:
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

### Training arguments
If you run out of GPU memory, change per_device_train_batch_size. The gradient_accumulation_steps variable should ensure this doesn't affect batch dynamics during the training run. All the other variables are standard stuff that I wouldn't recommend messing with:

In [25]:
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "BiCodeReview-llama-FinalRun_f"

training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=400,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir=output_dir,
        load_best_model_at_end=False,
        group_by_length=True,
        report_to="wandb",
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

max_steps is given, it will override any value given in num_train_epochs


Then we do some pytorch-related optimisation (which just make training faster but don't affect accuracy):

In [26]:
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)

compiling the model


In [27]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtadesse-yeabsira18[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
20,1.2432,1.079924
40,0.5372,0.442265
60,0.3333,0.318545
80,0.2918,0.289387
100,0.2782,0.260709
120,0.253,0.241371
140,0.2247,0.238494
160,0.2288,0.230341
180,0.2393,0.225947
200,0.2314,0.225471


TrainOutput(global_step=400, training_loss=0.31450404465198517, metrics={'train_runtime': 7947.7446, 'train_samples_per_second': 6.442, 'train_steps_per_second': 0.05, 'total_flos': 5.201427721833677e+17, 'train_loss': 0.31450404465198517, 'epoch': 3.669724770642202})

In [28]:
model.save_pretrained("bi_finetuned-model")