In [4]:
# %%capture 
# %pip install -r requirements.txt

In [1]:
import os

import torch
import transformers
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Short name of the checkpoint to fine-tune. It has its own name so it is not
# confused with the loaded network that a later cell binds to `model`.
model_name = "deepseek-coder-1.3b-instruct"

# Full Hub repo id of the base checkpoint.
base_model = f"deepseek-ai/{model_name}"

# Directory name under which the fine-tuned model and tokenizer are saved.
new_model = f"CODEX-{model_name}"

# Dataset name on the Hugging Face Hub.
dataset = "CodexAI/Deepseek-Coder"

In [3]:
# Never hard-code the access token: it grants write access to the account and
# travels with every copy of this notebook. Any token that was ever embedded
# here must be revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when the variable
# is unset, `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\walim\.cache\huggingface\token
Login successful


In [4]:
# Load the dataset identified by `dataset`. Despite the name, `df` is a
# DatasetDict of splits, not a DataFrame.
df = load_dataset(dataset)

In [5]:
# Show the available splits with their columns and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 78434
    })
    test: Dataset({
        features: ['instruction', 'output'],
        num_rows: 100
    })
})

In [6]:
# Inspect one raw training record (index 56 looks like an arbitrary pick).
df['train'][56]

{'instruction': 'Generate a unit test case for the following Java method: BundleEntityBuilder { public Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType, Document document, ProjectInfo projectInfo) { return build(bundle, bundleType, document, projectInfo, false); } @Inject  BundleEntityBuilder(final Set<EntityBuilder> entityBuilders, final BundleDocumentBuilder bundleDocumentBuilder,\n                        final BundleMetadataBuilder bundleMetadataBuilder,\n                        final EntityTypeRegistry entityTypeRegistry, PrivateKeyImportContextBuilder privateKeyImportContextBuilder); Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType,\n                                              Document document, ProjectInfo projectInfo); Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType,\n                                              Document document, ProjectInfo projectInfo, boolean generateMetadata); void addPrivateKeyContext

In [7]:
def formatting_func(ex):
    """Attach the full training string to a single dataset record.

    The record's ``instruction`` and ``output`` fields are joined with one
    newline and stored under the ``text`` key.

    Args:
        ex: Mapping with string fields ``"instruction"`` and ``"output"``.

    Returns:
        The same ``ex`` object, updated in place with the ``"text"`` entry.
    """
    prompt = ex["instruction"]
    completion = ex["output"]
    ex["text"] = "\n".join((prompt, completion))
    return ex

In [8]:
# Sanity-check the formatter on a single training record before mapping it
# over the whole dataset.
print(formatting_func(df['train'][56])["text"])

Generate a unit test case for the following Java method: BundleEntityBuilder { public Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType, Document document, ProjectInfo projectInfo) { return build(bundle, bundleType, document, projectInfo, false); } @Inject  BundleEntityBuilder(final Set<EntityBuilder> entityBuilders, final BundleDocumentBuilder bundleDocumentBuilder,
                        final BundleMetadataBuilder bundleMetadataBuilder,
                        final EntityTypeRegistry entityTypeRegistry, PrivateKeyImportContextBuilder privateKeyImportContextBuilder); Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType,
                                              Document document, ProjectInfo projectInfo); Map<String, BundleArtifacts> build(Bundle bundle, BundleType bundleType,
                                              Document document, ProjectInfo projectInfo, boolean generateMetadata); void addPrivateKeyContexts(Bundle bundle, Proj

In [9]:
# Apply `formatting_func` to every record of every split, which adds the
# `text` column used for training.
df = df.map(formatting_func)

In [10]:
# Preview the formatted text of the first test record. Selecting the row first
# reads just that record rather than the whole `text` column.
print(df["test"][0]["text"])

Generate a unit test case for the following Java method: CoderUtil { static <T> int[] getNullIndexes(T[] inputs) { int[] nullIndexes = new int[inputs.length]; int idx = 0; for (int i = 0; i < inputs.length; i++) { if (inputs[i] == null) { nullIndexes[idx++] = i; } } return Arrays.copyOf(nullIndexes, idx); } private  CoderUtil();   }
The unit test case for the given Java method is: @Test public void testGetNullIndexes() { byte[][] inputs = new byte[numInputs][]; inputs[0] = new byte[chunkSize]; inputs[1] = new byte[chunkSize]; for (int i = 2; i < 7; i++) { inputs[i] = null; } inputs[7] = new byte[chunkSize]; inputs[8] = new byte[chunkSize]; int[] nullIndexes = CoderUtil.getNullIndexes(inputs); assertEquals(2, nullIndexes[0]); assertEquals(3, nullIndexes[1]); assertEquals(4, nullIndexes[2]); assertEquals(5, nullIndexes[3]); assertEquals(6, nullIndexes[4]); }


In [11]:
# Bind the two splits to short names; they are passed to the trainer later.
train, test = df["train"], df["test"]

In [12]:
# Confirm the training split now carries the `text` column.
train

Dataset({
    features: ['instruction', 'output', 'text'],
    num_rows: 78434
})

In [13]:
# Confirm the test split now carries the `text` column.
test

Dataset({
    features: ['instruction', 'output', 'text'],
    num_rows: 100
})

In [14]:
# Load the tokenizer that belongs to the base checkpoint.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading tokenizer...


In [15]:
# Both the GPU and the CPU path load the weights in bfloat16.
torch_type = torch.bfloat16

if not torch.cuda.is_available():
    # No GPU found: fall back to the CPU.
    device = "cpu"
    print("I am begging for mercy already!")
else:
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    # Used as `device_map` when loading the model; intended to place the
    # whole model on GPU 0.
    device = {"": 0}

CUDA device: NVIDIA GeForce RTX 3060 Ti


In [16]:
# Name of the single scaled-dot-product-attention backend to leave enabled:
# "mem", "flash", or "math".
sdpa_kernel = "flash"

# Switch each backend on only if it is the selected one, off otherwise.
for _backend, _set_enabled in (
    ("mem", torch.backends.cuda.enable_mem_efficient_sdp),
    ("flash", torch.backends.cuda.enable_flash_sdp),
    ("math", torch.backends.cuda.enable_math_sdp),
):
    _set_enabled(sdpa_kernel == _backend)

In [17]:
# Load the base checkpoint with SDPA attention, using the device map and dtype
# chosen earlier. From this cell on, `model` is the loaded network.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    attn_implementation="sdpa",
    torch_dtype=torch_type,
    device_map=device,
)

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


In [18]:
# Print the module tree to see which linear layers exist in each decoder block.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm)

In [19]:
# Verify that the weights were loaded in the requested dtype.
model.dtype

torch.bfloat16

In [20]:
# LoRA adapter settings; handed to the trainer through `peft_config`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",  # adapt every linear projection
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [21]:
# Show the resolved configuration, including the fields left at their defaults.
lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules='all-linear', lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [22]:
# The model is loaded in bfloat16 and the training arguments set bf16=True,
# so this should report True before training is started.
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")   # must check

BF16 support is True




In [23]:
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir="./results",
    run_name="./loggings",
    overwrite_output_dir=True,
    report_to="none",  # can be "wandb", but we don't need it right now
    log_level="info",
    logging_first_step=True,
    logging_steps=5,
    # --- evaluation ---
    eval_strategy="steps",
    eval_steps=0.10,  # float < 1: fraction of the total training steps
    per_device_eval_batch_size=2,
    # --- checkpointing ---
    # One epoch is thousands of optimizer steps at over a minute each, so an
    # interrupted run must not lose everything: write a checkpoint at the same
    # cadence as evaluation and keep only the two most recent ones.
    save_strategy="steps",
    save_steps=0.10,
    save_total_limit=2,
    # --- optimisation ---
    num_train_epochs=1,  # for now=1, later=4
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size 2 * 4 = 8
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0,
    weight_decay=0.01,
    optim="adamw_torch_fused",
    # --- numerics ---
    bf16=True,
    tf32=True,
    # torch_compile=True # Install pytorch nightly using conda first. x2 - x5 speed
)

In [24]:
# Build the supervised fine-tuning trainer: the LoRA adapter is attached to the
# base model and the `text` column is packed into sequences of up to 1024 tokens.
# NOTE(review): this trl version warns that some of these arguments are
# deprecated on SFTTrainer and should be set through SFTConfig instead.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field="text",
    packing=True,
    max_seq_length=1024,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Using auto half precision backend


In [25]:
# List every parameter with its dtype and whether it will be trained, to check
# that only the LoRA matrices require gradients.
for name, param in trainer.model.named_parameters():
    print(name, "-->", param.dtype, param.requires_grad)

base_model.model.model.embed_tokens.weight --> torch.bfloat16 False
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight --> torch.bfloat16 False
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight --> torch.bfloat16 False
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight --> torch.bfloat16 False
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight --> torch.float32 True
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight --> torch.bfloat16 False


In [26]:
# Check GPU memory usage after the model and trainer have been created.
!nvidia-smi

Sat Sep  7 21:52:23 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti   WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   44C    P8             18W /  240W |    3599MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [27]:
# Inspect the tokenized training set as prepared by the trainer.
trainer.train_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 50041
})

In [28]:
# Inspect the tokenized evaluation set as prepared by the trainer.
trainer.eval_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 66
})

In [None]:
# Re-check GPU memory right before training starts.
!nvidia-smi

In [29]:
# Run the fine-tuning loop; loss, gradient norm and learning rate are logged
# on the first step and then every 5 steps.
trainer.train()

***** Running training *****
  Num examples = 50,041
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 6,255
  Number of trainable parameters = 14,991,360
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  0%|          | 1/6255 [01:23<144:47:38, 83.35s/it]

{'loss': 1.2906, 'grad_norm': 0.3088058829307556, 'learning_rate': 0.000499920063948841, 'epoch': 0.0}


  0%|          | 5/6255 [06:34<131:13:08, 75.58s/it]

{'loss': 1.0269, 'grad_norm': 0.18496465682983398, 'learning_rate': 0.0004996003197442046, 'epoch': 0.0}


  0%|          | 7/6255 [09:44<145:51:05, 84.04s/it]

In [59]:
# Write the fine-tuned model files and the tokenizer files into the
# `new_model` directory.
trainer.model.save_pretrained(new_model)
# Save through the `tokenizer` object that was passed to the trainer, instead
# of the trainer's `tokenizer` accessor, which newer transformers releases
# deprecate.
tokenizer.save_pretrained(new_model)

loading configuration file config.json from cache at C:\Users\walim\.cache\huggingface\hub\models--deepseek-ai--deepseek-coder-1.3b-instruct\snapshots\e063262dac8366fc1f28a4da0ff3c50ea66259ca\config.json
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32021,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "rope_type": "linear",
    "type": "linear"
  },
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cach

('CODEX-deepseek-coder-1.3b-instruct\\tokenizer_config.json',
 'CODEX-deepseek-coder-1.3b-instruct\\special_tokens_map.json',
 'CODEX-deepseek-coder-1.3b-instruct\\tokenizer.json')