In [1]:
# %%capture
# %pip install -U accelerate peft bitsandbytes transformers trl evaluate attrdict tqdm datasets
# --OR--
# %%capture
# %pip install -r requirements.txt

In [2]:
import os
import torch
import json
import shutil
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel, 
    PeftConfig
)
from evaluate import load
from trl import SFTTrainer
from datasets import Dataset, load_dataset
from huggingface_hub import login, Repository

: 

In [3]:
model = "deepseek-coder-1.3b-instruct"  # model to be fine-tuned

base_model = f"deepseek-ai/{model}" 

new_model = f"CODEX-{model}"  # fine-tunned model name

dataset_path = "dataset"  # dataset dir path

dataset = "CodexAI/Deepseek-Coder"  # dataset name at huggingface

repo_url = f'https://huggingface.co/datasets/{dataset}'

In [None]:
login('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',add_to_git_credential=True)

In [6]:
repo = Repository(local_dir=dataset_path,clone_from=repo_url)

In [7]:
def load_json_data(dir_name):

  data=[]
  for root_folder in os.listdir(dir_name):
    if root_folder!=".git" and root_folder!=".gitattributes":
      for files in os.listdir(os.path.join(dir_name,root_folder)):
        if files.endswith(".json"):
          with open(os.path.join(dir_name,root_folder,files),"r")as f:
            json_file=json.load(f)
            data.append(json_file)
  return data

In [8]:
print(f"Loading dataset from ./{dataset_path}/")
json_data=load_json_data(dataset_path)
print(f"Length of loaded dataset is: {len(json_data)}")

Loading dataset from /dataset/...


In [10]:
tmp=json_data  # in case if this is required again

In [11]:
json_data=json_data[:1000]
print(f"Length of dataset is: {len(json_data)}")

Length of dataset is: 1000


In [12]:
print("Loading dataset...")
df=Dataset.from_list(json_data)
print(df)

Loading dataset...


In [13]:
df

Dataset({
    features: ['instruction', 'output'],
    num_rows: 1000
})

In [14]:
df.features

{'instruction': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None)}

In [15]:
print(df['instruction'][0])

'Generate a unit test case for the following Java method: SourceFileResolver { public static File resolveSourceFile( List<String> sourcePaths, String sourceFileName, String groupId, String artifactId ) { return resolveSourceFile( sourceFileName, PathUtil.filesList( sourcePaths ), groupId, artifactId ); }  static File resolveSourceFile( List<String> sourcePaths, String sourceFileName, String groupId,\n                                          String artifactId ); static File resolveSourceFile( List<String> sourcePaths, String sourceFile ); static File resolveSourceFile( String sourceFileName, List<File> sourceRoots ); static File resolveSourceFile( String sourceFileName, List<File> sourceRoots, String groupId,\n                                          String artifactId );  }'

In [16]:
print(df['output'][0])

'The unit test case for the given Java method is: @Test public void resolveMultipleRoots() { File file = SourceFileResolver.resolveSourceFile( null, getDir( "nroots/root1", "nroots/root2", "nroots/root3" ), null, null ); Assert.assertTrue( file.exists() ); MatcherAssert.assertThat( file.getName(), CoreMatchers.equalTo( "root.as" ) ); }'

In [17]:
print("Spliting dataset...")
df=df.train_test_split(test_size=0.2)
print(df)

Spliting dataset...


In [19]:
train=df['train']
test=df['test']

In [20]:
print(train)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 800
})

In [21]:
print(test)

Dataset({
    features: ['instruction', 'output'],
    num_rows: 200
})

In [22]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

In [23]:
instruction = tokenizer(train['instruction'][0])
print(instruction)

{'input_ids': [32013, 7605, 387, 245, 5621, 1719, 1452, 327, 254, 1884, 9840, 2040, 25, 380, 6327, 508, 7207, 507, 1171, 12353, 7928, 1270, 3314, 1013, 51, 8121, 380, 6327, 508, 29, 323, 3341, 938, 10647, 7, 10647, 270, 8, 8474, 13147, 10647, 508, 3305, 507, 967, 3341, 938, 10647, 7, 65, 13, 24657, 7, 10647, 508, 13, 6787, 57, 650, 270, 477, 611, 207, 1171, 12353, 7928, 3314, 323, 3341, 938, 10647, 7, 10647, 270, 477, 1171, 12353, 7928, 1171, 6159, 1476, 54, 1661, 787, 1195, 309, 19791, 2456, 3314, 323, 3341, 938, 10647, 7, 3667, 27, 51, 29, 495, 82, 11, 380, 6327, 270, 477, 1171, 12353, 7928, 1171, 6159, 1476, 54, 1661, 787, 1195, 309, 19791, 2456, 3314, 323, 3341, 938, 10647, 7, 2005, 1208, 11, 380, 6327, 270, 477, 207, 611], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [24]:
tokens = tokenizer.convert_ids_to_tokens(instruction.input_ids)
print(tokens)

['<｜begin▁of▁sentence｜>', 'Gener', 'ate', 'Ġa', 'Ġunit', 'Ġtest', 'Ġcase', 'Ġfor', 'Ġthe', 'Ġfollowing', 'ĠJava', 'Ġmethod', ':', 'ĠB', 'undle', 'able', 'Util', 'Ġ{', 'Ġ@', 'Non', 'Null', 'Ġpublic', 'Ġstatic', 'Ġ<', 'T', 'Ġextends', 'ĠB', 'undle', 'able', '>', 'ĠT', 'Ġmaterial', 'ize', 'Bundle', '(', 'Bundle', 'Ġb', ')', 'Ġthrows', 'ĠBad', 'Bundle', 'able', 'Exception', 'Ġ{', 'Ġreturn', 'Ġmaterial', 'ize', 'Bundle', '(', 'b', '.', 'getString', '(', 'Bundle', 'able', '.', 'CL', 'Z', '),', 'Ġb', ');', 'Ġ}', 'Ġ', 'Ġ@', 'Non', 'Null', 'Ġstatic', 'ĠT', 'Ġmaterial', 'ize', 'Bundle', '(', 'Bundle', 'Ġb', ');', 'Ġ@', 'Non', 'Null', 'Ġ@', 'Sup', 'press', 'W', 'arn', 'ings', '("', 'un', 'checked', '")', 'Ġstatic', 'ĠT', 'Ġmaterial', 'ize', 'Bundle', '(', 'Class', '<', 'T', '>', 'Ġcl', 's', ',', 'ĠB', 'undle', 'Ġb', ');', 'Ġ@', 'Non', 'Null', 'Ġ@', 'Sup', 'press', 'W', 'arn', 'ings', '("', 'un', 'checked', '")', 'Ġstatic', 'ĠT', 'Ġmaterial', 'ize', 'Bundle', '(', 'String', 'Ġname', ',', 'ĠB', 'un

In [25]:
tokenizer.convert_tokens_to_string(tokens)



In [26]:
print(f"Vocab size : {tokenizer.vocab_size}")
print(f"max length : {tokenizer.model_max_length}")
print(f"model input : {tokenizer.model_input_names}")

Vocab size : 32000


In [29]:
batch = tokenizer(train['instruction'][0],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
print(batch)

In [31]:
# tmp=Dataset.from_list(tmp)
# tmp

Dataset({
    features: ['instruction', 'output'],
    num_rows: 78534
})

In [32]:
# # Define the build_instruction_prompt function
# def build_instruction_prompt(instruction: str):
#     return '''
# You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
# ### Instruction:
# {}
# ### Response:
# '''.format(instruction.strip()).lstrip()

# # Define the EOT_TOKEN
# EOT_TOKEN = "<|EOT|>"

In [34]:
def tokenize_data(data):
  input_col=tokenizer(data['instruction'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")
  target_col=tokenizer(data['output'],max_length=512,truncation=True,padding="max_length",return_tensors="pt")

  return {
      "input_ids":input_col["input_ids"],
      "attention_mask":input_col["attention_mask"],
      "labels":target_col["input_ids"]
  }

In [90]:
tokenizer.padding_side = "right"
print("Tokenizing dataset...")

Tokenizing dataset...


In [35]:
print("Mapping train data...")
train=train.map(tokenize_data,batched=True)
print(train)

Mapping train data...


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [37]:
print("Mappig test data...")
test=test.map(tokenize_data,batched=True)
print(test)

Mappig test data...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [39]:
train=train.remove_columns(["instruction","output"])
test=test.remove_columns(["instruction","output"])

In [40]:
print(train)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 800
})

In [None]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'

In [41]:
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    device={"":0}
    torch_type=torch.bfloat16
else:
    device="cpu"
    torch_type=torch.bfloat16
    print("I am begging for mercy already!")

CUDA device: Tesla P100-PCIE-16GB


In [58]:
model = AutoModelForCausalLM.from_pretrained(base_model,device_map=device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-1.3b-instruct/snapshots/e063262dac8366fc1f28a4da0ff3c50ea66259ca/config.json
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Model config LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-1.3b-instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32021,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "rope_type": "linear",
    "type": "linear"
  },
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bflo

In [59]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm)

In [60]:
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 1346471936
             all model parameters: 1346471936 
             percentage of trainable model parameters: 100.0 %


**Nested quantization**
For enabling nested quantization, use the bnb_4bit_use_double_quant argument in BitsAndBytesConfig. This will enable a second quantization after the first one to save an additional 0.4 bits per parameter.

In [61]:
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

model_nf4 = AutoModelForCausalLM.from_pretrained(base_model, quantization_config=nf4_config)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-1.3b-instruct/snapshots/e063262dac8366fc1f28a4da0ff3c50ea66259ca/config.json
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Model config LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-1.3b-instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32021,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "rope_type": "linear",
    "type": "linear"
  },
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bflo

In [62]:
print(model_nf4)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear4bit(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-

In [63]:
print(print_number_of_trainable_model_parameters(model_nf4))

trainable model parameters: 132220928
             all model parameters: 739346432 
             percentage of trainable model parameters: 17.883487669282484 %


In [46]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout = 0.1,
    bias='none',
    task_type="CAUSAL_LM"
)

In [64]:
peft_model = get_peft_model(model_nf4, lora_config)
print(peft_model)

In [66]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 6291456
             all model parameters: 745637888 
             percentage of trainable model parameters: 0.8437682823327776 %


In [50]:
print(f"BF16 support is {transformers.utils.import_utils.is_torch_bf16_gpu_available()}")

BF16 support is True




In [91]:
training_args = TrainingArguments(
    output_dir="./results",
    run_name ="./loggings",
    overwrite_output_dir=True,
    eval_strategy="steps",
    learning_rate=5e-5, # default, change to 1e-3 on epoch>4
    gradient_accumulation_steps=1, # if CUDA out of memory then = 3,4
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
    auto_find_batch_size = True, # for CUDA out of memory 
    weight_decay=0.01,
    num_train_epochs=1, # test=1, min=4, max=10
    bf16=True,
    optim="adamw_torch",
    save_strategy="no",
    log_level="info",
    logging_first_step=True,
    report_to='none' ## can be wandb, but we dont need right now!
)

PyTorch: setting up devices


In [93]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    eval_dataset=test,
    # peft_config=lora_config,
    dataset_text_field="text",
    # max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

PyTorch: setting up devices
Using auto half precision backend


In [94]:
trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 6,291,456
***** Running training *****
  Num examples = 800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 6,291,456


Epoch,Training Loss,Validation Loss
1,1.2924,1.043586



***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=200, training_loss=1.21825543820858, metrics={'train_runtime': 504.8575, 'train_samples_per_second': 1.585, 'train_steps_per_second': 0.396, 'total_flos': 3162201548390400.0, 'train_loss': 1.21825543820858, 'epoch': 1.0})

In [96]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--deepseek-ai--deepseek-coder-1.3b-instruct/snapshots/e063262dac8366fc1f28a4da0ff3c50ea66259ca/config.json
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32021,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5504,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "rope_type": "linear",
    "type": "linear"
  },
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,

('CODEX-deepseek-coder-1.3b-instruct/tokenizer_config.json',
 'CODEX-deepseek-coder-1.3b-instruct/special_tokens_map.json',
 'CODEX-deepseek-coder-1.3b-instruct/tokenizer.json')

In [97]:
# torch.cuda.empty_cache()  # release CUDA memory