In [None]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer
path = "dataset/"
data = load_dataset("json",data_files={"train":os.path.join(path,"train.jsonl"),"test":os.path.join(path,"valid.jsonl")})

In [2]:
from transformers import AutoTokenizer
basemodel = "../../../../Models/codellama/CodeLlama-7b-hf"
tokenzier = AutoTokenizer.from_pretrained(basemodel,model_max_length = 512)
tokenzier.pad_token = tokenzier.eos_token
#tokenzier.padding_side = "right"

In [3]:
tokenzier.add_special_tokens({'additional_special_tokens':['<|begin_of_java_code|>','<|end_of_java_code|>'\
                                                           ,'<|begin_of_c-sharp_code|>','<|end_of_c-sharp_code|>',\
                                                            '<|translate|>']})

5

In [4]:
tokenzier.save_pretrained("CodeLlama7bForCodeTransLoRA")

('CodeLlama7bForCodeTransLoRA/tokenizer_config.json',
 'CodeLlama7bForCodeTransLoRA/special_tokens_map.json',
 'CodeLlama7bForCodeTransLoRA/tokenizer.model',
 'CodeLlama7bForCodeTransLoRA/added_tokens.json',
 'CodeLlama7bForCodeTransLoRA/tokenizer.json')

In [5]:
tokenzier.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '</s>',
 'additional_special_tokens': ['<|begin_of_java_code|>',
  '<|end_of_java_code|>',
  '<|begin_of_c-sharp_code|>',
  '<|end_of_c-sharp_code|>',
  '<|translate|>']}

In [6]:
sourcelg = "java"
tgtlg = "cs"
prefix = tokenzier.special_tokens_map['additional_special_tokens'][0]

In [7]:
prefix

'<|begin_of_java_code|>'

In [8]:

def preprocess_function(examples):
    inputs = [prefix + example[sourcelg] + tokenzier.special_tokens_map['additional_special_tokens'][1] +\
               tokenzier.special_tokens_map['additional_special_tokens'][2]\
               + example[tgtlg] + tokenzier.special_tokens_map['additional_special_tokens'][3] for example in examples['translation']]
    model_inputs = tokenzier(inputs,padding="max_length",truncation=True)  
    return model_inputs

In [9]:
tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['id','translation'])

Map:   0%|          | 0/10295 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [10]:
data['train']['translation'][1]

{'java': 'public UpdateJourneyStateResult updateJourneyState(UpdateJourneyStateRequest request) {request = beforeClientExecution(request);return executeUpdateJourneyState(request);}\n',
 'cs': 'public virtual UpdateJourneyStateResponse UpdateJourneyState(UpdateJourneyStateRequest request){var options = new InvokeOptions();options.RequestMarshaller = UpdateJourneyStateRequestMarshaller.Instance;options.ResponseUnmarshaller = UpdateJourneyStateResponseUnmarshaller.Instance;return Invoke<UpdateJourneyStateResponse>(request, options);}\n'}

In [11]:
tokenzier.decode(tokenzied_data['train'][2]['input_ids'])

'<s><|begin_of_java_code|> public void removePresentationFormat() {remove1stProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|end_of_java_code|><|begin_of_c-sharp_code|> public void RemovePresentationFormat(){MutableSection s = (MutableSection)FirstSection;s.RemoveProperty(PropertyIDMap.PID_PRESFORMAT);}\n<|end_of_c-sharp_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>

In [12]:
tokenzier.decode(tokenzied_data['test'][2]['input_ids'])

'<s><|begin_of_java_code|> public InsertInstanceRequest() {super("Ots", "2016-06-20", "InsertInstance", "ots");setMethod(MethodType.POST);}\n<|end_of_java_code|><|begin_of_c-sharp_code|> public InsertInstanceRequest(): base("Ots", "2016-06-20", "InsertInstance", "ots", "openAPI"){Method = MethodType.POST;}\n<|end_of_c-sharp_code|></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>

In [13]:
block_size = 512
def group_texts(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [14]:
tokenzied_data = tokenzied_data.map(group_texts,batched=True)

Map:   0%|          | 0/10295 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [15]:
from transformers import AutoModelForCausalLM,TrainingArguments, Trainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    PeftType,
    TaskType
)
import torch
peft_type = PeftType.LORA
config = LoraConfig(
        r=8,
        lora_alpha=16,
        inference_mode=False,
        lora_dropout=0.1,
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
        "q_proj",
        "v_proj",
    ],
    )

In [16]:
from transformers import BitsAndBytesConfig
babcfig = BitsAndBytesConfig(load_in_8bit=True,llm_int8_enable_fp32_cpu_offload=True)
model = AutoModelForCausalLM.from_pretrained(basemodel,
        device_map = "cuda:0",
        quantization_config = babcfig)

model.resize_token_embeddings(len(tokenzier))
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 4,194,304 || all params: 6,742,781,952 || trainable%: 0.06220435466930549




In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
num_epochs = 5
training_args = TrainingArguments(
    output_dir="CodeLlama7bForCodeTransLoRA",
    save_strategy = "epoch",
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps= 4 ,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * num_epochs),
    fp16=True,
    push_to_hub = True,
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    #report_to="none"
)


In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenzier,mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    tokenizer=tokenzier,
    data_collator=data_collator,
    eval_dataset = tokenzied_data['test']
)

In [20]:
trainer.train()
trainer.push_to_hub()

Epoch,Training Loss,Validation Loss
1,5.254,4.869515
2,4.0838,3.768993
3,3.3504,3.144504
4,2.9304,2.739771
5,2.6837,2.52364




BadRequestError:  (Request ID: Root=1-6640b7ad-08ec968b4808c61a3151fa4c;05b62c5b-59d4-4dfe-95c4-85281f26c7ef)

Bad request for commit endpoint:
"base_model" with value "../../../../Models/codellama/CodeLlama-7b-hf" is not valid. Use a model id from https://hf.co/models.

: 