In [1]:
from huggingface_hub import login

with open('key.txt', 'r') as file:
    key = file.readline().strip()

login(token=key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
from transformers import set_seed

SEED = 13
set_seed(SEED)

MODEL = "deepseek-ai/deepseek-coder-1.3b-base"
DATASET = "Sam137/local-code"

2024-08-08 23:18:57.999356: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-08 23:18:57.999417: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-08 23:18:58.000652: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-08 23:18:58.007159: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from datasets import load_dataset
import torch
from tqdm import tqdm
import random

dataset = load_dataset(
    DATASET,
    data_files = ["python_code_filtered.csv"],
    split="train",
)

dataset = dataset.shuffle(seed=SEED)

DATA_COLUMN = 'Content'

max_length = max(len(sample[DATA_COLUMN]) for sample in dataset)
print(f'Maximum length of samples in the training set: {max_length}')
print("Training dataset samples:", len(dataset))
print(f"Sample {1521} of the training set: {dataset[1521]['Content']}.")

Maximum length of samples in the training set: 4255
Training dataset samples: 4790
Sample 1521 of the training set: #  Copyright 2023 haulogy
#  Part of ERPGIS. See LICENSE file for full copyright and licensing details.
from odoo import SUPERUSER_ID, api


def post_init(cr, registry):
    """
    Automatically import public holidays after installation of the module.
    :param cr: Odoo database cursor
    :param registry: Odoo registry
    """
    env = api.Environment(cr, SUPERUSER_ID, {})
    env["resource.calendar.leaves"].search([]).has_to_send_message()._send_day_off_msg()
    pass
.


In [4]:
from transformers import AutoTokenizer

SEQ_LENGTH = 1024

tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    trust_remote_code=True,
    add_eos_token=True,
    padding_side="right",
    use_fast=True
)

def tokenize_function(examples):
    tokenized_data = tokenizer(examples[DATA_COLUMN], padding="max_length", max_length=SEQ_LENGTH, truncation=True)
    tokenized_data['labels'] = tokenized_data['input_ids']
    return tokenized_data

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_data = split_dataset['train']
valid_data = split_dataset['test']

print(train_data)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4311
})


In [5]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, "float16"),
    bnb_4bit_use_double_quant=True,
)

device_map = {"": 0}

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    load_in_8bit=False,
    quantization_config=bnb_config,
    device_map=device_map,
    use_cache=False,
    trust_remote_code=True,
)

print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaLinearScalingRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear4bit(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )

In [6]:
import bitsandbytes as bnb

def find_all_linear_names(model, bit4=True, bit8=True):
    LinearModule = torch.nn.Linear
    if bit4:
        LinearModule = bnb.nn.Linear4bit
    if bit8:
        LinearModule = bnb.nn.Linear8bitLt
        
    module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, LinearModule):
            if "lm_head" in module_names:
                continue
            names = name.split(".")
            module_names.add(names[0] if len(names) == 1 else names[-1])
    
    return list(module_names)

target_modules = find_all_linear_names(model, bit4=True, bit8=False)
print(target_modules)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
print(model)

['k_proj', 'o_proj', 'v_proj', 'q_proj', 'down_proj', 'up_proj', 'gate_proj']
trainable params: 7,495,680 || all params: 1,353,967,616 || trainable%: 0.5536085140754209
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_laye

In [7]:
# Set up wandb
import wandb
import os
import torch
import torch.nn.functional as F

wandb.login(key="22943c48738b2f0aa5a6b37af531509b75a16960")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     loss = F.cross_entropy(logits, labels).item()
#     return {"loss": loss}

# def preprocess_logits_for_metrics(logits, labels):
#     return logits

os.environ["WANDB_PROJECT"]="llm-finetune"
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"

[34m[1mwandb[0m: Currently logged in as: [33msamueld[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
from transformers import Trainer, TrainingArguments

MAX_EPOCHS = 7
GR_ACC_STEPS = 1
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 30
SAVE_FREQ = 500
LOG_FREQ = 25
OUTPUT_DIR = "deepseek1.3-local-coder"

train_data.start_iteration = 0

training_args = TrainingArguments(
    output_dir=f"Sam137/{OUTPUT_DIR}",
    overwrite_output_dir=True,
    report_to = "wandb",
    dataloader_drop_last=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=MAX_EPOCHS,
    logging_steps=LOG_FREQ,
    adam_beta1 = 0.9,
    adam_beta2 = 0.95,
    lr_scheduler_type="cosine",
    warmup_steps=WARMUP_STEPS,
    gradient_accumulation_steps=GR_ACC_STEPS,
    gradient_checkpointing=True,
    fp16=True,
    learning_rate = LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
    auto_find_batch_size=True
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_data, eval_dataset=valid_data)

print("Training...")
trainer.train()

Training...




Epoch,Training Loss,Validation Loss
1,0.1775,0.169795
2,0.1508,0.159922
3,0.1014,0.160144
4,0.0748,0.168914
5,0.0545,0.187025
6,0.0365,0.203529
7,0.0275,0.21505




adapter_model.safetensors:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

KeyError: 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight'

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f969844a650>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f9698295290, execution_count=8 error_before_exec=None error_in_exec='base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight' info=<ExecutionInfo object at 7f96982961d0, raw_cell="from transformers import Trainer, TrainingArgument.." store_history=True silent=False shell_futures=True cell_id=6e564391-ffbf-46e1-b02e-db3647121890> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [None]:
wandb.finish()