In [1]:
from huggingface_hub import login

with open('key.txt', 'r') as file:
    key = file.readline().strip()

login(token=key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
from transformers import set_seed

SEED = 10107
set_seed(SEED)

MODEL = "deepseek-ai/deepseek-coder-6.7b-instruct"
DATASET = "neulab/conala"

2024-08-15 14:18:33.518212: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-15 14:18:33.518323: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-15 14:18:33.598389: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-15 14:18:33.800717: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#This set of utilities function and classes was imported 
#from https://github.com/deepseek-ai/DeepSeek-Coder/blob/main/finetune/finetune_deepseekcoder.py

import transformers
import torch
import copy
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

IGNORE_INDEX = -100
EOT_TOKEN = "<|EOT|>"

def build_instruction_prompt(instruction: str):
    return '''
You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
### Instruction:
{}
### Response:
'''.format(instruction.strip()).lstrip()

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]

    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]

    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Preprocess the data by tokenizing."""
    examples = [s + t for s, t in zip(sources, targets)]
    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
    input_ids = examples_tokenized["input_ids"]

    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = [torch.tensor(x) for x in input_ids]
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = [torch.tensor(x) for x in labels]
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

def train_tokenize_function(examples, tokenizer):
    sources = [
        build_instruction_prompt(instruction)
        for instruction in examples['instruction']
    ]
    targets = [f"{output}\n{EOT_TOKEN}" for output in examples['output']]
    data_dict = preprocess(sources, targets, tokenizer)
    return data_dict


In [4]:
from datasets import load_dataset
import torch
from tqdm import tqdm
import random

dataset = load_dataset(
    DATASET,
)

dataset = dataset.shuffle(seed=SEED)
print(dataset)

valid_data = dataset['test']
train_data = dataset['train']

def build_prompt(code: str):
    return f"Below is a line of python code that describes a task. Write one line of summary that appropriately describes the task that the code is performing.\n{code}"

def format_dataset(dataset):
    column_names = dataset.column_names
    column_names.remove("rewritten_intent")
    instructions = [
        build_prompt(code) for code in dataset['snippet']
    ]

    dataset = dataset.add_column('instruction', instructions)
    dataset = dataset.rename_column("rewritten_intent", "output")
    dataset = dataset.remove_columns(column_names)
    return dataset

train_data = format_dataset(train_data)
valid_data = format_dataset(valid_data)
print(train_data)
print(valid_data)

Downloading builder script:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/518k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question_id', 'intent', 'rewritten_intent', 'snippet'],
        num_rows: 2379
    })
    test: Dataset({
        features: ['question_id', 'intent', 'rewritten_intent', 'snippet'],
        num_rows: 500
    })
})


Flattening the indices:   0%|          | 0/2379 [00:00<?, ? examples/s]

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


Flattening the indices:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'instruction'],
    num_rows: 2379
})
Dataset({
    features: ['output', 'instruction'],
    num_rows: 500
})


In [5]:
from transformers import AutoTokenizer

SEQ_LENGTH = 2048

tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    trust_remote_code=True,
    padding_side="right",
    use_fast=True
)

train_data = train_data.map(
    train_tokenize_function,
    batched=True,
    remove_columns=train_data.column_names,
    fn_kwargs={ "tokenizer": tokenizer }
)

valid_data = valid_data.map(
    train_tokenize_function,
    batched=True,
    remove_columns=valid_data.column_names,
    fn_kwargs={ "tokenizer": tokenizer }
)

print(train_data)
index=4
print(f"Sample {index} of the training set: {train_data[index]['input_ids']}, {train_data[index]['labels']}.")
print(f"Sample {index} of the training set: {tokenizer.decode(list(train_data[index]['input_ids']))}.")

data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
data_module = dict(train_dataset=train_data, eval_dataset=valid_data, data_collator=data_collator)

tokenizer_config.json:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Map:   0%|          | 0/2379 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 2379
})
Sample 4 of the training set: [32013, 2042, 417, 274, 20926, 14244, 20391, 11, 26696, 254, 20676, 8041, 74, 339, 8589, 2008, 11, 6908, 457, 20676, 8041, 74, 7958, 11, 285, 340, 885, 3495, 4301, 4512, 276, 4531, 8214, 13, 1487, 4636, 2223, 13143, 4301, 11, 5411, 285, 13936, 4447, 11, 285, 746, 2159, 12, 13517, 250, 8214, 4301, 11, 340, 540, 20857, 276, 3495, 13, 185, 13518, 3649, 3475, 25, 185, 27564, 317, 245, 1348, 280, 9942, 2974, 344, 13025, 245, 5256, 13, 17437, 629, 1348, 280, 13602, 344, 30542, 13025, 254, 5256, 344, 254, 2974, 317, 13697, 13, 185, 248, 13, 1580, 1497, 58, 61, 15, 12, 24, 60, 1183, 23460, 651, 30026, 73, 71, 24, 23, 22, 24, 22, 23, 281, 67, 15, 24, 23, 281, 15, 24, 23, 15, 64, 24, 23, 8850, 2462, 185, 13518, 21289, 25, 185, 8680, 519, 2159, 12, 14015, 278, 7445, 473, 2649, 2220, 30026, 73, 71, 24, 23, 22, 24, 22, 23, 281, 67, 15, 24, 23, 281, 15, 24, 23, 15, 64, 24, 23, 8850, 2220, 185, 32021]

In [6]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, "float16"),
    bnb_4bit_use_double_quant=True,
)

device_map = {"": 0}

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    load_in_8bit=False,
    quantization_config=bnb_config,
    device_map=device_map,
    use_cache=False,
    trust_remote_code=True,
)

print(model)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaLinearScalingRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
  

In [7]:
import bitsandbytes as bnb

def find_all_linear_names(model, bit4=True, bit8=True):
    LinearModule = torch.nn.Linear
    if bit4:
        LinearModule = bnb.nn.Linear4bit
    if bit8:
        LinearModule = bnb.nn.Linear8bitLt
        
    module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, LinearModule):
            if "lm_head" in module_names:
                continue
            names = name.split(".")
            module_names.add(names[0] if len(names) == 1 else names[-1])
    
    return list(module_names)

target_modules = find_all_linear_names(model, bit4=True, bit8=False)
print(target_modules)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.05,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
print(model)

['v_proj', 'o_proj', 'q_proj', 'down_proj', 'k_proj', 'gate_proj', 'up_proj']
trainable params: 19,988,480 || all params: 6,760,501,248 || trainable%: 0.2956656506189288
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_lay

In [8]:
# Set up wandb
import wandb
import os
import torch
import torch.nn.functional as F

wandb.login(key="22943c48738b2f0aa5a6b37af531509b75a16960")

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     loss = F.cross_entropy(logits, labels).item()
#     return {"loss": loss}

# def preprocess_logits_for_metrics(logits, labels):
#     return logits

os.environ["WANDB_PROJECT"]="llm-finetune"
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
from transformers import Trainer, TrainingArguments

MAX_EPOCHS = 20
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 100
LOG_FREQ = 10
BATCH_SIZE = 32
OUTPUT_DIR = "deepseek6.7-explain-coder"
SCHEDULER = "cosine_with_restarts"

training_args = TrainingArguments(
    output_dir=f"Sam137/{OUTPUT_DIR}",
    overwrite_output_dir=True,
    report_to = "wandb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=MAX_EPOCHS,
    logging_steps=LOG_FREQ,
    adam_beta1 = 0.9,
    adam_beta2 = 0.95,
    lr_scheduler_type=SCHEDULER,
    warmup_steps=WARMUP_STEPS,
    gradient_checkpointing=True,
    bf16=True,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
    auto_find_batch_size=True
)

trainer = Trainer(model=model, args=training_args, **data_module)

print("Training...")
trainer.train()

Training...


[34m[1mwandb[0m: Currently logged in as: [33msamueld[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,1.1552,1.149638
2,0.9994,1.016063
3,0.9245,0.998575
4,0.8172,1.047309
5,0.6281,1.144523
6,0.4334,1.291417
7,0.266,1.544163
8,0.1385,1.79115
9,0.0756,2.014782
10,0.0524,2.108684




TrainOutput(global_step=1500, training_loss=0.3144246920359631, metrics={'train_runtime': 7966.8445, 'train_samples_per_second': 5.972, 'train_steps_per_second': 0.188, 'total_flos': 3.345045978298614e+17, 'train_loss': 0.3144246920359631, 'epoch': 20.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f9160a12590>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f9161b5a2d0, execution_count=9 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f91f8d78390, raw_cell="from transformers import Trainer, TrainingArgument.." store_history=True silent=False shell_futures=True cell_id=099f090f-110d-440c-aa5b-62fe121839d7> result=TrainOutput(global_step=1500, training_loss=0.3144246920359631, metrics={'train_runtime': 7966.8445, 'train_samples_per_second': 5.972, 'train_steps_per_second': 0.188, 'total_flos': 3.345045978298614e+17, 'train_loss': 0.3144246920359631, 'epoch': 20.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [10]:
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f9160a12590>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f91585219d0, raw_cell="wandb.finish()" store_history=True silent=False shell_futures=True cell_id=5bc3d0ec-8fc8-419d-b9b2-798fccbaa6d7>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

0,1
eval/loss,▂▁▁▁▂▂▃▄▅▅▆▆▆▇▇▇████
eval/runtime,▄▆▄▁▁▁▆█▃▂▁▂▄▃▃▅▆▅▅▃
eval/samples_per_second,▅▃▅███▃▁▆▇█▇▅▆▆▄▃▄▄▆
eval/steps_per_second,▆▃▆███▃▁▆███▆█▆▃▃▆▆▆
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▄▇██████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,█▆▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.86819
eval/runtime,24.1512
eval/samples_per_second,20.703
eval/steps_per_second,0.662
train/epoch,20.0
train/global_step,1500.0
train/learning_rate,0.0
train/loss,0.0025
train/total_flos,3.345045978298614e+17
train/train_loss,0.31442
