In [1]:

!python -m pip install --upgrade pip
!pip install -q -U bitsandbytes einops
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git

!pip install -q -U git+https://github.com/huggingface/accelerate.git

!pip install -q -U accelerate
!pip install -q -U datasets

!pip install nvidia-ml-py3



In [3]:
import torch
# Bare last expression: the notebook renders the installed PyTorch version as the cell output.
torch.__version__

'1.13.1+cu117'

In [2]:
import transformers
# Bare last expression: the notebook renders the installed transformers version as the cell output.
transformers.__version__

'4.32.0.dev0'

[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python3.8 -m pip install --upgrade pip


In [3]:
import os

# Expose only GPU index 0 to this process through CUDA_VISIBLE_DEVICES.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [4]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import load_dataset
import transformers


In [5]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was always queried before this parameter existed.

    NVML is initialised for the query and shut down again afterwards, even
    when the query raises, so repeated calls do not leave NVML sessions open.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # Integer-divide the raw `used` counter by 1024**2 to report whole MB.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Always pair nvmlInit() with nvmlShutdown().
        nvmlShutdown()


def print_summary(result):
    """Report training time, throughput and current GPU memory use.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    report_lines = (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    )
    # Look up and print one metric at a time, each formatted to two decimals.
    for label, metric_key in report_lines:
        print(f"{label}: {result.metrics[metric_key]:.2f}")
    print_gpu_utilization()

## QLoRA 4-bit Training


In [5]:
%%time
# Define the model name
model_name = "EleutherAI/gpt-j-6b"

# Load the model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the configuration for the quantizer
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model in 4-bit
model = AutoModelForCausalLM.from_pretrained(model_name,trust_remote_code=True, device_map={"":0})


# Prepare the model for LoRa, adding trainable adapters for each layer
model = prepare_model_for_kbit_training(model)

# Configuration for LoRa
config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM",
    
)

model = get_peft_model(model, config)
model.config.use_cache = False

# Load the dataset
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# Pad tokens to max length
tokenizer.pad_token = tokenizer.eos_token

# Training configuration
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        fp16_full_eval=True,
        output_dir="outputs",
        half_precision_backend = True,
        # optim="paged_adamw_8bit",
    
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train the model
result = trainer.train()

print(print_summary(result))


NameError: name 'AutoTokenizer' is not defined

In [7]:
import gc

# Drop the references that keep the previous run's objects alive.
# pop(..., None) is used instead of `del` so this cell still works when the
# training cell above failed or was interrupted before every name was bound.
for _name in ("trainer", "model", "config", "quant_config", "data"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## QLoRA + Gradient Checkpointing Training

In [8]:


# Define the model name
model_name = "tiiuae/falcon-7b"

# Load the model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so the data collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token

# Define the configuration for the quantizer:
# 4-bit NF4 weights with double quantization, computing in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the model in 4-bit
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=quant_config,
    device_map={"": 0},
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Prepare the quantized model for LoRA training
model = prepare_model_for_kbit_training(model)

# Configuration for LoRA
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.config.use_cache = False

# Load the dataset and tokenize the "quote" column
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# Training configuration
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        fp16_full_eval=True,
        output_dir="outputs",
        # Must be a backend name ("auto", "cuda_amp", "apex", ...), not a boolean.
        half_precision_backend="auto",
        # optim="paged_adamw_8bit",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train the model
result = trainer.train()

# print_summary() prints its report itself and returns None, so it is not wrapped in print().
print_summary(result)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.6189
2,2.1205



KeyboardInterrupt



In [None]:
import gc

# Drop the references that keep the previous run's objects alive.
# pop(..., None) is used instead of `del` so this cell still works when the
# training cell above failed or was interrupted before every name was bound.
for _name in ("trainer", "model", "config", "quant_config", "data"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## Without Quantization + Gradient Checkpointing

In [None]:


# Define the model name
model_name = "tiiuae/falcon-7b"

# Load the model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so the data collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token

# Load the model WITHOUT quantization: no quantization_config is passed,
# because this cell is the non-quantized baseline.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map={"": 0},
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Same preparation call as in the quantized runs, applied here to a
# non-quantized model so the LoRA setup is otherwise identical.
model = prepare_model_for_kbit_training(model)

# Configuration for LoRA
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
model.config.use_cache = False

# Load the dataset and tokenize the "quote" column
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# Training configuration
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        fp16_full_eval=True,
        output_dir="outputs",
        # Must be a backend name ("auto", "cuda_amp", "apex", ...), not a boolean.
        half_precision_backend="auto",
        # optim="paged_adamw_8bit",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train the model
result = trainer.train()

# print_summary() prints its report itself and returns None, so it is not wrapped in print().
print_summary(result)
