In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-2/transformers/default/3/adapter_model.safetensors
/kaggle/input/llama-2/transformers/default/3/adapter_config.json
/kaggle/input/llama-2/transformers/default/3/tokenizer.model


In [2]:
!pip install trl peft datasets accelerate bitsandbytes

Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.10.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.1/280.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m12.0 MB

### Log in to Hugging Face to download the dataset

In [3]:
!huggingface-cli login --token <your-hugging-face-token>

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [5]:
# Hugging Face Hub id of the base checkpoint; later cells pass it to from_pretrained
# for both the model and the tokenizer.
model_name = "meta-llama/CodeLlama-7b-Instruct-hf"

#### Loading and Streaming a Dataset

In this code cell, we use the `datasets` library to load a large dataset from the "bigcode/the-stack" collection. Specifically, the dataset is filtered to include only Python files located in the `"data/python"` directory, and we load the `train` split. The `streaming=True` option allows us to efficiently handle this large dataset without needing to load everything into memory all at once.

To keep track of the number of files processed, we initialize a `file_count` variable and limit the process to a maximum of 80,000 files (`max_files`). The list `downloaded_samples` is created to store the files we retrieve.


In [6]:
from datasets import load_dataset

# Open the "data/python" subset of "bigcode/the-stack" as a stream, so the train
# split is read lazily instead of being loaded into memory all at once.
ds = load_dataset(
    "bigcode/the-stack",
    data_dir="data/python",
    split="train",
    streaming=True,
)

# Upper bound on the number of files taken from the stream.
max_files = 80_000

# Running count of processed files, and the list their contents are collected in.
file_count = 0
downloaded_samples = []

Downloading readme:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/206 [00:00<?, ?it/s]

In [7]:
from itertools import islice

# Rebuild the sample list from scratch on every execution, so re-running this cell
# cannot append duplicates to a list left over from an earlier run.
# islice stops the stream after max_files records and yields nothing when the
# limit is zero (a negative limit is clamped to zero).
downloaded_samples = [
    sample["content"]  # Keep only the file content of each record
    for sample in islice(ds, max(max_files, 0))
]
file_count = len(downloaded_samples)

print(f"Finished downloading {file_count} files.")

Finished downloading 80000 files.


#### Creating a Dataset from Downloaded Samples

In this code, we first import the `Dataset` and `DatasetDict` classes from the `datasets` library. These classes allow us to manipulate and create datasets in a structured manner.

We then create a dictionary `data_dict`, where the key is `"content"` and the value is the list of `downloaded_samples`. This list contains the samples we've processed from the original dataset. 

Using `Dataset.from_dict()`, we convert the dictionary into a `Dataset` object, which is a format used by Hugging Face's library to handle datasets efficiently. This new dataset can be further processed, explored, or saved.


In [11]:
from datasets import Dataset, DatasetDict

# Wrap the collected file contents in a single-column ("content") mapping and
# build a Hugging Face Dataset object from it.
data_dict = dict(content=downloaded_samples)
new_dataset = Dataset.from_dict(data_dict)



In [12]:
# Mixed-precision switches forwarded to TrainingArguments (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for evaluation
# NOTE(review): no later cell in this notebook reads this value - confirm whether it is still needed.
per_device_eval_batch_size = 4

# Gradient checkpointing switch
# NOTE(review): this flag is not passed to TrainingArguments in the training cell, so it
# appears to have no effect as written - confirm the intent.
gradient_checkpointing = True

# Device placement handed to from_pretrained. "auto" leaves placement to the library
# rather than pinning the whole model to GPU 0.
device_map = "auto"

#### Loading the Model with QLoRA Configuration

This code demonstrates how to load a model using QLoRA (Quantized Low-Rank Adaptation) with a custom configuration for efficient memory usage and computation.

1. **Compute Type Setup**: The compute precision is set to `float16`, which allows for reduced memory usage and faster computations, especially on GPUs.

2. **BitsAndBytes Configuration**: The `BitsAndBytesConfig` object is created to load the model in 4-bit precision. This quantization reduces the size of the model while maintaining good performance. The `nf4` quantization type is used, and double quantization is disabled to avoid potential overhead.

3. **Loading the Model**: 
   - The base model is loaded using `AutoModelForCausalLM.from_pretrained()` with options for low CPU memory usage and efficient model loading.
   - The model is loaded in `float16` precision and placed on the available device(s) according to `device_map` (usually a GPU if available), ready for fine-tuning.

4. **Config Settings**:
   - Caching of model outputs is disabled (`use_cache = False`) to optimize fine-tuning.
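   - Pretraining tensor parallelism is set to 1 (`pretraining_tp = 1`) so the model uses its standard linear-layer computation; values above 1 only emulate the tensor-parallel layout used during pretraining and are not needed for fine-tuning.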

In [13]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Single source of truth for the half-precision dtype: it is used both as the
# compute dtype of the 4-bit layers and as the dtype of the non-quantized weights.
compute_dtype = torch.float16

# BitsAndBytes quantization settings: load the weights in 4-bit precision with the
# 'nf4' quantization type, compute in compute_dtype, and leave double quantization off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model that will be fine-tuned.
# low_cpu_mem_usage keeps host-memory usage down while the checkpoint is loaded;
# device_map decides where the weights are placed.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=compute_dtype,
    device_map=device_map,
    quantization_config=bnb_config,  # Pass the BitsAndBytes configuration here
)

# Switch the output cache off for fine-tuning.
base_model.config.use_cache = False
# NOTE(review): pretraining_tp=1 presumably selects the regular (non-sliced) linear
# computation - confirm against the Llama configuration documentation.
base_model.config.pretraining_tp = 1

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  50%|####9     | 4.94G/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

#### Loading the LLaMA Tokenizer and Tokenizing the Dataset

1. **Loading the Tokenizer**:
   - The `AutoTokenizer.from_pretrained()` function is used to load the tokenizer for the model (in this case, a LLaMA tokenizer).
   - The option `trust_remote_code=True` allows the loading of any custom tokenizer logic from the model's repository.

2. **Setting Padding Token**:
   - Since some models don't have an explicit padding token, we set the `pad_token` to the end-of-sequence token (`eos_token`) to manage padding during tokenization.
   - The `padding_side` is set to `"right"` to ensure the model doesn't encounter overflow issues during training with fp16 precision, which may occur if padding is on the left.

3. **Tokenization Function**:
   - The `tokenize_function` is defined to tokenize each example in the dataset. We use truncation to ensure that each tokenized sequence has a maximum length of 512 tokens.
   
4. **Applying Tokenization**:
   - We use the `.map()` method to apply the tokenization function to the entire dataset. Setting `batched=True` processes multiple examples at once, which speeds up the tokenization process.

In [14]:
# Load the tokenizer that belongs to the base checkpoint named by model_name.
# NOTE(review): trust_remote_code=True permits running tokenizer code shipped in the
# model repository - confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Reuse the end-of-sequence (EOS) token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

def tokenize_function(examples):
    """Tokenize the 'content' entries of a batch, truncating each to 512 tokens.

    Args:
        examples: Mapping with a 'content' key that holds the texts to tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples['content']
    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize the whole dataset. batched=True hands tokenize_function batches of
# examples instead of one example at a time, which speeds the mapping up.
tokenized_dataset = new_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

#### Loading LoRA Configuration and Fine-Tuning the Model

1. **LoRA Configuration**:
   - LoRA (Low-Rank Adaptation) is a technique that reduces the number of trainable parameters by introducing low-rank matrices into certain layers of the model. 
   - We define `LoraConfig` with key parameters like:
     - `lora_alpha=16`: A scaling factor to adjust the importance of LoRA layers.
     - `lora_dropout=0.1`: Dropout applied to LoRA layers during training to prevent overfitting.
     - `r=64`: The rank for the low-rank matrices.
     - `task_type="CAUSAL_LM"`: Specifies the task as causal language modeling.

2. **Training Arguments**:
   - The `TrainingArguments` class defines the parameters for the training process:
     - **Batch size**: Set to 4 samples per device.
     - **Optimizer**: `paged_adamw_32bit`, a memory-efficient variant of AdamW.
     - **Learning rate**: Set to `2e-4`.
     - **Mixed precision**: Use either `fp16` or `bf16` for faster and more efficient training on supported hardware.
     - **Gradient clipping**: Ensures stable gradients with `max_grad_norm=0.3`.
     - **Logging and saving**: Logs training progress every 25 steps but does not save intermediate checkpoints.
     - **Cosine learning rate scheduler**: Provides smooth learning rate decay over time.

3. **Supervised Fine-Tuning (SFT) Setup**:
   - `SFTTrainer` is initialized with the pre-trained model, tokenized dataset, and LoRA configuration for parameter-efficient fine-tuning.
   - `dataset_text_field="content"` specifies the text field in the dataset, while `packing=False` disables sequence packing.

4. **Training**:
   - The `trainer.train()` call starts the training process with the defined configuration and parameters.

In [15]:
# LoRA (Low-Rank Adaptation) settings for parameter-efficient fine-tuning.
peft_config = LoraConfig(
    r=64,                       # Rank of the low-rank adaptation matrices
    lora_alpha=16,              # Scaling factor for the LoRA layers
    lora_dropout=0.1,           # Dropout applied to the LoRA layers during training
    bias="none",                # No additional bias parameters
    task_type="CAUSAL_LM",      # Causal language modeling
)

# Training hyper-parameters, grouped by concern.
training_arguments = TrainingArguments(
    # --- output and logging ---
    output_dir="./results",             # Directory for the model and other results
    report_to="tensorboard",            # Report training progress to TensorBoard
    logging_steps=25,                   # Log training metrics every 25 steps
    save_steps=0,                       # Intended to disable intermediate checkpoints
    # --- run length and batching ---
    max_steps=1500,                     # Hard cap on optimizer steps; overrides num_train_epochs
    num_train_epochs=1,                 # Ignored while max_steps is given (see the training log)
    per_device_train_batch_size=4,      # Batch size per device (GPU/CPU)
    gradient_accumulation_steps=1,      # Update the weights after every batch
    group_by_length=True,               # Batch samples of similar length together
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",          # Paged AdamW optimizer
    learning_rate=2e-4,                 # Peak learning rate
    weight_decay=0.001,                 # Weight decay regularization
    max_grad_norm=0.3,                  # Gradient clipping threshold
    lr_scheduler_type="cosine",         # Cosine learning rate schedule
    warmup_ratio=0.03,                  # Fraction of the run spent warming up
    # --- mixed precision ---
    fp16=fp16,                          # fp16 mixed precision, as configured earlier
    bf16=bf16,                          # bf16 mixed precision, as configured earlier
)

# Supervised fine-tuning trainer: base model + tokenized dataset + LoRA settings.
trainer = SFTTrainer(
    model=base_model,                   # Pre-trained model to fine-tune
    args=training_arguments,            # Training arguments defined above
    train_dataset=tokenized_dataset,    # Training data, already tokenized
    tokenizer=tokenizer,                # Tokenizer matching the model
    peft_config=peft_config,            # LoRA settings defined above
    dataset_text_field="content",       # Name of the raw-text column in the dataset
    max_seq_length=None,                # No explicit limit here; samples were truncated to 512 tokens at tokenization
    packing=False,                      # Do not pack several samples into one sequence
)

# Run the fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
25,0.787
50,0.8254
75,0.7469
100,0.8987
125,0.8069
150,0.7754
175,0.7263
200,0.8609
225,0.7741
250,0.7819


TrainOutput(global_step=1500, training_loss=0.7948865458170573, metrics={'train_runtime': 5385.8626, 'train_samples_per_second': 1.114, 'train_steps_per_second': 0.279, 'total_flos': 9.981113253750374e+16, 'train_loss': 0.7948865458170573, 'epoch': 0.075})

In [16]:
# Write the fine-tuned model held by the trainer to a local directory.
adapter_model = trainer.model
adapter_model.save_pretrained("Lora-Llama-2-matrices-before-code-grader")

In [21]:
def first_existing_dir(*candidates):
    """Return the first candidate that is an existing directory.

    Args:
        *candidates: Directory paths in order of preference (at least one).

    Returns:
        The first path that exists as a directory, or the last candidate when
        none of them exists (so the caller still gets the original default).
    """
    for candidate in candidates:
        if os.path.isdir(candidate):
            return candidate
    return candidates[-1]


# Prefer the adapter directory written by the save cell of this very run, so the
# notebook works on a fresh "Run All"; fall back to the adapter attached as a
# Kaggle input when the local directory is absent (e.g. training was skipped).
new_model = first_existing_dir(
    "Lora-Llama-2-matrices-before-code-grader",
    "/kaggle/input/lora-llama-2-matrices-before-code-grader/pytorch/default/1",
)

#### Merging LoRA Weights
   - We use `PeftModel.from_pretrained()` to load the LoRA-augmented model, combining the base model with the fine-tuned LoRA weights.
   - The method `merge_and_unload()` merges the LoRA layers back into the base model, removing the need to maintain separate LoRA layers, which simplifies inference and reduces memory consumption.


In [22]:
# Attach the saved LoRA weights to the base model, then fold them into the base
# weights so that a plain model without separate LoRA layers remains.
# NOTE(review): base_model was loaded with 4-bit quantization earlier in this notebook;
# merging into quantized weights may introduce rounding differences - confirm this is acceptable.
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()




In [25]:
# Only show critical log messages while generating.
logging.set_verbosity(logging.CRITICAL)

# Build a text-generation pipeline around the merged model; max_length=200 caps the
# total sequence length, so long answers are cut off.
prompt = """Write the feedback of this model print("hello world")"""
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# Wrap the prompt in the [INST] ... [/INST] instruction markers and print the first completion.
instruct_prompt = f"[INST] {prompt} [/INST]"
result = pipe(instruct_prompt)
first_completion = result[0]
print(first_completion['generated_text'])

[INST] Write the feedback of this model print("hello world") [/INST]  The feedback for the model print("hello world") is:

* The model is a simple print statement that prints the string "hello world" to the console.
* The model is a good example of a basic Python program that can be used to print a string to the console.
* The model is well-structured and easy to understand, with a clear and concise syntax.
* The model is a good starting point for learning Python programming, as it covers the basic syntax and functionality of the language.
* The model is a good example of how to use the `print()` function in Python to print a string to the console.
* The model is a good example of how to use the `"` character to enclose a string in Python.

Overall, the model is a good example of a basic Python program that can be used to print


In [51]:
# Directory name under which the merged model and its tokenizer are saved.
new_merged_model = "CodeLlama-2-fine-tuned-code-grader"

In [27]:
# Save the merged model into the new_merged_model directory
model.save_pretrained(new_merged_model)

# Save the tokenizer next to it; as the last expression of the cell, the tuple of
# written file paths it returns is displayed as the cell output
tokenizer.save_pretrained(new_merged_model)

('CodeLlama-2-fine-tuned-code-grader/tokenizer_config.json',
 'CodeLlama-2-fine-tuned-code-grader/special_tokens_map.json',
 'CodeLlama-2-fine-tuned-code-grader/tokenizer.model',
 'CodeLlama-2-fine-tuned-code-grader/added_tokens.json',
 'CodeLlama-2-fine-tuned-code-grader/tokenizer.json')