#### This part showcases that the patched linear layer and `nn.Linear` produce almost equivalent results when using the LLaMA 3.2 1B model

In [1]:
%%capture
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model  # Import LoRA utilities

max_seq_length = 1024
model_name = "unsloth/Llama-3.2-1B"

# Instantiate the pretrained causal LM first, then the tokenizer for the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", attn_implementation="sdpa"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

model.enable_input_require_grads()

# Fetch the first 10% of the train split of the unified_chip2 JSONL file.
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files={"train": url}, split="train[:10%]")

# Freeze every parameter whose name does not contain "model.layers.15";
# parameters that do match keep their current requires_grad flag.
for param_name, parameter in model.named_parameters():
    if "model.layers.15" in param_name:
        continue
    parameter.requires_grad = False

# Tally trainable vs. total parameter counts in a single pass and report them.
total_params = 0
trainable_params = 0
for parameter in model.parameters():
    total_params += parameter.numel()
    if parameter.requires_grad:
        trainable_params += parameter.numel()
print(f"\nTrainable parameters: {trainable_params} / Total parameters: {total_params}")


# Training configuration: 25 optimizer steps with a fixed seed, logging every step.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=1,
    max_steps=25,
    logging_steps=1,
    output_dir="outputs",
    seed=3407,
    max_seq_length=max_seq_length,
    report_to="none",  # Disable reporting (e.g., for W&B)
    dataset_num_proc=4,
)

# Build the trainer around the partially frozen model (no LoRA adapter is attached here).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
    args=training_args,
)

# Run training; as the last expression, the returned TrainOutput is shown as the cell result.
trainer.train()


config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

unified_chip2.jsonl:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Trainable parameters: 60821504 / Total parameters: 1235814400


Converting train dataset to ChatML (num_proc=4):   0%|          | 0/21029 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=4):   0%|          | 0/21029 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/21029 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=4):   0%|          | 0/21029 [00:00<?, ? examples/s]

Step,Training Loss
1,3.2304
2,4.999
3,5.0006
4,6.0348
5,4.0794
6,5.1938
7,3.433
8,2.872
9,4.2294
10,4.2066


TrainOutput(global_step=25, training_loss=4.1112366771698, metrics={'train_runtime': 24.5804, 'train_samples_per_second': 2.034, 'train_steps_per_second': 1.017, 'total_flos': 26742055157760.0, 'train_loss': 4.1112366771698})

In [3]:
import gc
import torch
# Collect garbage BEFORE emptying the CUDA cache: tensors that are only kept
# alive by unreachable reference cycles are still "in use" from the caching
# allocator's point of view until the collector frees them, so emptying the
# cache first would leave their blocks reserved.
num_unreachable = gc.collect()
torch.cuda.empty_cache()
# Keep showing the number of collected objects as the cell result.
num_unreachable

421

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.nn.parameter import Parameter
import torch.nn.init as init

class MemoryEfficientLinearFunction(torch.autograd.Function):
    """Autograd function computing ``input @ weight.T + bias`` chunk by chunk.

    The input is flattened to a 2-D matrix of rows (all leading dimensions
    collapsed, last dimension kept) and the rows are split into ``NUM_CHUNKS``
    pieces. Splitting the flattened rows, rather than the first dimension of
    the original tensor, means the chunking still takes effect when the
    leading dimension has size 1 and that 1-D inputs of shape
    ``(in_features,)`` are handled.

    Args (to ``apply``):
        input: tensor whose last dimension is ``in_features``.
        weight: tensor of shape ``(out_features, in_features)``.
        bias: tensor of shape ``(out_features,)`` or ``None``.

    Returns:
        Tensor with the leading dimensions of ``input`` and a last dimension
        of ``out_features``.
    """

    # Number of row chunks the flattened input is split into.
    # Adjust based on your memory constraints.
    NUM_CHUNKS = 2

    @staticmethod
    def forward(ctx, input, weight, bias):
        """Compute the linear map over row chunks and stash tensors for backward."""
        num_chunks = MemoryEfficientLinearFunction.NUM_CHUNKS
        out_features = weight.shape[0]
        leading_shape = input.shape[:-1]

        # Collapse every dimension except the feature dimension so that the
        # split below always operates on rows. The input's own last dimension
        # is used (not weight.shape[1]) so a feature-size mismatch still
        # raises in matmul instead of being silently reshaped away.
        rows = input.reshape(-1, input.shape[-1])
        weight_t = weight.t()

        outputs = []
        for row_chunk in rows.chunk(num_chunks, dim=0):
            out_chunk = row_chunk.matmul(weight_t)
            if bias is not None:
                # Bias is broadcast across the rows of the chunk.
                out_chunk += bias
            outputs.append(out_chunk)

        # Restore the original leading dimensions; last dim becomes out_features.
        output = torch.cat(outputs, dim=0).reshape(*leading_shape, out_features)

        ctx.save_for_backward(input, weight, bias)
        ctx.num_chunks = num_chunks
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``(input, weight, bias)``.

        Only the gradients flagged in ``ctx.needs_input_grad`` are computed;
        the others are returned as ``None`` so no buffer is allocated and no
        matmul is run for inputs that do not require grad (e.g. frozen
        weights).
        """
        input, weight, bias = ctx.saved_tensors
        num_chunks = ctx.num_chunks

        need_input_grad = ctx.needs_input_grad[0]
        need_weight_grad = ctx.needs_input_grad[1]
        need_bias_grad = ctx.needs_input_grad[2] and bias is not None

        # Flatten exactly as in forward so the chunks line up row for row.
        rows = input.reshape(-1, input.shape[-1])
        grad_rows = grad_output.reshape(-1, weight.shape[0])

        # Accumulators exist only for the gradients that are actually needed.
        grad_input_chunks = []
        grad_weight = torch.zeros_like(weight) if need_weight_grad else None
        grad_bias = torch.zeros_like(bias) if need_bias_grad else None

        chunk_pairs = zip(rows.chunk(num_chunks, dim=0), grad_rows.chunk(num_chunks, dim=0))
        for x_chunk, go_chunk in chunk_pairs:
            # x_chunk: (chunk_rows, in_features); go_chunk: (chunk_rows, out_features)
            if need_input_grad:
                grad_input_chunks.append(go_chunk.matmul(weight))
            if need_weight_grad:
                grad_weight += go_chunk.t().matmul(x_chunk)
            if need_bias_grad:
                grad_bias += go_chunk.sum(dim=0)

        grad_input = None
        if need_input_grad:
            grad_input = torch.cat(grad_input_chunks, dim=0).reshape(input.shape)
        return grad_input, grad_weight, grad_bias

class MemoryEfficientLinear(nn.Module):
    """Linear layer that delegates its forward pass to ``MemoryEfficientLinearFunction``.

    Holds a ``weight`` parameter of shape ``(out_features, in_features)`` and,
    when ``bias=True``, a ``bias`` parameter of shape ``(out_features,)``;
    otherwise ``bias`` is registered as ``None``.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        bias: whether to create a bias parameter.
        device: forwarded to ``torch.empty`` when allocating parameters.
        dtype: forwarded to ``torch.empty`` when allocating parameters.
    """

    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int

    def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None:
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        tensor_opts = dict(device=device, dtype=dtype)
        self.weight = Parameter(torch.empty((out_features, in_features), **tensor_opts))
        if not bias:
            # Register an explicit None so that `self.bias` always exists.
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.empty(out_features, **tensor_opts))

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Re-initialize the weight (Kaiming uniform) and, if present, the bias (uniform)."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
        # The bias is drawn from [-limit, limit]; the limit collapses to 0 when fan_in is not positive.
        limit = 0 if fan_in <= 0 else 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -limit, limit)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the custom autograd function to ``input`` with this layer's parameters."""
        weight, bias = self.weight, self.bias
        return MemoryEfficientLinearFunction.apply(input, weight, bias)

    def extra_repr(self) -> str:
        """Return the summary string shown inside the module's repr."""
        has_bias = self.bias is not None
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, has_bias
        )


# Rebind the `Linear` attribute of the `torch.nn` module (imported above as
# `nn`) to the custom layer. This mutates the shared module object and is not
# undone anywhere in this cell, so every later `nn.Linear` lookup in this
# kernel resolves to MemoryEfficientLinear.
# NOTE(review): presumably this is what makes the model loaded below use the
# patched layer; it also means re-running an earlier cell afterwards would no
# longer exercise the stock layer — confirm and restart the kernel if needed.
nn.Linear = MemoryEfficientLinear

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model  # Import LoRA utilities

max_seq_length = 1024
model_name = "unsloth/Llama-3.2-1B"

# Load the same checkpoint and tokenizer again; at this point `nn.Linear`
# has been rebound to MemoryEfficientLinear earlier in this cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", attn_implementation="sdpa"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

model.enable_input_require_grads()

# Fetch the first 10% of the train split of the unified_chip2 JSONL file.
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files={"train": url}, split="train[:10%]")

# Freeze every parameter whose name does not contain "model.layers.15";
# parameters that do match keep their current requires_grad flag.
for param_name, parameter in model.named_parameters():
    if "model.layers.15" in param_name:
        continue
    parameter.requires_grad = False

# Tally trainable vs. total parameter counts in a single pass and report them.
total_params = 0
trainable_params = 0
for parameter in model.parameters():
    total_params += parameter.numel()
    if parameter.requires_grad:
        trainable_params += parameter.numel()
print(f"\nTrainable parameters: {trainable_params} / Total parameters: {total_params}")


# Training configuration: 25 optimizer steps with a fixed seed, logging every step.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    warmup_steps=1,
    max_steps=25,
    logging_steps=1,
    output_dir="outputs",
    seed=3407,
    max_seq_length=max_seq_length,
    report_to="none",  # Disable reporting (e.g., for W&B)
    dataset_num_proc=4,
)

# Build the trainer around the partially frozen model (no LoRA adapter is attached here).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
    args=training_args,
)

# Run training; as the last expression, the returned TrainOutput is shown as the cell result.
trainer.train()



Trainable parameters: 60821504 / Total parameters: 1235814400


Step,Training Loss
1,3.2304
2,4.999
3,5.0006
4,6.0348
5,4.0794
6,5.1938
7,3.433
8,2.872
9,4.2294
10,4.2066


TrainOutput(global_step=25, training_loss=4.111236553192139, metrics={'train_runtime': 31.4821, 'train_samples_per_second': 1.588, 'train_steps_per_second': 0.794, 'total_flos': 26742055157760.0, 'train_loss': 4.111236553192139})