In [1]:
from galore_torch import GaLoreAdamW

In [2]:
from datasets import load_dataset
from torchvision import transforms
from transformers import ViTForImageClassification, AutoConfig, TrainingArguments, Trainer
import torch
from PIL import Image
import numpy as np

In [3]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from transformers import ViTForImageClassification, AutoConfig, TrainingArguments, Trainer
import torch

In [4]:
# NOTE(review): `dataset` is not referenced again in the visible notebook -- the
# training data below comes from torchvision's CIFAR10. Confirm this extra
# download is still needed.
dataset = load_dataset("cifar10")

In [5]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# normalise every channel with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3),
    ]
)

# `datasets` here is torchvision.datasets (rebound by the torchvision import in
# an earlier cell), not the Hugging Face package. The train split is built
# first, then the test split; both are stored under ./data.
train_dataset, test_dataset = (
    datasets.CIFAR10(root="./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)


from datasets import Dataset
def convert_to_hf_dataset(pytorch_dataset):
    """Convert a dataset of (image, label) samples into a Hugging Face ``Dataset``.

    Args:
        pytorch_dataset: Iterable of samples where ``sample[0]`` is an image
            exposing ``.numpy()`` and ``sample[1]`` is its label.

    Returns:
        The result of ``Dataset.from_dict`` with two columns: ``pixel_values``
        (one array per image) and ``labels``.
    """
    images = []
    labels = []
    # Walk the dataset once. Iterating it separately for images and labels
    # would fetch (and therefore transform) every sample twice.
    for sample in pytorch_dataset:
        images.append(sample[0].numpy())
        labels.append(sample[1])
    return Dataset.from_dict({"pixel_values": images, "labels": labels})

# Materialise both splits as Hugging Face datasets; these are the objects later
# handed to Trainer as train_dataset / eval_dataset.
hf_train_dataset = convert_to_hf_dataset(train_dataset)
hf_test_dataset = convert_to_hf_dataset(test_dataset)

Files already downloaded and verified
Files already downloaded and verified


In [6]:
from transformers import ViTConfig, ViTForImageClassification
#from torchsummary import summary
import torch

# Architecture of the small ViT that every experiment in this notebook trains.
config = ViTConfig(
    image_size=32,            # input resolution in pixels
    patch_size=4,             # 32 / 4 = 8 patches per side -> 64 patches per image
    num_channels=3,
    hidden_size=384,
    num_hidden_layers=7,
    num_attention_heads=12,   # 384 / 12 = 32 dimensions per head
    intermediate_size=384,    # same width as hidden_size (no MLP expansion)
    num_labels=10
)

In [7]:
# Build the model from `config` alone (no checkpoint is loaded here).
model = ViTForImageClassification(config)
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: moves the model to `device` and displays its module tree.
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(4, 4), stride=(4, 4))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-6): 7 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=384, ou

In [8]:
# Trainer settings shared by the experiments that follow.
# NOTE(review): the saved outputs of the next three runs list 7 epochs while
# num_train_epochs is 5 here, so those outputs appear to predate the current
# value -- confirm before comparing numbers.
# NOTE(review): the Trainer cells pass a pre-built optimizer, so learning_rate
# and weight_decay set here presumably do not configure that optimizer --
# verify against the Trainer documentation.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="no",               # do not write checkpoints
    learning_rate=5e-4,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,                 # log every 10 steps
    remove_unused_columns=True,
    report_to="none",                 # no external experiment tracker

)

In [28]:
# Snapshot the (name, tensor) pairs of the current model.
model_parameters = list(model.named_parameters())

# Split the parameters: 2-D tensors whose name mentions "attention" or
# "intermediate" go to the group carrying the GaLore options; everything else
# goes to a group without them.
galore_params, non_galore_params = [], []
for name, param in model_parameters:
    is_2d = param.ndim == 2
    in_target_module = "attention" in name or "intermediate" in name
    (galore_params if in_target_module and is_2d else non_galore_params).append(param)

galore_group = {
    'params': galore_params,
    'rank': 64,
    'update_proj_gap': 50,
    'scale': 0.10,
    'proj_type': 'std',
}
param_groups = [{'params': non_galore_params}, galore_group]

optimizer = GaLoreAdamW(param_groups, lr=5e-4)



In [29]:
def compute_metrics(eval_pred):
    """Return top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            classes on axis 1) and ``label_ids`` (reference class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

In [30]:
# Wire the model, data, metric function and pre-built optimizer into a Trainer.
# NOTE(review): InlineProfilerCallback is defined in a cell that appears later
# in this notebook (In [9]); that cell has to be executed before this one, so a
# top-to-bottom run fails here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler): no scheduler is supplied
    callbacks=[InlineProfilerCallback()],  # prints per-epoch time and GPU memory
)


In [31]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.8021,1.76153,0.3605
2,1.5819,1.568274,0.4197
3,1.4886,1.487626,0.4484
4,1.3774,1.379276,0.4954
5,1.3209,1.321116,0.5217
6,1.2278,1.26986,0.5367
7,1.2116,1.237881,0.5522


Epoch 1 completed in 151.27 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 2 completed in 149.92 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 3 completed in 149.25 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 4 completed in 150.11 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 5 completed in 150.69 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 6 completed in 151.10 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB
Epoch 7 completed in 149.86 seconds
Peak GPU memory usage: 4.646 GB
Current GPU memory usage: 0.174 GB
Cached GPU memory: 5.088 GB


TrainOutput(global_step=686, training_loss=1.473583958239319, metrics={'train_runtime': 1216.3378, 'train_samples_per_second': 287.749, 'train_steps_per_second': 0.564, 'total_flos': 4.0441347072e+16, 'train_loss': 1.473583958239319, 'epoch': 7.0})

In [9]:
import torch
import time
from transformers import TrainerCallback

class InlineProfilerCallback(TrainerCallback):
    """Trainer callback that prints wall-clock time and CUDA memory use per epoch.

    Memory figures are printed in GB (bytes / 1e9). When CUDA is not available
    the memory lines are skipped instead of calling into ``torch.cuda``, because
    the notebook falls back to CPU when no GPU is present.
    """

    def __init__(self):
        # Timer reading taken in on_epoch_begin; None until an epoch has started.
        self.epoch_start_time = None

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Start the epoch timer and reset the CUDA peak-memory counter."""
        # perf_counter is monotonic, so the measured interval is not affected by
        # system clock adjustments.
        self.epoch_start_time = time.perf_counter()
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

    def on_epoch_end(self, args, state, control, **kwargs):
        """Print the epoch duration and, when CUDA is available, memory statistics."""
        current_epoch = int(state.epoch)
        if self.epoch_start_time is None:
            # on_epoch_begin was never called, so there is no start time to subtract.
            print(f"Epoch {current_epoch} completed (duration unavailable)")
        else:
            epoch_time = time.perf_counter() - self.epoch_start_time
            print(f"Epoch {current_epoch} completed in {epoch_time:.2f} seconds")

        if not torch.cuda.is_available():
            return

        peak_memory = torch.cuda.max_memory_allocated() / 1e9
        current_memory = torch.cuda.memory_allocated() / 1e9
        cached_memory = torch.cuda.memory_reserved() / 1e9
        print(f"Peak GPU memory usage: {peak_memory:.3f} GB")
        print(f"Current GPU memory usage: {current_memory:.3f} GB")
        print(f"Cached GPU memory: {cached_memory:.3f} GB")

# GaLore with randomized SVD (rank 64, update_proj_gap 100)

In [10]:
# Start this experiment from freshly initialised weights. In notebook order
# `model` has already been trained by the previous run, so building the
# optimizer on it would continue that training instead of starting a new,
# comparable run.
model = ViTForImageClassification(config).to(device)
model_parameters = list(model.named_parameters())

galore_params = []
non_galore_params = []

# 2-D tensors whose name mentions "attention" or "intermediate" receive the
# GaLore options; every other parameter goes to the plain group.
for name, param in model_parameters:
    if ("attention" in name or "intermediate" in name) and param.ndim == 2:
        galore_params.append(param)
    else:
        non_galore_params.append(param)

param_groups = [
    {'params': non_galore_params},
    {
        'params': galore_params,
        'rank': 64,
        'update_proj_gap': 100,
        'scale': 0.25,
        'proj_type': 'std',
        # NOTE(review): confirm the installed galore_torch build reads this key;
        # an optimizer that does not know it may silently ignore it.
        'use_randomized_svd': True,  # randomized SVD enabled for this run
    },
]

optimizer = GaLoreAdamW(param_groups, lr=5e-4)



In [11]:
def compute_metrics(eval_pred):
    """Return top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            classes on axis 1) and ``label_ids`` (reference class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

In [12]:
# Wire the model, data, metric function and pre-built optimizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler): no scheduler is supplied
    callbacks=[InlineProfilerCallback()],  # prints per-epoch time and GPU memory
)

In [13]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.779,1.714048,0.3699
2,1.5101,1.495061,0.451
3,1.3603,1.380858,0.4989
4,1.2776,1.293311,0.5323
5,1.2196,1.222517,0.556
6,1.1259,1.173395,0.5798
7,1.1023,1.148384,0.5931


Epoch 1 completed in 151.40 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.918 GB
Epoch 2 completed in 148.79 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.918 GB
Epoch 3 completed in 148.07 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.918 GB
Epoch 4 completed in 148.45 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.918 GB
Epoch 5 completed in 148.76 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.920 GB
Epoch 6 completed in 147.76 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.920 GB
Epoch 7 completed in 148.43 seconds
Peak GPU memory usage: 4.538 GB
Current GPU memory usage: 0.066 GB
Cached GPU memory: 4.920 GB


TrainOutput(global_step=686, training_loss=1.3875788662245947, metrics={'train_runtime': 1206.6163, 'train_samples_per_second': 290.067, 'train_steps_per_second': 0.569, 'total_flos': 4.0441347072e+16, 'train_loss': 1.3875788662245947, 'epoch': 7.0})

# GaLore with standard SVD (rank 64, update_proj_gap 100)

In [18]:
# Start this experiment from freshly initialised weights. In notebook order
# `model` has already been trained by the previous run, so building the
# optimizer on it would continue that training instead of starting a new,
# comparable run.
model = ViTForImageClassification(config).to(device)
model_parameters = list(model.named_parameters())

galore_params = []
non_galore_params = []

# 2-D tensors whose name mentions "attention" or "intermediate" receive the
# GaLore options; every other parameter goes to the plain group.
for name, param in model_parameters:
    if ("attention" in name or "intermediate" in name) and param.ndim == 2:
        galore_params.append(param)
    else:
        non_galore_params.append(param)

param_groups = [
    {'params': non_galore_params},
    {
        'params': galore_params,
        'rank': 64,
        'update_proj_gap': 100,
        'scale': 0.25,
        'proj_type': 'std',
        # NOTE(review): confirm the installed galore_torch build reads this key;
        # an optimizer that does not know it may silently ignore it.
        'use_randomized_svd': False,  # randomized SVD disabled for this run
    },
]

optimizer = GaLoreAdamW(param_groups, lr=5e-4)



In [19]:
def compute_metrics(eval_pred):
    """Return top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            classes on axis 1) and ``label_ids`` (reference class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

In [20]:
# Wire the model, data, metric function and pre-built optimizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler): no scheduler is supplied
    callbacks=[InlineProfilerCallback()],  # prints per-epoch time and GPU memory
)

In [21]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7722,1.70623,0.365
2,1.5207,1.527251,0.4402
3,1.3678,1.363507,0.5044
4,1.2789,1.283744,0.5275
5,1.2136,1.245465,0.545
6,1.138,1.201433,0.5627
7,1.1137,1.166211,0.5784


Epoch 1 completed in 149.34 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 2 completed in 149.81 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 3 completed in 149.17 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 4 completed in 149.21 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 5 completed in 149.28 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 6 completed in 149.41 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB
Epoch 7 completed in 149.23 seconds
Peak GPU memory usage: 4.605 GB
Current GPU memory usage: 0.133 GB
Cached GPU memory: 5.050 GB


TrainOutput(global_step=686, training_loss=1.3912460901299302, metrics={'train_runtime': 1210.6872, 'train_samples_per_second': 289.092, 'train_steps_per_second': 0.567, 'total_flos': 4.0441347072e+16, 'train_loss': 1.3912460901299302, 'epoch': 7.0})

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [14]:
# Re-create the model from `config` so the next experiment does not reuse
# weights trained by an earlier run.
model = ViTForImageClassification(config)
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: moves the model to `device` and displays its module tree.
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(4, 4), stride=(4, 4))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-6): 7 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=384, ou

In [15]:
# Trainer settings shared by the experiments that follow.
# NOTE(review): the Trainer cells pass a pre-built optimizer, so learning_rate
# and weight_decay set here presumably do not configure that optimizer --
# verify against the Trainer documentation.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",      # evaluate once per epoch
    save_strategy="no",               # do not write checkpoints
    learning_rate=5e-4,
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,                 # log every 10 steps
    remove_unused_columns=True,
    report_to="none",                 # no external experiment tracker

)

# GaLore with randomized SVD (rank 128, update_proj_gap 10)

In [10]:
# Snapshot the (name, tensor) pairs of the current model.
model_parameters = list(model.named_parameters())

# Split the parameters: 2-D tensors whose name mentions "attention" or
# "intermediate" go to the group carrying the GaLore options; everything else
# goes to a group without them.
galore_params, non_galore_params = [], []
for name, param in model_parameters:
    is_2d = param.ndim == 2
    in_target_module = "attention" in name or "intermediate" in name
    (galore_params if in_target_module and is_2d else non_galore_params).append(param)

galore_group = {
    'params': galore_params,
    'rank': 128,
    'update_proj_gap': 10,
    'scale': 0.25,
    'proj_type': 'std',
    'use_randomized_svd': True,  # randomized SVD enabled for this run
}
param_groups = [{'params': non_galore_params}, galore_group]

optimizer = GaLoreAdamW(param_groups, lr=5e-4)



In [11]:
def compute_metrics(eval_pred):
    """Return top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            classes on axis 1) and ``label_ids`` (reference class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

In [12]:
# Wire the model, data, metric function and pre-built optimizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler): no scheduler is supplied
    callbacks=[InlineProfilerCallback()],  # prints per-epoch time and GPU memory
)

In [13]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7722,1.693816,0.3774
2,1.5344,1.539204,0.4313
3,1.4128,1.417106,0.4854
4,1.3261,1.333881,0.5142
5,1.2851,1.294585,0.5308


Epoch 1 completed in 150.15 seconds
Peak GPU memory usage: 4.548 GB
Current GPU memory usage: 0.076 GB
Cached GPU memory: 4.933 GB
Epoch 2 completed in 148.51 seconds
Peak GPU memory usage: 4.548 GB
Current GPU memory usage: 0.076 GB
Cached GPU memory: 4.935 GB
Epoch 3 completed in 147.00 seconds
Peak GPU memory usage: 4.548 GB
Current GPU memory usage: 0.076 GB
Cached GPU memory: 4.935 GB
Epoch 4 completed in 147.57 seconds
Peak GPU memory usage: 4.548 GB
Current GPU memory usage: 0.076 GB
Cached GPU memory: 4.935 GB
Epoch 5 completed in 146.99 seconds
Peak GPU memory usage: 4.548 GB
Current GPU memory usage: 0.076 GB
Cached GPU memory: 4.935 GB


TrainOutput(global_step=490, training_loss=1.5264379326178104, metrics={'train_runtime': 853.6893, 'train_samples_per_second': 292.847, 'train_steps_per_second': 0.574, 'total_flos': 2.888667648e+16, 'train_loss': 1.5264379326178104, 'epoch': 5.0})

# GaLore with standard SVD (rank 128, update_proj_gap 10)

In [16]:
# Start this experiment from freshly initialised weights. In notebook order
# `model` has already been trained by the previous run, so building the
# optimizer on it would continue that training instead of starting a new,
# comparable run.
model = ViTForImageClassification(config).to(device)
model_parameters = list(model.named_parameters())

galore_params = []
non_galore_params = []

# 2-D tensors whose name mentions "attention" or "intermediate" receive the
# GaLore options; every other parameter goes to the plain group.
for name, param in model_parameters:
    if ("attention" in name or "intermediate" in name) and param.ndim == 2:
        galore_params.append(param)
    else:
        non_galore_params.append(param)

param_groups = [
    {'params': non_galore_params},
    {
        'params': galore_params,
        'rank': 128,
        'update_proj_gap': 10,
        'scale': 0.25,
        'proj_type': 'std',
        # NOTE(review): confirm the installed galore_torch build reads this key;
        # an optimizer that does not know it may silently ignore it.
        'use_randomized_svd': False,  # randomized SVD disabled for this run
    },
]

optimizer = GaLoreAdamW(param_groups, lr=5e-4)



In [17]:
def compute_metrics(eval_pred):
    """Return top-1 accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            classes on axis 1) and ``label_ids`` (reference class indices).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of samples whose
        highest-scoring class equals the label.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    is_correct = predicted_classes == eval_pred.label_ids
    return {"accuracy": is_correct.mean()}

In [18]:
# Wire the model, data, metric function and pre-built optimizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler): no scheduler is supplied
    callbacks=[InlineProfilerCallback()],  # prints per-epoch time and GPU memory
)

In [19]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7724,1.731964,0.3751
2,1.5494,1.557702,0.4255
3,1.4423,1.430607,0.4781
4,1.3399,1.345578,0.5139
5,1.2954,1.300729,0.529


Epoch 1 completed in 153.92 seconds
Peak GPU memory usage: 4.586 GB
Current GPU memory usage: 0.115 GB
Cached GPU memory: 5.023 GB
Epoch 2 completed in 153.60 seconds
Peak GPU memory usage: 4.586 GB
Current GPU memory usage: 0.115 GB
Cached GPU memory: 5.023 GB
Epoch 3 completed in 153.82 seconds
Peak GPU memory usage: 4.586 GB
Current GPU memory usage: 0.115 GB
Cached GPU memory: 5.023 GB
Epoch 4 completed in 156.22 seconds
Peak GPU memory usage: 4.586 GB
Current GPU memory usage: 0.115 GB
Cached GPU memory: 5.023 GB
Epoch 5 completed in 156.83 seconds
Peak GPU memory usage: 4.586 GB
Current GPU memory usage: 0.115 GB
Cached GPU memory: 5.023 GB


TrainOutput(global_step=490, training_loss=1.5406883122969648, metrics={'train_runtime': 888.7109, 'train_samples_per_second': 281.306, 'train_steps_per_second': 0.551, 'total_flos': 2.888667648e+16, 'train_loss': 1.5406883122969648, 'epoch': 5.0})