# Test GPU requirements
* Teest GPU requirements by downloading Qwen2.5-1.5B-Instruct model with unsloth
* Create appropriate number of instances of the model


In [1]:
import os
from transformers import utils

os.environ['HF_HOME'] = '/mnt/s/hf'
os.environ['TRANSFORMERS_CACHE'] = '/mnt/s/hf/models'
os.environ['HF_DATASETS_CACHE'] = '/mnt/s/hf/datasets'
os.environ['HUGGINGFACE_HUB_CACHE'] = '/mnt/s/hf/hub'

utils.TRANSFORMERS_CACHE = '/mnt/s/hf/models'

# Create directories if they don't exist
os.makedirs('/mnt/s/hf/models', exist_ok=True)
os.makedirs('/mnt/s/hf/datasets', exist_ok=True)
os.makedirs('/mnt/s/hf/hub', exist_ok=True)

# Verify the settings
print("HF_HOME:", os.getenv('HF_HOME'))
print("TRANSFORMERS_CACHE:", os.getenv('TRANSFORMERS_CACHE'))
print("Actual cache being used:", utils.TRANSFORMERS_CACHE)
print("HF Hub cache:", os.getenv('HUGGINGFACE_HUB_CACHE'))


  from .autonotebook import tqdm as notebook_tqdm


HF_HOME: /mnt/s/hf
TRANSFORMERS_CACHE: /mnt/s/hf/models
Actual cache being used: /mnt/s/hf/models
HF Hub cache: /mnt/s/hf/hub


In [2]:
import os
import torch
import copy
from unsloth import FastLanguageModel

MODEL_NAME = "unsloth/Qwen2.5-1.5B-Instruct"  # Adjust this name if needed.
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True      # Set True to load in 4-bit (as often used with unsloth).
OUTPUT_DIR = "./resources"  # Directory where model copies will be saved.

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Output directory ready:", OUTPUT_DIR)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Output directory ready: ./resources


In [3]:
device_id = 0  # or use torch.cuda.current_device()
props = torch.cuda.get_device_properties(device_id)
total_memory = props.total_memory  # in bytes
reserved_memory = torch.cuda.memory_reserved(device_id)
allocated_memory = torch.cuda.memory_allocated(device_id)
free_memory = total_memory - reserved_memory

print("Total GPU memory (GB): {:.2f}".format(total_memory / 1024**3))
print("Reserved GPU memory (GB): {:.2f}".format(reserved_memory / 1024**3))
print("Allocated GPU memory (GB): {:.2f}".format(allocated_memory / 1024**3))
print("Free GPU memory (GB): {:.2f}".format(free_memory / 1024**3))

Total GPU memory (GB): 11.99
Reserved GPU memory (GB): 0.00
Allocated GPU memory (GB): 0.00
Free GPU memory (GB): 11.99


In [4]:
print("Downloading QWEN2.5-1.5B via Unsloth...")

# Download the model using Unsloth’s FastLanguageModel API.
# The API automatically handles special tokenizer and model configuration.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.bfloat16,             # Use default dtype detection (or set "bfloat16" if supported)
    load_in_4bit=LOAD_IN_4BIT
)
print("Model and tokenizer downloaded successfully!")


Downloading QWEN2.5-1.5B via Unsloth...
==((====))==  Unsloth 2025.1.8: Fast Qwen2 patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti. Max memory: 11.994 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model and tokenizer downloaded successfully!


In [5]:

# # Example: Patch the model with LoRA adapters for further tuning.
# # This step is optional and can be modified as required.
# model = FastLanguageModel.get_peft_model(
#     model,
#     r=16,  # LoRA rank parameter (choose according to your experimentation)
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#     lora_alpha=16,
#     lora_dropout=0,
#     bias="none",
#     use_gradient_checkpointing="unsloth",  # Optimizes memory usage
#     random_state=3407,
#     use_rslora=False,
#     loftq_config=None,
# )
# print("LoRA patching completed (if applied).")


In [6]:

# # Example: Patch the model with LoRA adapters for further tuning.
# # This step is optional and can be modified as required.
# model = FastLanguageModel.get_peft_model(
#     model,
#     r=16,  # LoRA rank parameter (choose according to your experimentation)
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#     lora_alpha=16,
#     lora_dropout=0,
#     bias="none",
#     use_gradient_checkpointing="unsloth",  # Optimizes memory usage
#     random_state=3407,
#     use_rslora=False,
#     loftq_config=None,
# )
# print("LoRA patching completed (if applied).")


In [7]:

# # Example: Patch the model with LoRA adapters for further tuning.
# # This step is optional and can be modified as required.
# model = FastLanguageModel.get_peft_model(
#     model,
#     r=16,  # LoRA rank parameter (choose according to your experimentation)
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#     lora_alpha=16,
#     lora_dropout=0,
#     bias="none",
#     use_gradient_checkpointing="unsloth",  # Optimizes memory usage
#     random_state=3407,
#     use_rslora=False,
#     loftq_config=None,
# )
# print("LoRA patching completed (if applied).")


In [None]:
def measure_model_gpu_usage(model, device=0):
    # Clear cache to improve measurement consistency (optional)
    torch.cuda.empty_cache()
    # Synchronize to ensure that previous operations are complete.
    torch.cuda.synchronize(device)
    memory_before = torch.cuda.memory_allocated(device)

    # If you want to create an independent copy of the model,
    # you can use deepcopy (be careful if the model has CUDA tensors already)
    model_copy = copy.deepcopy(model).to('cuda')
    torch.cuda.synchronize(device)
    memory_after = torch.cuda.memory_allocated(device)

    usage_bytes = memory_after - memory_before
    usage_gb = usage_bytes / (1024 ** 3)
    print(f"Memory added by the new model instance: {usage_gb:.2f} GB")

    return model_copy

# Suppose `model` is already defined (for example, loaded from FastLanguageModel.from_pretrained)
model_instances = []
num_instances = 3

for i in range(num_instances):
    print(f"\nMeasuring model instance {i + 1}:")
    instance = measure_model_gpu_usage(model, device=0)
    model_instances.append(instance)


Measuring model instance 1:
Memory added by the new model instance: 1.10 GB

Measuring model instance 2:
Memory added by the new model instance: 1.10 GB

Measuring model instance 3:
Memory added by the new model instance: 1.10 GB


In [11]:
# After creating all model instances, measure the current memory usage
device_id = 0
props = torch.cuda.get_device_properties(device_id)
total_memory = props.total_memory
reserved_memory = torch.cuda.memory_reserved(device_id)
allocated_memory = torch.cuda.memory_allocated(device_id)
free_memory = total_memory - reserved_memory

print("Total GPU memory (GB): {:.2f}".format(total_memory / 1024**3))
print("Reserved GPU memory (GB): {:.2f}".format(reserved_memory / 1024**3))
print("Allocated GPU memory (GB): {:.2f}".format(allocated_memory / 1024**3))
print("Free GPU memory (GB): {:.2f}".format(free_memory / 1024**3))

Total GPU memory (GB): 11.99
Reserved GPU memory (GB): 4.40
Allocated GPU memory (GB): 4.39
Free GPU memory (GB): 7.59
