In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as hf_pipeline
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from diffusers import StableDiffusionPipeline, ShapEPipeline
import torch
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def log_vram_usage(stage):
    """Print the CUDA memory currently allocated and reserved, in GB.

    Args:
        stage: Label shown in square brackets at the start of the printed line.
    """
    bytes_per_gb = 1024 ** 3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(
        f"[{stage}] VRAM Usage - "
        f"Allocated: {allocated_gb:.2f} GB, Reserved: {reserved_gb:.2f} GB"
    )

def log_peak_vram_usage(stage):
    """Print the value of torch.cuda.max_memory_allocated() converted to GB.

    Args:
        stage: Label inserted into the "[Peak ... usage]" prefix of the line.
    """
    peak_gb = torch.cuda.max_memory_allocated() / (1024 ** 3)
    message = f"\n[Peak {stage} usage] VRAM usage - Peak: {peak_gb:.2f} GB"
    print(message)

def clear_memory():
    """Free unreferenced GPU memory and reset the peak-allocation counter.

    Order matters:
      1. gc.collect() runs first so that objects kept alive only by reference
         cycles are destroyed and their tensors go back to PyTorch's caching
         allocator.
      2. torch.cuda.empty_cache() can then release those now-unused cached
         blocks; calling it before the collection would leave them reserved.
      3. torch.cuda.ipc_collect() and torch.cuda.reset_peak_memory_stats() run
         last, so the next peak measurement starts from the post-cleanup state.

    Objects that are still referenced elsewhere are not affected.
    """
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

In [3]:
# Text prompts for the Shap-E benchmark; one 3D object is generated per entry.
shape_prompts = [
    "wooden chair with curved legs",
    "futuristic drone",
    "fantasy sword",
    "toy robot",
    "detailed coffee mug",
    "space helmet",
    "medieval lantern",
    "pirate ship cannon",
    "mountain bike",
]

# Text prompts for the Stable Diffusion benchmark; one image is generated per entry.
sd_prompts = [
    "futuristic city at sunset",
    "medieval castle on a mountain",
    "robot in a field of flowers",
    "cyberpunk street at night",
    "astronaut relaxing on the moon",
    "mystical forest with glowing trees",
    "dragon flying over a volcano",
    "fantasy village beside a waterfall",
    "spaceship interior with glowing panels",
]

Note: run each of the following cells individually, and restart the kernel after each one.

In [4]:
# Shap-E benchmark: load the fp16 pipeline onto the GPU, generate one mesh per
# prompt, and log VRAM before loading, after loading, and after generation.
log_vram_usage("Before loading Shap-E")
shape_pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16, variant="fp16")
shape_pipe = shape_pipe.to("cuda")
log_vram_usage("After loading Shap-E")

for i, prompt in enumerate(shape_prompts, 1):
    # The total comes from the list itself so the "i/N" counter cannot drift
    # from the number of prompts (it was previously hard-coded to 10).
    print(f"\nGenerating 3D object {i}/{len(shape_prompts)}: {prompt}")
    # Only memory usage is being measured, so the generated mesh is discarded.
    _ = shape_pipe(prompt, guidance_scale=30, num_inference_steps=64, frame_size=256, output_type="mesh").images[0]

log_vram_usage("After generating with Shap-E")
log_peak_vram_usage("Shap-E")

# Drop the pipeline reference before running the shared cleanup helper.
del shape_pipe
clear_memory()

[Before loading Shap-E] VRAM Usage - Allocated: 0.00 GB, Reserved: 0.00 GB


shap_e_renderer\diffusion_pytorch_model.safetensors not found
The config attributes {'renderer': ['shap_e', 'ShapERenderer']} were passed to ShapEPipeline, but are not expected and will be ignored. Please verify your model_index.json configuration file.
Keyword arguments {'renderer': ['shap_e', 'ShapERenderer']} are not expected by ShapEPipeline and will be ignored.
  return self.fget.__get__(instance, owner)()
Loading pipeline components...: 100%|██████████| 5/5 [00:01<00:00,  3.28it/s]


[After loading Shap-E] VRAM Usage - Allocated: 1.25 GB, Reserved: 1.26 GB

Generating 3D object 1/10: wooden chair with curved legs


100%|██████████| 127/127 [00:07<00:00, 18.06it/s]



Generating 3D object 2/10: futuristic drone


100%|██████████| 127/127 [00:06<00:00, 19.01it/s]



Generating 3D object 3/10: fantasy sword


100%|██████████| 127/127 [00:06<00:00, 19.41it/s]



Generating 3D object 4/10: toy robot


100%|██████████| 127/127 [00:06<00:00, 19.37it/s]



Generating 3D object 5/10: detailed coffee mug


100%|██████████| 127/127 [00:06<00:00, 19.20it/s]



Generating 3D object 6/10: space helmet


100%|██████████| 127/127 [00:06<00:00, 19.06it/s]



Generating 3D object 7/10: medieval lantern


100%|██████████| 127/127 [00:07<00:00, 17.78it/s]



Generating 3D object 8/10: pirate ship cannon


100%|██████████| 127/127 [00:07<00:00, 17.31it/s]



Generating 3D object 9/10: mountain bike


100%|██████████| 127/127 [00:06<00:00, 19.19it/s]


[After generating with Shap-E] VRAM Usage - Allocated: 1.26 GB, Reserved: 2.49 GB

[Peak Shap-E usage] VRAM usage - Peak: 2.22 GB


In [5]:
# Stable Diffusion benchmark: load the fp16 pipeline onto the GPU, generate one
# image per prompt, and log VRAM before loading, after loading, and after
# generation.
log_vram_usage("Before loading Stable Diffusion")
sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16)
sd_pipe = sd_pipe.to("cuda")
log_vram_usage("After loading Stable Diffusion")

for i, prompt in enumerate(sd_prompts, 1):
    # The total comes from the list itself so the "i/N" counter cannot drift
    # from the number of prompts (it was previously hard-coded to 10).
    print(f"\nGenerating image {i}/{len(sd_prompts)}: {prompt}")
    with torch.autocast("cuda"):
        # Only memory usage is being measured, so the generated image is discarded.
        _ = sd_pipe(prompt).images[0]

log_vram_usage("After generating with Stable Diffusion")
log_peak_vram_usage("Stable Diffusion")

# Drop the pipeline reference before running the shared cleanup helper.
del sd_pipe
clear_memory()

[Before loading Stable Diffusion] VRAM Usage - Allocated: 0.01 GB, Reserved: 0.02 GB


safety_checker\model.safetensors not found
Loading pipeline components...: 100%|██████████| 7/7 [00:01<00:00,  4.54it/s]


[After loading Stable Diffusion] VRAM Usage - Allocated: 2.58 GB, Reserved: 2.64 GB

Generating image 1/10: futuristic city at sunset


100%|██████████| 50/50 [00:05<00:00,  9.64it/s]



Generating image 2/10: medieval castle on a mountain


100%|██████████| 50/50 [00:04<00:00, 10.42it/s]



Generating image 3/10: robot in a field of flowers


100%|██████████| 50/50 [00:04<00:00, 10.71it/s]



Generating image 4/10: cyberpunk street at night


100%|██████████| 50/50 [00:04<00:00, 10.71it/s]



Generating image 5/10: astronaut relaxing on the moon


100%|██████████| 50/50 [00:04<00:00, 10.73it/s]



Generating image 6/10: mystical forest with glowing trees


100%|██████████| 50/50 [00:04<00:00, 10.75it/s]



Generating image 7/10: dragon flying over a volcano


100%|██████████| 50/50 [00:04<00:00, 10.56it/s]



Generating image 8/10: fantasy village beside a waterfall


100%|██████████| 50/50 [00:04<00:00, 10.66it/s]



Generating image 9/10: spaceship interior with glowing panels


100%|██████████| 50/50 [00:04<00:00, 10.69it/s]


[After generating with Stable Diffusion] VRAM Usage - Allocated: 2.58 GB, Reserved: 3.56 GB

[Peak Stable Diffusion usage] VRAM usage - Peak: 3.34 GB


In [6]:
# TinyLlama benchmark: log VRAM before and after loading the model and its
# tokenizer (no generation is run in this cell).
log_vram_usage("Before loading TinyLlama")

model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cuda",
    trust_remote_code=True,
    revision="main",
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

log_vram_usage("After loading TinyLlama")

# Drop both references, then run the shared cleanup helper.
# NOTE: this doesn't clear VRAM for LLMs.
del model, tokenizer
clear_memory()

[Before loading TinyLlama] VRAM Usage - Allocated: 0.01 GB, Reserved: 0.02 GB


CUDA extension not installed.
CUDA extension not installed.
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class


[After loading TinyLlama] VRAM Usage - Allocated: 0.74 GB, Reserved: 0.78 GB


In [7]:
# Llama 2 benchmark: log VRAM before and after loading the model and its
# tokenizer (no generation is run in this cell).
log_vram_usage("Before loading Llama 2")

model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cuda",
    trust_remote_code=True,
    revision="main",
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

log_vram_usage("After loading Llama 2")

# Drop both references, then run the shared cleanup helper.
del model, tokenizer
clear_memory()

[Before loading Llama 2] VRAM Usage - Allocated: 0.74 GB, Reserved: 0.78 GB
[After loading Llama 2] VRAM Usage - Allocated: 4.48 GB, Reserved: 4.77 GB
