In [1]:
import sys 
import torch
import random
import numpy as np
import pandas as pd
import gc
import time
import random
from tqdm import tqdm

from IPython.display import display
from accelerate import infer_auto_device_map, dispatch_model
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaTokenizer

# This notebook expects a CUDA device; when none is visible we only print a
# notice here and let execution continue.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

import logging

# Restrict the `transformers` logger to ERROR and above to keep cell output readable.
logging.getLogger('transformers').setLevel(logging.ERROR)

# Lift the pandas display limits (column width, row count, total width) so
# frames are rendered in full rather than truncated.
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.width'):
    pd.set_option(_display_option, None)

# Read the evaluation topics and show them as the cell's output.
test_df = pd.read_csv("test.csv")
test_df

Unnamed: 0,id,topic
0,1097671,Compare and contrast the importance of self-reliance and adaptability in healthcare.
1,1726150,Evaluate the effectiveness of management consulting in addressing conflicts within marketing.
2,3211968,Discuss the role of self-reliance in achieving success in software engineering.


In [None]:
# Free any model objects left over from a previous run of this cell.
# Order matters: torch.cuda.empty_cache() can only hand back memory whose
# tensors are no longer referenced, so drop the references and run the
# garbage collector *before* emptying the cache.
for obj in ['model', 'pipe', 'tokenizer']:
    if obj in globals():
        del globals()[obj]
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Model configuration
model_name = "meta-llama/Llama-3.3-70B-Instruct"
offload_dir = "offload"  # Directory for weights that fit on neither GPU nor CPU

# Per-device memory budget (adjust according to your hardware).
# Accelerate identifies GPUs by integer index (0, 1, ...); the only string keys
# it accepts are "cpu" and "disk" (plus "mps"), so "cuda" is not a valid key.
# Sizes are written with an integer count ("2GiB" rather than "2.0GiB"), which
# is safe whether or not the installed Accelerate parses fractional sizes.
max_memory = {"cpu": "2GiB"}
if torch.cuda.is_available():
    max_memory[0] = "3GiB"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model and place it in a single step. Passing `device_map` to
# `from_pretrained` lets each checkpoint shard be loaded straight to its
# target (GPU, CPU or the disk offload folder). Calling `from_pretrained`
# without it first materialises all ~140 GB of bf16 weights in host RAM,
# which defeats the purpose of the memory budget above.
# `trust_remote_code` is intentionally not set: Llama is implemented natively
# in transformers, so there is no reason to allow repository code to execute.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # Use bfloat16 to save memory
    device_map="auto",           # Fill GPU first, then CPU, then spill to disk
    max_memory=max_memory,
    offload_folder=offload_dir,
)

# Keep the resolved placement available under the original name for
# inspection or for later cells that refer to `device_map`.
device_map = model.hf_device_map