In [1]:
import os
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import numpy as np
from tqdm import tqdm

In [2]:
# Set your Hugging Face access token
os.environ['HF_TOKEN'] = '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'

device = torch.device('cuda:0')

# Load the model and tokenizer with memory efficient settings
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Create config with memory optimizations
config = AutoConfig.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
config.use_cache = False

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
tokenizer.pad_token = tokenizer.eos_token

# Initialize model with memory optimizations
model = AutoModel.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    use_auth_token=os.environ['HF_TOKEN'],
    low_cpu_mem_usage=True
).to(device)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Configure numpy settings
np.set_printoptions(suppress=True, precision=8, threshold=np.inf, linewidth=np.inf)

def process_chunk(text, chunk_size=512):
    """Process text with chunking for memory efficiency"""
    inputs = tokenizer(
        text, 
        return_tensors="pt",
        truncation=True,
        max_length=chunk_size,
        padding=True
    )
    
    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad(), torch.cuda.amp.autocast():  # Use automatic mixed precision
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1)
        
    # Convert to numpy and clean up
    embedding = embedding.to(torch.float32).cpu().numpy()
    del outputs
    torch.cuda.empty_cache()
    
    return embedding

# Read and process the input file
with open('../../prompts/new_prompts/raw_listing_new.txt', 'r') as file:
    content = file.read()
prompts = content.split('----------------------------')
prompts = [prompt.strip() for prompt in prompts if prompt.strip()]


In [5]:
# Process prompts and save embeddings
with open('raw_listing_embeddings_new.txt', 'w') as out_file:
    for i, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
        try:
            # Clear cache before processing each prompt
            torch.cuda.empty_cache()
            
            # Get embedding
            embedding = process_chunk(prompt)
            
            # Save to file
            embedding_str = ','.join(map(str, embedding.flatten()))
            out_file.write(f"{embedding_str}\n--------------------------------------------\n")
            out_file.flush()
            
        except RuntimeError as e:
            if "out of memory" in str(e):
                # If OOM occurs, try with smaller chunk size
                torch.cuda.empty_cache()
                try:
                    embedding = process_chunk(prompt, chunk_size=256)
                    embedding_str = ','.join(map(str, embedding.flatten()))
                    out_file.write(f"{embedding_str}\n--------------------------------------------\n")
                    out_file.flush()
                except Exception as e2:
                    print(f"Failed to process prompt {i} even with reduced chunk size: {str(e2)}")
            else:
                print(f"Error processing prompt {i}: {str(e)}")
        except Exception as e:
            print(f"Error processing prompt {i}: {str(e)}")
            continue
            
        # Clear cache every few iterations
        if i % 5 == 0:
            torch.cuda.empty_cache()

  with torch.no_grad(), torch.cuda.amp.autocast():  # Use automatic mixed precision
Processing prompts: 100%|██████████| 37096/37096 [55:55<00:00, 11.06it/s]  
