In [1]:
import os
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook.
# SECURITY: a literal access token used to be hardcoded on this line. Treat that
# token as leaked and revoke it in the Hugging Face account settings.
# Export HF_TOKEN before starting the kernel (or run `huggingface-cli login`).
# When the variable is unset, None is passed and the library is expected to fall
# back to locally cached credentials.
hf_token = os.environ.get('HF_TOKEN')

# NOTE(review): GPU index 3 is specific to the machine this was run on.
device = torch.device('cuda:3')

# Load the model and tokenizer
model_id = "google/gemma-2-2b-it"
# `token=` is the current keyword; `use_auth_token=` is deprecated.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModel.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # weights are loaded and run in bfloat16
    token=hf_token,
    low_cpu_mem_usage=True
).to(device)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Read the whole prompt file into memory. The encoding is pinned so decoding
# does not depend on the platform's locale default.
with open('../../prompts/new_prompts/refined_prompts_new.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Debug print for content
print(f"Total content length: {len(content)}")

# Prompts are delimited by a dashed separator; drop chunks that are empty
# once surrounding whitespace is stripped.
prompts = content.split('--------------------------------------------------')
prompts = [prompt.strip() for prompt in prompts if prompt.strip()]

# Debug print for prompts
print(f"Number of prompts after splitting: {len(prompts)}")
print(f"First prompt length: {len(prompts[0]) if prompts else 0}")

# Configure numpy settings
np.set_printoptions(suppress=True, precision=8, threshold=np.inf, linewidth=np.inf)

# Count lines in output file before processing.
# This must inspect the file the embedding loop actually writes
# ('refined_embeddings_new.txt'); previously a different file was counted, so
# the reported number did not describe the real output.
existing_lines = 0  # always defined, even when no previous output exists
if os.path.exists('refined_embeddings_new.txt'):
    with open('refined_embeddings_new.txt', 'r') as f:
        existing_lines = sum(1 for _ in f)
    print(f"Existing lines in output file: {existing_lines}")
else:
    print("No existing output file found")

Total content length: 210128440
Number of prompts after splitting: 37096
First prompt length: 3297
Existing lines in output file: 74250


In [3]:
# Embed every prompt and write the results as text: one
# "Prompt_<i>: v0,v1,..." line followed by a dashed separator line per prompt.
# NOTE: mode 'w' truncates any existing file with this name.
with open('refined_embeddings_new.txt', 'w') as out_file:
    embedding_count = 0
    # Create progress bar
    for i, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
        # Tokenize a single prompt and move its tensors to the model's device
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass only; no gradients are needed for embedding extraction
        with torch.no_grad():
            outputs = model(**inputs)

        # Mean-pool the last hidden state over dim 1 (assumed to be the token
        # axis). Upcast to float32 BEFORE averaging: pooling the bfloat16
        # tensor first rounds the pooled vector to bfloat16, so the later
        # float32 cast could not recover the lost precision.
        embedding = outputs.last_hidden_state.to(torch.float32).mean(dim=1).cpu().numpy()

        # Convert to string without truncation and format as a single line
        embedding_str = ','.join(map(str, embedding.flatten()))
        out_file.write(f"Prompt_{i}: {embedding_str}\n--------------------------------------------\n")
        embedding_count += 1

        # Flush the file buffer periodically so partial results reach disk
        if i % 10 == 0:
            out_file.flush()

        # Clear CUDA cache periodically
        if i % 100 == 0:
            torch.cuda.empty_cache()


Processing prompts: 100%|██████████| 37096/37096 [1:47:22<00:00,  5.76it/s]  


Final lines in output file: 74250
Total embeddings processed: 37096
Expected total lines: 74192


In [6]:
# Sanity-check the written file: stream through it once, tallying lines,
# then compare against the number of embeddings produced by the loop.
final_lines = 0
with open('refined_embeddings_new.txt', 'r') as f:
    for _ in f:
        final_lines += 1

print(f"Final lines in output file: {final_lines}")
print(f"Total embeddings processed: {embedding_count}")
# Every embedding occupies two lines in the file: the data line and its separator.
print(f"Expected total lines: {embedding_count * 2}")

Final lines in output file: 74192
Total embeddings processed: 37096
Expected total lines: 74192
