In [2]:
!pip install torch

Collecting torch
  Downloading torch-2.7.1-cp39-cp39-win_amd64.whl (216.0 MB)
Collecting sympy>=1.13.3
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl (199 kB)
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, fsspec, torch
Successfully installed fsspec-2025.7.0 mpmath-1.3.0 sympy-1.14.0 torch-2.7.1


In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
Collecting tokenizers<0.22,>=0.21
  Downloading tokenizers-0.21.4-cp39-abi3-win_amd64.whl (2.5 MB)
Collecting safetensors>=0.4.3
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl (308 kB)
Collecting huggingface-hub<1.0,>=0.34.0
  Downloading huggingface_hub-0.34.3-py3-none-any.whl (558 kB)
Installing collected packages: huggingface-hub, tokenizers, safetensors, transformers
Successfully installed huggingface-hub-0.34.3 safetensors-0.5.3 tokenizers-0.21.4 transformers-4.54.1


In [1]:
import torch

In [2]:
import torch
print(torch.cuda.is_available())

True


In [3]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [4]:
from transformers import BertTokenizer

In [5]:

import json

# Load JSON data
with open('data_for_vectordb.json', 'r') as file:
    data = json.load(file)

# Extract descriptions and CVE IDs
descriptions = [item['description'] for item in data]
cve_ids = [item['id'] for item in data]

In [6]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:

import time
start = time.time()
encoded_inputs = tokenizer(descriptions, padding=True, truncation=True, return_tensors='pt')
print(f"Done in {time.time() - start:.2f} seconds.")


Done in 565.49 seconds.


In [8]:
token_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']

In [9]:
from transformers import AutoModel
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(model_name)

In [10]:
def embed_tokens(input_ids, attention_mask):
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Use the [CLS] token representation for sentence-level embedding
    embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_dim]
    return embeddings

In [11]:

import time
start = time.time()
# Step 1: Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 2: Move model to GPU
model = model.to(device)

# Step 3: Embedding function (no changes needed here)
def embed_tokens(input_ids, attention_mask):
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Use the [CLS] token representation for sentence-level embedding
    embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_dim]
    return embeddings

# Step 4: Tokenized inputs (already tensors)
# Assume token_ids and attention_masks are from tokenizer(..., return_tensors='pt')
batch_size = 16
all_embeddings = []

# Step 5: Run batch-wise embedding on GPU
for i in range(0, len(token_ids), batch_size):
    batch_input_ids = token_ids[i:i+batch_size].to(device)
    batch_attention_mask = attention_masks[i:i+batch_size].to(device)

    batch_embeddings = embed_tokens(batch_input_ids, batch_attention_mask)
    all_embeddings.append(batch_embeddings.cpu())  # Save to CPU to avoid GPU overflow

# Step 6: Concatenate all embeddings
all_embeddings = torch.cat(all_embeddings, dim=0)

print(f"Done in {(time.time() - start)/60:.2f} minutes.")

  return forward_call(*args, **kwargs)


Done in 26.71 minutes.


In [12]:
torch.save(all_embeddings, "embeddings.pt")

In [1]:
import torch

data = torch.load("embeddings.pt")
print(type(data))
if isinstance(data, dict):
    print(data.keys())
elif isinstance(data, list):
    print("First item:", data[0])


<class 'torch.Tensor'>


In [2]:
import torch

data = torch.load("embeddings.pt")
print(data.shape)


torch.Size([129400, 384])


In [None]:
import json

with open("data_for_vectordb.json", "r", encoding="utf-8") as f:
    records = json.load(f)

print(len(records))       # should match data.shape[0]
print(records[0]["id"])   # check first CVE ID



129400
CVE-1999-0199
