In [1]:
import torch
from torch import nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
import os

In [2]:
def generate_text_pretrained(model, tokenizer, prompt, max_length=50, max_prompt_length=50):
    """Continue ``prompt`` with ``model.generate`` and return the decoded text.

    Args:
        model: Object exposing ``generate(...)``. Its ``device`` attribute (or,
            failing that, the device of its first parameter) decides where the
            input tensors are placed.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; must also provide ``decode`` and
            ``eos_token_id``.
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            NOTE(review): in Hugging Face this bound counts the prompt tokens
            too, so it should exceed the (truncated) prompt length -- confirm
            against the installed transformers version.
        max_prompt_length: The prompt is truncated to at most this many tokens
            before generation. Defaults to 50, the limit that was previously
            hard-coded.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # Follow the model's own device instead of assuming CUDA, so the function
    # also works when the model was placed on the CPU.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device

    with torch.no_grad():
        # Prepare the prompt
        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True,
                           max_length=max_prompt_length)
        input_ids = inputs['input_ids'].to(target_device)
        attention_mask = inputs['attention_mask'].to(target_device)

        # Generate text; the EOS id doubles as the padding id during generation.
        outputs = model.generate(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 max_length=max_length,
                                 num_return_sequences=1,
                                 pad_token_id=tokenizer.eos_token_id)

    # Decode the generated text (only one sequence is requested above).
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [3]:
# Use the GPU when torch reports one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_model_name = "gpt2-xl"

# Load the tokenizer and the pretrained weights for the same checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)

# When the tokenizer has no pad token configured, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

# Move the model to the selected device and put it in evaluation mode.
# This is the cell's last expression, so the notebook echoes the model structure.
model.to(device).eval()

Using pad_token, but it is not set yet.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [4]:
# Example usage: ask the pretrained model to continue a security-incident prompt.
prompt = "Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the system did not have multifactor authentication protocols in place. This made entry into the VPN easier since multiple steps were not required to verify the user’s identity. Even though the compromised password was a “complex password,” malicious actors acquired it as part of a separate data breach."
generated_text_pretrained = generate_text_pretrained(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=100,
)
print("Generated text with pretrained GPT-2:", generated_text_pretrained)

2024-08-05 14:52:01.821818: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Generated text with pretrained GPT-2: Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the company had not implemented a password policy. The DarkSide group then used the compromised VPN password to gain access to the company's network. The DarkSide group then used the company's network to launch a distributed denial of service (DDoS) attack against


In [16]:
def retrieveEmbeddings(input_text, tokenizer, model, device, chunk_size=110):
    """Return one embedding vector for ``input_text``: the mean last-layer hidden state.

    The text is tokenized once and fed to the model in consecutive chunks of at
    most ``chunk_size`` tokens. Each chunk is run on its own, so hidden states
    do not see tokens from other chunks. The result is the average over *all*
    tokens: every chunk's mean is weighted by the number of tokens it holds, so
    a short final chunk does not count as much as a full one.

    Args:
        input_text: Text to embed.
        tokenizer: Provides ``tokenize`` and ``convert_tokens_to_ids``.
        model: Called as ``model(ids, output_hidden_states=True)``; the result
            must expose ``hidden_states``, whose last entry is pooled.
            Assumes that entry is shaped (1, tokens, hidden) -- TODO confirm
            for models other than the GPT-2 checkpoint used in this notebook.
        device: Device the token-id tensor is moved to before the forward pass.
        chunk_size: Maximum number of tokens per forward pass; must be positive.

    Returns:
        The pooled embedding with singleton dimensions squeezed away (a 1-D
        tensor of hidden size for the model used in this notebook).

    Raises:
        ValueError: If ``chunk_size`` is not positive or the text yields no tokens.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Tokenize the input text to find out if splitting is needed
    tokens = tokenizer.tokenize(input_text)
    total_tokens = len(tokens)
    if total_tokens == 0:
        # Without this guard the pooling below would have nothing to average.
        raise ValueError("input_text produced no tokens; cannot compute an embedding")

    needs_chunking = total_tokens > chunk_size
    aggregated_embedding = None

    # Process text in chunks of at most chunk_size tokens
    for start in range(0, total_tokens, chunk_size):
        if needs_chunking:
            print("Chunk", start, "for", total_tokens, "tokens.")

        chunk_tokens = tokens[start:start + chunk_size]
        input_ids = tokenizer.convert_tokens_to_ids(chunk_tokens)
        input_tensors = torch.tensor([input_ids]).to(device)

        # Forward pass, get hidden states for the chunk
        with torch.no_grad():
            outputs = model(input_tensors, output_hidden_states=True)

        # Mean pool the last layer across the sequence length dimension
        chunk_mean = outputs.hidden_states[-1].mean(dim=1)

        # Weight each chunk mean by its share of the tokens. Scaling the means
        # (rather than summing raw hidden states) keeps magnitudes small.
        weighted = chunk_mean * (len(chunk_tokens) / total_tokens)
        if aggregated_embedding is None:
            aggregated_embedding = weighted
        else:
            aggregated_embedding = aggregated_embedding + weighted

    return aggregated_embedding.squeeze()

In [17]:
# Embed the example prompt using the default chunk size.
embd = retrieveEmbeddings(input_text=prompt, tokenizer=tokenizer, model=model, device=device)

110


In [18]:
# Sanity check: display the shape of the pooled embedding
# (the recorded run shows torch.Size([1600])).
embd.shape

torch.Size([1600])

In [19]:
# Detach from autograd and copy to host memory before converting to a NumPy array.
emb_np = embd.detach().to("cpu").numpy()

In [20]:
import numpy as np

# np.save does not create missing parent directories, so make sure "data/"
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("data", exist_ok=True)
np.save("data/prompt_embeddings.npy", emb_np)