In [5]:
# We want a Dataset class that loads in a jsonl file, tokenizes the dataset as expected and returns the input ids and attention mask.
import os
import json

class JsonlDataset():
  def __init__(self, tokenizer, tokenizer_max_length, batch_size, min_len, dataset_name, dataset_folder, device):
    self.tokenizer = tokenizer
    if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token

    self.tokenizer_max_length = tokenizer_max_length
    self.batch_size = batch_size
    self.min_len = min_len
    self.dataset_name = dataset_name
    self.dataset_folder = dataset_folder
    self.data = []
    self.device = device

  def __getitem__(self, idx):
    item = self.data[idx]
    input_ids = self.tokenizer(item["text"], return_tensors="pt", padding=True, truncation=True, max_length=self.tokenizer_max_length)
    inputs = {key: value.to(self.device) for key, value in input_ids.items()}
    return {"input_ids": inputs["input_ids"].squeeze(0), "attention_mask": inputs["attention_mask"].squeeze(0)}

  def _load_dataset(self):
    dataset_path = os.path.join(self.dataset_folder, self.dataset_name)
    if not os.path.exists(dataset_path):
      raise FileNotFoundError(f"Dataset file not found at {dataset_path}")
    
    data_list = []
    with open(dataset_path, "r") as f:
      for line in f:
        data = json.loads(line)
        if len(data["text"]) > self.min_len:
          data_list.append(data)

    self.data = data_list
    
  def __len__(self):
    return len(self.data)



In [6]:
import torch

class Model():
  def __init__(self, model, tokenizer, device, seed = 42):
    print("ashwinsreevatsa we got the init")
    self.model = model.to(device)
    self.tokenizer = tokenizer
    self.device = device
    self.seed = seed
    torch.manual_seed(seed)
    self.activations = {}

  def hook_fn(self, module, input, output):
    self.activations["transformer_block_output"] = output[0].detach()
  
  def forward(self, inputs, layer_idx: int, with_grad: bool):
    if layer_idx >= len(self.model.transformer.h):
      raise ValueError(f"Layer index {layer_idx} is out of bounds for the model. The model has {len(self.model.transformer.h)} layers.")
    try:
      hook = self.model.transformer.h[layer_idx].register_forward_hook(self.hook_fn)
      tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)

      print("ashwinsreevatsa when tokenized inputs is ending")
      print("tokenized inputs: ", tokenized_inputs)
      # print("tokenized inputs **", **tokenized_inputs)
      if with_grad:
        _ = self.model(**tokenized_inputs)
      else:
        with torch.no_grad():
          _ = self.model(**tokenized_inputs)
      print("ashwinsreevatsa when try is ending")
    except Exception as e:
      print("ashwinsreevatsa exception: ", e)
    finally:
      hook.remove()
    print(self.activations)
    return self.activations["transformer_block_output"]

In [7]:
import torch
import numpy as np
from torch.utils.data import DataLoader
import copy
from transformers import AutoTokenizer, AutoModelForCausalLM


class RMU:
  def __init__(self, model, tokenizer, device, alpha, lr, c, hidden_dimension_size, ctx_window, min_len, seed = 42):
    self.unlearned_model = Model(model, tokenizer, device, seed)
    self.frozen_model = copy.deepcopy(self.unlearned_model)
    self.tokenizer = tokenizer
    self.retain_datasets = []
    self.forget_datasets = []
    self.device = device
    self.alpha = alpha
    self.lr = lr
    self.c = c
    self.ctx_window = ctx_window
    self.min_len = min_len
    self.seed = seed
    self.hidden_dimension_size = hidden_dimension_size
    np.random.seed(seed)
    torch.manual_seed(seed)
    
  
  def setup(self):
    # Initialize u
    u = torch.randn(self.hidden_dimension_size)
    u = u / torch.norm(u)
    self.u = u
    
    cyber_forget = JsonlDataset(
      tokenizer=self.tokenizer, tokenizer_max_length=self.ctx_window, batch_size=1,
      min_len=self.min_len, dataset_name="cyber-forget-corpus.jsonl", dataset_folder="data/", device=self.device
    )
    cyber_forget._load_dataset()
    cyber_retain = JsonlDataset(
      tokenizer=self.tokenizer, tokenizer_max_length=self.ctx_window, batch_size=1,
      min_len=self.min_len, dataset_name="cyber-retain-corpus.jsonl", dataset_folder="data/", device=self.device
      )
    cyber_retain._load_dataset()

    # TODO: make sure these datasets are the same size?
    self.retain_datasets.append(cyber_retain)
    self.forget_datasets.append(cyber_forget)



  def rmu_step(self, layer_idx: int):
    print("Beginning RMU step...")
    


    # TODO: Freeze the model parameters at a given layer
    for i in range(len(self.forget_datasets[0].data)):
      print(self.forget_datasets[0].data[i]["text"])
      print("length: ", len(self.forget_datasets))
      print("length of data: ", len(self.forget_datasets[0].data))
      print(type(self.forget_datasets[0][i]))
      break

    act = self.unlearned_model.forward(self.forget_datasets[0].data[0]["text"], layer_idx, False)
    

    print("Finished RMU step...")

print("Begin main.")
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
if torch.cuda.is_available():
  device = torch.device("cuda")
elif torch.backends.mps.is_available():
  device = torch.device("mps")
else:
  device = torch.device("cpu")



rmu_class = RMU(model, tokenizer, device, 1200, 5e-5, 6.5, 768, 50, 42)
rmu_class.setup()
rmu_class.rmu_step(7)
print(rmu_class)

Begin main.
ashwinsreevatsa we got the init


Token indices sequence length is longer than the specified maximum sequence length for this model (13868 > 1024). Running this sequence through the model will result in indexing errors


Beginning RMU step...
#[1]alternate [2]alternate [3]alternate
* [4]Our Services
* [5]Knowledge Centre
* [6]About
* [7]Contact
* [8]Our Services
+ [9]Adversary Simulation
+ [10]Application Security
+ [11]Penetration Testing
+ [12]Response
* [13]Knowledge Centre
+ [14]Insights
+ [15]Research
+ [16]Training
* [17]About
* [18]Contact
(BUTTON)
* Adversary
Adversary Simulation
Our best in class red team can deliver a holistic cyber attack
simulation to provide a true evaluation of your organisation’s
cyber resilience.
* Application Security
Application
Security
Leverage the team behind the industry-leading Web Application and
Mobile Hacker’s Handbook series.
* Penetration Testing
Penetration
Testing
MDSec’s penetration testing team is trusted by companies from the
world’s leading technology firms to global financial institutions.
* Response
Response
Our certified team work with customers at all stages of the
Incident Response lifecycle through our range of proactive and
reactive services.
* 

In [8]:
tokenized_inputs = tokenizer("test", return_tensors="pt").to(device)
print(tokenized_inputs)

{'input_ids': tensor([[9288]], device='mps:0'), 'attention_mask': tensor([[1]], device='mps:0')}


In [9]:
import inspect
print(inspect.getsource(torch))

"""
The torch package contains data structures for multi-dimensional
tensors and defines mathematical operations over these tensors.
Additionally, it provides many utilities for efficient serialization of
Tensors and arbitrary types, and other useful utilities.

It has a CUDA counterpart, that enables you to run your tensor computations
on an NVIDIA GPU with compute capability >= 3.0.
"""

# mypy: allow-untyped-defs

import builtins
import ctypes
import glob
import importlib
import inspect
import math
import os
import platform
import sys
import textwrap
import threading
from typing import (
    Any as _Any,
    Callable as _Callable,
    Dict as _Dict,
    get_origin as _get_origin,
    Optional as _Optional,
    overload as _overload,
    Set as _Set,
    Tuple as _Tuple,
    Type as _Type,
    TYPE_CHECKING,
    TypeVar as _TypeVar,
    Union as _Union,
)
from typing_extensions import ParamSpec as _ParamSpec


if TYPE_CHECKING:
    from .types import IntLikeType


# multipy/deploy is